In [1]:
    
from pin import pin
import matplotlib.pyplot as plt
from collections import Counter
import json
import os
import autograd.numpy as np
import pickle as pkl
%matplotlib inline
%load_ext autoreload
%autoreload 2
    
    
In [2]:
    
with open('../data/batch_summary.json') as f:
    model_data = json.load(f)
    
In [3]:
    
model_data['projects'][0]
    
    Out[3]:
In [4]:
    
# Grab some summary statistics of the modelling.
# I'm curious to know how many times a particular template was chosen.
template_counter = Counter()
gmqe_scores = []
sequence_identity = []
sequence_similarity = []
for p in model_data['projects']:
    for m in p['models']:
        template_counter[m['template']] += 1
        gmqe_scores.append(m['GMQE'])
        sequence_identity.append(m['seq_id'])
        sequence_similarity.append(m['seq_sim'])
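Since template_counter now holds the usage counts, a quick look at the most frequently chosen templates:
In [ ]:

# Ten most frequently chosen templates across all models.
template_counter.most_common(10)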
    
In [7]:
    
plt.hist(gmqe_scores)
    
    Out[7]:
    
In [8]:
    
plt.hist(sequence_identity)
    
    Out[8]:
    
In [9]:
    
plt.hist(sequence_similarity)
    
    Out[9]:
    
In [10]:
    
model_data['projects'][0]
    
    Out[10]:
In [25]:
    
# Python script each job will run: build the PIN graph from the homology model and pickle it.
script = """
import networkx as nx
import pickle
from pin.pin import ProteinInteractionNetwork

p = ProteinInteractionNetwork("model_01.pdb")
with open("model_01.pkl", "wb") as f:
    pickle.dump(p, f)
"""
print(script)
    
    
In [26]:
    
# SGE job script: requests one whole node and runs script.py in the project directory.
bash = """
#!/bin/sh
#$ -S /bin/sh
#$ -cwd
#$ -V
#$ -m e
#$ -M ericmjl@mit.edu
#$ -pe whole_nodes 1
#############################################

python script.py
"""
print(bash)
    
    
In [27]:
    
# Header for a master script that qsubs every per-project job.
qsub = """
#!/bin/sh
#$ -S /bin/sh
#$ -cwd
#$ -V
#$ -m e
#$ -M ericmjl@mit.edu
#$ -pe whole_nodes 1
#############################################

"""
print(qsub)
    
    
In [28]:
    
# Write the per-project Python and shell scripts to disk, plus a master submission script.
mdl_dir = '../data/batch_models'
for project in model_data['projects']:
    code = project['code']
    with open('{0}/{1}/script.py'.format(mdl_dir, code), 'w+') as f:
        f.write(script)
    with open('{0}/{1}/{1}.sh'.format(mdl_dir, code), 'w+') as f:
        f.write(bash)
        
with open('{0}/master.sh'.format(mdl_dir), 'w+') as f:
    f.write(qsub)
    for project in model_data['projects']:
        
        f.write('cd {0}\n'.format(project['code']))
        f.write('qsub {0}.sh\n'.format(project['code']))
        f.write('cd ..\n')
        f.write('\n')
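A quick sanity check (a sketch) that the per-project scripts actually landed on disk, by counting the script.py files that now exist:
In [ ]:

# Count how many project directories now contain a script.py.
sum(os.path.exists(os.path.join(mdl_dir, proj['code'], 'script.py'))
    for proj in model_data['projects'])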
    
In [14]:
    
import pandas as pd
protease_data = pd.read_csv('../data/hiv_data/hiv-protease-data-expanded.csv', index_col=0)
protease_data.describe()
    
    Out[14]:
Based on the number of samples available for each drug, let us start with NFV, which has the most values available.
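A quick way to see the per-drug sample counts directly (non-null values per column):
In [ ]:

# Non-null counts per column; the drug columns rank by data availability.
protease_data.count().sort_values(ascending=False)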
In [15]:
    
protease_data['NFV'].describe()
    
    Out[15]:
I will log-transform these values and then Z-standardize them. (The cells below actually work with the FPV column, and the Z-standardization step is left commented out for now.)
In [16]:
    
import sklearn.preprocessing as pp
# from sklearn.preprocessing import scale
# NB: np stays bound to autograd.numpy (imported at the top of the notebook).
# Drop NaN values on only the FPV column.
fpv_data = protease_data.dropna(subset=['FPV'])[['FPV', 'seqid']]
fpv_data['FPV'] = fpv_data['FPV'].apply(np.log10)
# fpv_data['FPV_scaled'] = pp.scale(fpv_data['FPV'])
fpv_data.hist()
fpv_data.head()
    
    Out[16]:
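If the Z-standardization mentioned above is wanted, the commented-out line can be restored (a sketch using the pp alias imported above):
In [ ]:

# Z-standardize the log-transformed FPV values (zero mean, unit variance).
fpv_data['FPV_scaled'] = pp.scale(fpv_data['FPV'])
fpv_data.head()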
    
In [17]:
    
# Grab out a sample of projects, based on the seqid.
import random as rnd
# seqids_interest = rnd.sample(list(fpv_data['seqid']), 25)
proj_titles = {c['title']:c['code'] for c in model_data['projects']}
projs_interest = dict()
n_graphs = 4
while len(projs_interest) < n_graphs:
    seqid = rnd.choice(list(fpv_data['seqid']))
    if seqid in proj_titles.keys():
        projs_interest[seqid] = proj_titles[seqid]
        
        
        
projs_interest
    
    Out[17]:
In [18]:
    
fpv_data[fpv_data['seqid'].isin(projs_interest.keys())]
    
    Out[18]:
In [19]:
    
# Prototype the graph construction on the first sampled (seqid, project) pair.
projects = list(projs_interest.values())
seqids = list(projs_interest.keys())
p = pin.ProteinInteractionNetwork('../data/batch_models/{0}/model_01.pdb'.format(projects[0]))
p.graph['project'] = projects[0]
p.graph['input_shape'] = p.nodes(data=True)[0][1]['features'].shape
p.graph['seqid'] = seqids[0]
    
In [20]:
    
# Building all 3401 graphs takes about 2 hours on a single core,
# so parallelize the graph construction across cores with joblib.
from tqdm import tqdm
from joblib import Parallel, delayed
graphs = []
def make_protein_graphs(project, seqid):
    """
    Build the protein interaction network for one project and tag it with metadata.
    Written as a standalone function so that joblib can parallelize it across cores.
    """
    from pin import pin
    p = pin.ProteinInteractionNetwork('../data/batch_models/{0}/model_01.pdb'.format(project))
    p.graph['project'] = project
    p.graph['input_shape'] = p.nodes(data=True)[0][1]['features'].shape
    p.graph['seqid'] = seqid
    return p
graphs = Parallel(n_jobs=-1)(delayed(make_protein_graphs)(proj, seqid) for seqid, proj in projs_interest.items())
len(graphs)
    
    Out[20]:
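Since graph construction is the slow step, it may be worth caching the results (a sketch; the output path is an assumption, and pkl is the pickle alias imported at the top of the notebook):
In [ ]:

# Cache the constructed graphs to disk so they don't have to be rebuilt.
with open('../data/batch_models/graphs.pkl', 'wb') as f:
    pkl.dump(graphs, f)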
In [21]:
    
# Check a sample of the graphs to make sure that the input shapes are correct.
graphs[0].graph['input_shape']
    
    Out[21]:
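A stricter version of that check (a sketch) asserts that every graph reports the same feature dimensionality:
In [ ]:

# All graphs should share a single input shape; a singleton set confirms it.
assert len({g.graph['input_shape'] for g in graphs}) == 1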
In [22]:
    
from graphfp.layers import FingerprintLayer, GraphInputLayer, LinearRegressionLayer, GraphConvLayer,\
    FullyConnectedLayer, DropoutLayer
from graphfp.utils import initialize_network, batch_sample
    
In [23]:
    
graphs[0].nodes(data=True)[0][1]['features'].shape
    
    Out[23]:
In [33]:
    
def train_loss(wb_vect, unflattener, batch=True, batch_size=2, debug=False):
    """
    Training loss is MSE.
    We pass in a flattened parameter vector and its unflattener.
    """
    wb_struct = unflattener(wb_vect)
    if not batch:
        batch_size = len(graphs)
    samp_graphs, samp_inputs = batch_sample(graphs, input_shape, batch_size)
    preds = predict(wb_struct, samp_inputs, samp_graphs)
    graph_ids = [g.graph['seqid'] for g in samp_graphs]
    graph_scores = fpv_data.set_index('seqid').loc[graph_ids]['FPV'].values.reshape(preds.shape)
    assert preds.shape == graph_scores.shape
    
    if debug:
        print(graph_ids)
        print('Predictions:')
        print(preds)
        print('Mean: {0}'.format(np.mean(preds)))
        print('')
        print('Actual')
        print(graph_scores)
        print('Mean: {0}'.format(np.mean(graph_scores)))
        print('')
        print('Difference')
        print(preds - graph_scores)
        print('Mean Squared Error: {0}'.format(np.mean(np.power(preds - graph_scores, 2))))
        print('')
    # print(preds.shape, graph_scores.shape)
    # assert preds.shape[1] == graph_scores.shape[1]
    # print(preds - graph_scores)
    mse = np.mean(np.power(preds - graph_scores, 2))
    return mse
    
In [34]:
    
# Quick check of the label-based indexing used inside train_loss.
graph_ids = ['4387-1', '2996-1', '2996-0', '4482-1']
fpv_data.set_index('seqid').loc[graph_ids]['FPV'].values.reshape((4, 1))
    
    Out[34]:
In [35]:
    
def predict(wb_struct, inputs, graphs):
    """
    Makes predictions by running the forward pass over all of the layers.
    Parameters:
    ===========
    - wb_struct: a dictionary of weights and biases stored for each layer.
    - inputs: the input data matrix. should be one row per graph.
    - graphs: a list of all graphs.
    """
    curr_inputs = inputs
    for i, layer in enumerate(layers):
        # print(type(wb_struct))
        wb = wb_struct['layer{0}_{1}'.format(i, layer)]
        curr_inputs = layer.forward_pass(wb, curr_inputs, graphs)
    return curr_inputs
    
In [36]:
    
def callback(wb, i):
    """
    Any function you want to run at each iteration of the optimization.
    """
    # from time import time
    from graphfp.flatten import flatten
    # new_time = time()
    wb_vect, wb_unflattener = flatten(wb)
    print('Iteration: {0}'.format(i))
    # print('Computing gradient w.r.t. weights...')
    print('Training Loss: ')
    
    tl = train_loss(wb_vect, wb_unflattener, batch=False)
    print(tl)
    print('')
    training_losses.append(tl)
    
In [37]:
    
from autograd import grad
grad_tl = grad(train_loss)
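As a quick sanity check that autograd's grad behaves as expected (a toy example, unrelated to the model):
In [ ]:

# d/dx of x**2 at x = 3.0 should be 6.0.
grad(lambda x: x ** 2)(3.0)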
    
In [38]:
    
len(graphs[0])
    
    Out[38]:
In [39]:
    
import networkx as nx
from collections import Counter
# Degree distribution of the first graph.
Counter([len(list(graphs[0].neighbors(n))) for n in graphs[0].nodes()])
    
    Out[39]:
In [63]:
    
from graphfp.optimizers import sgd, adam
input_shape = graphs[0].graph['input_shape']
layers = [# GraphConvLayer((input_shape[1], input_shape[1] * 2)),
          # GraphConvLayer((input_shape[1], input_shape[1])),
          # GraphConvLayer((input_shape[1], input_shape[1])),
          GraphConvLayer((input_shape[1], input_shape[1])),
          FingerprintLayer(input_shape[1]),
          # FullyConnectedLayer((input_shape[1], input_shape[1])),
          # DropoutLayer(p=0),
          # FullyConnectedLayer((input_shape[1], input_shape[1])),
          # DropoutLayer(p=0.3),
          # FullyConnectedLayer((input_shape[1], input_shape[1])),
          LinearRegressionLayer((input_shape[1], 1)),
]
wb_all = initialize_network(input_shape, graphs, layers)
training_losses = list()
    
In [64]:
    
from graphfp.flatten import flatten
wb_vect, wb_unflattener = flatten(wb_all)
train_loss(wb_vect, wb_unflattener, debug=True)
    
    
    Out[64]:
In [65]:
    
wb_vect, wb_unflattener = adam(grad_tl, wb_all, callback=callback, num_iters=1000)
wb_all = wb_unflattener(wb_vect)
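With the trained weights unflattened back into wb_all, full-batch predictions can be made directly, mirroring what train_loss does internally (a sketch):
In [ ]:

# Predict on every graph using the trained weights.
samp_graphs, samp_inputs = batch_sample(graphs, input_shape, len(graphs))
preds = predict(wb_all, samp_inputs, samp_graphs)
preds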
    
    
In [69]:
    
plt.plot(training_losses)
plt.yscale('log')
    
    
In [70]:
    
from graphfp.flatten import flatten
wb_vect, wb_unflattener = flatten(wb_all)
train_loss(wb_vect, wb_unflattener, debug=True)
    
    
    Out[70]:
In [72]:
    
wb_all['layer0_GraphConvLayer']['weights'].max()
    
    Out[72]:
In [ ]: