In [1]:
from pin import pin
import matplotlib.pyplot as plt
from collections import Counter
import json
import os
import autograd.numpy as np
import pickle as pkl
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
with open('../data/batch_summary.json') as f:
    model_data = json.load(f)
In [3]:
model_data['projects'][0]
Out[3]:
In [4]:
# Grab some summary statistics of the modelling.
# I'm curious to know how many times a particular template was chosen.
template_counter = Counter()
gmqe_scores = []
sequence_identity = []
sequence_similarity = []
for p in model_data['projects']:
    for m in p['models']:
        template_counter[m['template']] += 1
        gmqe_scores.append(m['GMQE'])
        sequence_identity.append(m['seq_id'])
        sequence_similarity.append(m['seq_sim'])
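As a quick check on the tallies before plotting, the most frequently chosen templates can be listed straight from the Counter; this inspection step is my own addition.
# Sketch: show the five most frequently chosen templates and their counts.
for template, count in template_counter.most_common(5):
    print(template, count)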
In [7]:
plt.hist(gmqe_scores)
Out[7]:
In [8]:
plt.hist(sequence_identity)
Out[8]:
In [9]:
plt.hist(sequence_similarity)
Out[9]:
In [10]:
model_data['projects'][0]
Out[10]:
In [25]:
# Write the python script here
script = '\n\
import networkx as nx\n\
import pickle\n\
from pin.pin import ProteinInteractionNetwork\n\
\n\
p = ProteinInteractionNetwork("model_01.pdb")\n\
with open("model_01.pkl", "wb") as f:\n\
    pickle.dump(p, f)\n\
'
print(script)
In [26]:
# Write the bash script here
bash = '\n\
#!/bin/sh \n\
#$ -S /bin/sh \n\
#$ -cwd \n\
#$ -V \n\
#$ -m e \n\
#$ -M ericmjl@mit.edu \n\
#$ -pe whole_nodes 1 \n\
#############################################\n\
\n\
python script.py\n\
'
print(bash)
In [27]:
# Write a master script for q-subbing
qsub = '\n\
#!/bin/sh \n\
#$ -S /bin/sh \n\
#$ -cwd \n\
#$ -V \n\
#$ -m e \n\
#$ -M ericmjl@mit.edu \n\
#$ -pe whole_nodes 1 \n\
#############################################\n\
\n\
'
print(qsub)
In [28]:
# Write the scripts to disk
mdl_dir = '../data/batch_models/'
for project in model_data['projects']:
    code = project['code']
    with open('{0}/{1}/script.py'.format(mdl_dir, code), 'w+') as f:
        f.write(script)
    with open('{0}/{1}/{1}.sh'.format(mdl_dir, code), 'w+') as f:
        f.write(bash)

with open('{0}/master.sh'.format(mdl_dir), 'w+') as f:
    f.write(qsub)
    for project in model_data['projects']:
        f.write('cd {0}\n'.format(project['code']))
        f.write('qsub {0}.sh\n'.format(project['code']))
        f.write('cd ..\n')
        f.write('\n')
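The loop above assumes that every project's directory already exists under mdl_dir; a quick guard with os (imported at the top but otherwise unused here) makes missing directories explicit. This check is a sketch, not part of the generated scripts.
# Sketch: list any project directories that are missing before writing into them.
missing = [p['code'] for p in model_data['projects']
           if not os.path.isdir(os.path.join(mdl_dir, p['code']))]
print('{0} project directories missing'.format(len(missing)))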
In [14]:
import pandas as pd
protease_data = pd.read_csv('../data/hiv_data/hiv-protease-data-expanded.csv', index_col=0)
protease_data.describe()
Out[14]:
Based on the number of samples available for each drug, let us start with NFV, which has the most values available.
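To confirm which drug has the most measurements, the non-null counts per column can be compared directly; this check is my own addition and assumes the drug resistance values live in the numeric columns of this table.
# Non-null entries per column; compare the drug columns to see which has the most data.
protease_data.count().sort_values(ascending=False)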
In [15]:
protease_data['NFV'].describe()
Out[15]:
I will log-transform these values, and then Z-standardize them.
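As a sketch of that two-step transform (the next cell ends up applying only the log step, with the scaling left commented out), the Z-standardization could be done with sklearn.preprocessing.scale:
# Sketch: log10-transform the resistance values, then Z-standardize to zero mean and unit variance.
from sklearn.preprocessing import scale

nfv_values = protease_data['NFV'].dropna()   # drop missing measurements
nfv_log = np.log10(nfv_values)               # log-transform
nfv_z = scale(nfv_log)                       # Z-standardize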
In [16]:
import sklearn.preprocessing as pp
# from sklearn.preprocessing import scale
import numpy as np
# Drop NaN values in the FPV column only
fpv_data = protease_data.dropna(subset=['FPV'])[['FPV', 'seqid']]
fpv_data['FPV'] = fpv_data['FPV'].apply(np.log10)
# fpv_data['FPV_scaled'] = pp.scale(fpv_data['FPV'])
fpv_data.hist()
fpv_data.head()
Out[16]:
In [17]:
# Grab a sample of projects, based on the seqid.
import random as rnd
# seqids_interest = rnd.sample(list(fpv_data['seqid']), 25)
proj_titles = {c['title']: c['code'] for c in model_data['projects']}
projs_interest = dict()
n_graphs = 4
while len(projs_interest) < n_graphs:
    seqid = rnd.choice(list(fpv_data['seqid']))
    if seqid in proj_titles:
        projs_interest[seqid] = proj_titles[seqid]
# for seqid in :
# if seqid not in proj_titles:
# print(seqid)
# projects_interest = {c['title']:c['code'] for c in model_data['projects'] if c['title'] in seqids_interest}
projs_interest
Out[17]:
In [18]:
fpv_data[fpv_data['seqid'].isin(projs_interest.keys())]
Out[18]:
In [19]:
projects = list(projs_interest.values())
seqids = list(projs_interest.keys())
p = pin.ProteinInteractionNetwork('../data/batch_models/{0}/model_01.pdb'.format(projects[0]))
p.graph['project'] = projects[0]
p.graph['input_shape'] = p.nodes(data=True)[0][1]['features'].shape
p.graph['seqid'] = seqids[0]
In [20]:
# Building all 3401 graphs takes about 2 hours when done on a single core, so parallelize the work across cores with joblib.
from tqdm import tqdm
from joblib import Parallel, delayed
graphs = []
def make_protein_graphs(project, seqid):
    """
    Custom function for this notebook to parallelize the making of protein graphs over individual cores.
    """
    from pin import pin
    p = pin.ProteinInteractionNetwork('../data/batch_models/{0}/model_01.pdb'.format(project))
    p.graph['project'] = project
    p.graph['input_shape'] = p.nodes(data=True)[0][1]['features'].shape
    p.graph['seqid'] = seqid
    return p
graphs = Parallel(n_jobs=-1)(delayed(make_protein_graphs)(proj, seqid) for seqid, proj in projs_interest.items())
len(graphs)
Out[20]:
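Because building the graphs is the slow step, the constructed list can be cached with pickle (pkl was imported at the top); the file path below is an assumption.
# Sketch: cache the constructed protein graphs to avoid rebuilding them later.
with open('../data/protein_graphs_sample.pkl', 'wb') as f:
    pkl.dump(graphs, f)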
In [21]:
# Check a sample of the graphs to make sure that the input shapes are correct.
graphs[0].graph['input_shape']
Out[21]:
In [22]:
from graphfp.layers import (FingerprintLayer, GraphInputLayer, LinearRegressionLayer,
                            GraphConvLayer, FullyConnectedLayer, DropoutLayer)
from graphfp.utils import initialize_network, batch_sample
In [23]:
graphs[0].nodes(data=True)[0][1]['features'].shape
Out[23]:
In [33]:
def train_loss(wb_vect, unflattener, batch=True, batch_size=2, debug=False):
    """
    Training loss is MSE.
    We pass in a flattened parameter vector and its unflattener.
    """
    wb_struct = unflattener(wb_vect)
    if not batch:
        batch_size = len(graphs)
    samp_graphs, samp_inputs = batch_sample(graphs, input_shape, batch_size)
    preds = predict(wb_struct, samp_inputs, samp_graphs)
    graph_ids = [g.graph['seqid'] for g in samp_graphs]
    # graph_scores = fpv_data[fpv_data['seqid'].isin(graph_ids)]['FPV'].values.reshape(preds.shape)
    graph_scores = fpv_data.set_index('seqid').ix[graph_ids]['FPV'].values.reshape(preds.shape)
    assert preds.shape == graph_scores.shape
    if debug:
        print(graph_ids)
        print('Predictions:')
        print(preds)
        print('Mean: {0}'.format(np.mean(preds)))
        print('')
        print('Actual')
        print(graph_scores)
        print('Mean: {0}'.format(np.mean(graph_scores)))
        print('')
        print('Difference')
        print(preds - graph_scores)
        print('Mean Squared Error: {0}'.format(np.mean(np.power(preds - graph_scores, 2))))
        print('')
    # print(preds.shape, graph_scores.shape)
    # assert preds.shape[1] == graph_scores.shape[1]
    # print(preds - graph_scores)
    mse = np.mean(np.power(preds - graph_scores, 2))
    return mse
In [34]:
# fpv_data[fpv_data['seqid'].isin(['2996-1', '2996-0'])]
graph_ids = ['4387-1', '2996-1', '2996-0', '4482-1']
fpv_data.set_index('seqid').ix[graph_ids]['FPV'].values.reshape((4,1))
Out[34]:
In [35]:
def predict(wb_struct, inputs, graphs):
    """
    Makes predictions by running the forward pass over all of the layers.

    Parameters:
    ===========
    - wb_struct: a dictionary of weights and biases stored for each layer.
    - inputs: the input data matrix; should be one row per graph.
    - graphs: a list of all graphs.
    """
    curr_inputs = inputs
    for i, layer in enumerate(layers):
        # print(type(wb_struct))
        wb = wb_struct['layer{0}_{1}'.format(i, layer)]
        curr_inputs = layer.forward_pass(wb, curr_inputs, graphs)
    return curr_inputs
In [36]:
def callback(wb, i):
    """
    Any function you want to run at each iteration of the optimization.
    """
    # from time import time
    from graphfp.flatten import flatten
    # new_time = time()
    wb_vect, wb_unflattener = flatten(wb)
    print('Iteration: {0}'.format(i))
    # print('Computing gradient w.r.t. weights...')
    print('Training Loss: ')
    tl = train_loss(wb_vect, wb_unflattener, batch=False)
    print(tl)
    print('')
    training_losses.append(tl)
In [37]:
from autograd import grad
grad_tl = grad(train_loss)
In [38]:
len(graphs[0])
Out[38]:
In [39]:
import networkx as nx
from collections import Counter
Counter([len(graphs[0].neighbors(n)) for n in graphs[0].nodes()])
Out[39]:
In [63]:
from graphfp.optimizers import sgd, adam
input_shape = graphs[0].graph['input_shape']
layers = [
    # GraphConvLayer((input_shape[1], input_shape[1] * 2)),
    # GraphConvLayer((input_shape[1], input_shape[1])),
    # GraphConvLayer((input_shape[1], input_shape[1])),
    GraphConvLayer((input_shape[1], input_shape[1])),
    FingerprintLayer(input_shape[1]),
    # FullyConnectedLayer((input_shape[1], input_shape[1])),
    # DropoutLayer(p=0),
    # FullyConnectedLayer((input_shape[1], input_shape[1])),
    # DropoutLayer(p=0.3),
    # FullyConnectedLayer((input_shape[1], input_shape[1])),
    LinearRegressionLayer((input_shape[1], 1)),
]
wb_all = initialize_network(input_shape, graphs, layers)
training_losses = list()
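As a sanity check on this architecture, the weights-and-biases structure can be flattened (the same flatten used in the next cell) to count the number of trainable parameters; this check is my own addition.
# Sketch: count trainable parameters by flattening the weight/bias structure.
from graphfp.flatten import flatten

wb_vect_check, _ = flatten(wb_all)
print('Number of trainable parameters: {0}'.format(len(wb_vect_check)))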
In [64]:
from graphfp.flatten import flatten
wb_vect, wb_unflattener = flatten(wb_all)
train_loss(wb_vect, wb_unflattener, debug=True)
Out[64]:
In [65]:
wb_vect, wb_unflattener = adam(grad_tl, wb_all, callback=callback, num_iters=1000)
wb_all = wb_unflattener(wb_vect)
In [69]:
plt.plot(training_losses)
plt.yscale('log')
In [70]:
from graphfp.flatten import flatten
wb_vect, wb_unflattener = flatten(wb_all)
train_loss(wb_vect, wb_unflattener, debug=True)
Out[70]:
In [72]:
wb_all['layer0_GraphConvLayer']['weights'].max()
Out[72]:
In [ ]: