We have some models that are trained on a subset of the data — e.g. the one trained on 100 graphs for 10,000 iterations, whose weights are loaded below. Let's check the generalization error on those.
In [1]:
from pin import pin
import pandas as pd
import json
import pickle as pkl
In [15]:
# Load the serialized weights-and-biases structure for the trained model.
# NOTE: pickle.load is only safe on trusted, locally-produced files.
with open('../experiments/outputs/100-graphs_10000-iters_wbs.pkl', 'rb') as fh:
    wb = pkl.load(fh)
In [16]:
# Sanity check: smallest weight in the first graph-convolution layer.
wb['layer0_GraphConvLayer']['weights'].min()
Out[16]:
In [17]:
# Load the expanded HIV protease resistance table; the first CSV column
# holds the row index.
data = pd.read_csv(
    '../data/hiv_data/hiv-protease-data-expanded.csv', index_col=0)
data.head()
Out[17]:
In [18]:
# Read the batch-run summary describing each modelled project.
with open('../data/batch_summary.json') as fh:
    model_data = json.load(fh)
In [23]:
# Inspect the metadata record for the first project to see its fields.
model_data['projects'][0]
Out[23]:
In [86]:
# Make the model.
n_graphs = 8


def make_protein_graphs(project, seqid):
    """
    Custom function for this script to parallelize the making of protein
    graphs over individual cores.

    Parameters:
    ===========
    - project: the project code; selects which PDB model file to load.
    - seqid: sequence identifier stored on the graph for later joins
      against the resistance data table.

    Returns:
    ========
    A ProteinInteractionNetwork with 'project', 'input_shape' and 'seqid'
    recorded in its graph-level metadata.
    """
    p = pin.ProteinInteractionNetwork('../data/batch_models/{0}/model_01.pdb'
                                      .format(project))
    p.graph['project'] = project
    # Every node carries a feature matrix of the same shape, so read the
    # shape off the first node.
    p.graph['input_shape'] = p.nodes(data=True)[0][1]['features'].shape
    p.graph['seqid'] = seqid
    return p


project_id = 10  # change this number to play around with it.
project = model_data['projects'][project_id]['code']
# Call the helper rather than duplicating its body inline. This also fixes
# a bug in the inline version, which read input_shape from
# p.nodes(data=True)[project_id] — indexing the node list by the project id
# instead of taking the shape from the first node.
p = make_protein_graphs(project, model_data['projects'][project_id]['title'])
In [79]:
# Confirm the metadata attached to the protein graph.
p.graph
Out[79]:
In [80]:
from graphfp.layers import GraphConvLayer, FingerprintLayer, LinearRegressionLayer
from graphfp.flatten import flatten
from graphfp.utils import batch_sample
In [81]:
# Assemble the model: graph convolution -> fingerprint -> linear regression.
input_shape = p.graph['input_shape']
n_feats = input_shape[1]  # number of per-node features
layers = [
    GraphConvLayer(kernel_shape=(n_feats, n_feats)),
    FingerprintLayer(shape=input_shape),
    LinearRegressionLayer(shape=(input_shape, 1)),
]
In [82]:
# Draw a single-graph batch to run through the network.
batch_size = 1
graphs = [p]
samp_graphs, samp_inputs = batch_sample(graphs, input_shape, batch_size)
In [83]:
def predict(wb_struct, inputs, graphs, layer_stack=None):
    """
    Makes predictions by running the forward pass over all of the layers.

    Parameters:
    ===========
    - wb_struct: a dictionary of weights and biases stored for each layer,
      keyed as 'layer{i}_{LayerName}'.
    - inputs: the input data matrix. should be one row per graph.
    - graphs: a list of all graphs.
    - layer_stack: sequence of layer objects to run, in order. Defaults to
      the notebook-global `layers` defined above, preserving the original
      call signature.

    Returns:
    ========
    The output of the final layer's forward pass (the predictions).
    """
    if layer_stack is None:
        # Backward-compatible fallback to the implicit notebook global.
        layer_stack = layers
    curr_inputs = inputs
    for i, layer in enumerate(layer_stack):
        # Keys are built from the layer's position and its str() name.
        wb = wb_struct['layer{0}_{1}'.format(i, layer)]
        curr_inputs = layer.forward_pass(wb, curr_inputs, graphs)
    return curr_inputs
In [84]:
# Forward pass with the loaded weights over the sampled batch.
preds = predict(wb, samp_inputs, samp_graphs)
preds
Out[84]:
In [85]:
# Look up the measured FPV resistance value(s) for this protein's seqid,
# using .loc to avoid chained indexing.
data.loc[data['seqid'] == p.graph['seqid'], 'FPV']
Out[85]:
The prediction is far from the measured FPV value for this held-out protein — clearly there is overfitting going on.
In [ ]: