We have some models that were trained on a subset of the data (e.g. the one trained on 100 protease graphs for 10,000 iterations, which is the weights file loaded below: `100-graphs_10000-iters_wbs.pkl`). Let's check the generalization error on those.


In [1]:
from pin import pin
import pandas as pd
import json
import pickle as pkl


/Users/ericmjl/anaconda/lib/python3.5/site-packages/sklearn/preprocessing/data.py:583: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
/Users/ericmjl/anaconda/lib/python3.5/site-packages/sklearn/preprocessing/data.py:583: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)

In [15]:
# Load the per-layer weights-and-biases dictionary written to the experiments
# output folder. NOTE: unpickling can execute arbitrary code, so only load
# files from a trusted source.
wb_path = '../experiments/outputs/100-graphs_10000-iters_wbs.pkl'
with open(wb_path, 'rb') as wb_file:
    wb = pkl.load(wb_file)

In [16]:
# Smallest weight stored for layer 0 (the graph convolution layer).
first_layer_wb = wb['layer0_GraphConvLayer']
first_layer_wb['weights'].min()


Out[16]:
-1.0994781577520054

In [17]:
# Read the expanded HIV protease data table; the first CSV column is the index.
hiv_data_path = '../data/hiv_data/hiv-protease-data-expanded.csv'
data = pd.read_csv(hiv_data_path, index_col=0)
data.head()


Out[17]:
ATV DRV FPV IDV LPV NFV SQV SeqID TPV seqid sequence sequence_object weight
0 NaN NaN 2.5 16.3 NaN 38.6 16.1 2996 NaN 2996-0 PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEDVNLPGRWKPKM... ID: 2996-0\nName: <unknown name>\nDescription:... 0.50
1 NaN NaN 2.5 16.3 NaN 38.6 16.1 2996 NaN 2996-1 PQITLWQRPIVTIKIGGQLKEALLDTGADDTVLEDVNLPGRWKPKM... ID: 2996-1\nName: <unknown name>\nDescription:... 0.50
2 NaN NaN 0.7 0.8 NaN 0.8 1.1 4387 NaN 4387-0 PQITLWQRPLVTIKVGGQLKEALLDTGADDTVLEDMELPGRWKPKM... ID: 4387-0\nName: <unknown name>\nDescription:... 0.25
3 NaN NaN 0.7 0.8 NaN 0.8 1.1 4387 NaN 4387-1 PQITLWQRPLVTIKVGGQLKEALLDTGADDTVLEDMELPGRWKPKM... ID: 4387-1\nName: <unknown name>\nDescription:... 0.25
4 NaN NaN 0.7 0.8 NaN 0.8 1.1 4387 NaN 4387-2 PQITLWQRPLVTIKVGGQLKEALLDTGADDTVLEDMELPGRWKPKM... ID: 4387-2\nName: <unknown name>\nDescription:... 0.25

In [18]:
# Load the batch summary; its 'projects' list is indexed in the cells below.
summary_path = '../data/batch_summary.json'
with open(summary_path) as summary_file:
    model_data = json.load(summary_file)

In [23]:
# Inspect the first project entry to see which fields each project carries.
model_data['projects'][0]


Out[23]:
{'code': 'wJ9TDy',
 'created': '2016-01-02',
 'models': [{'GMQE': 0.9800000191,
   'id': '01',
   'oligo_state': 'homo-dimer',
   'qmean': 0.3202740713,
   'seq_coverage': 1.0,
   'seq_id': 91.9191894531,
   'seq_sim': 0.5824936032,
   'status': 'COMPLETED',
   'template': '4ll3.1.B',
   'tpl_seq': 'PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF',
   'trg_seq': 'PQITLWQRPLVTIKIEGQLKEALLDTGADDTVLEEINLSGKWKPKMIGGIGGFIKVGQYDQITIEICGHKVIGTVLVGPTPVNIIGRNLLTQLGCTLNF'},
  {'GMQE': 0.9800000191,
   'id': '02',
   'oligo_state': 'homo-dimer',
   'qmean': -0.9170268789,
   'seq_coverage': 1.0,
   'seq_id': 91.9191894531,
   'seq_sim': 0.5824936032,
   'status': 'COMPLETED',
   'template': '1a8g.1.A',
   'tpl_seq': 'PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF',
   'trg_seq': 'PQITLWQRPLVTIKIEGQLKEALLDTGADDTVLEEINLSGKWKPKMIGGIGGFIKVGQYDQITIEICGHKVIGTVLVGPTPVNIIGRNLLTQLGCTLNF'},
  {'GMQE': 0.9800000191,
   'id': '03',
   'oligo_state': 'homo-dimer',
   'qmean': -0.5994937920000001,
   'seq_coverage': 1.0,
   'seq_id': 91.9191894531,
   'seq_sim': 0.5824936032,
   'status': 'COMPLETED',
   'template': '1vik.1.A',
   'tpl_seq': 'PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNF',
   'trg_seq': 'PQITLWQRPLVTIKIEGQLKEALLDTGADDTVLEEINLSGKWKPKMIGGIGGFIKVGQYDQITIEICGHKVIGTVLVGPTPVNIIGRNLLTQLGCTLNF'}],
 'target': 'PQITLWQRPLVTIKIEGQLKEALLDTGADDTVLEEINLSGKWKPKMIGGIGGFIKVGQYDQITIEICGHKVIGTVLVGPTPVNIIGRNLLTQLGCTLNF',
 'title': '259265-1',
 'type': 'TARGET_SEQUENCE'}

In [86]:
# Build the protein graph for one project (the network layers are defined in
# a later cell).

# NOTE(review): n_graphs is not referenced anywhere in the visible notebook;
# confirm whether it is still needed.
n_graphs = 8

def make_protein_graphs(project, seqid):
    """
    Build the protein interaction network for one project and tag it with
    metadata. Written as a standalone function so that graph construction
    can be farmed out to individual cores.

    Parameters:
    ===========
    - project: the project code; also the name of the folder under
      ../data/batch_models/ that holds model_01.pdb.
    - seqid: the identifier stored under the graph's 'seqid' key.

    Returns the network, with 'project', 'input_shape' and 'seqid' set in
    its `graph` dictionary.
    """
    pdb_path = '../data/batch_models/{0}/model_01.pdb'.format(project)
    network = pin.ProteinInteractionNetwork(pdb_path)

    # The shape of the first node's feature array is recorded as the input
    # shape for the graph.
    first_node = network.nodes(data=True)[0]
    network.graph.update(project=project,
                         input_shape=first_node[1]['features'].shape,
                         seqid=seqid)
    return network

project_id = 10  # index into model_data['projects']; change this number to play around with it.

# Build the graph through make_protein_graphs rather than repeating its body
# here. project_id indexes the project list, not the graph's nodes, so the
# input shape is read from the first node (inside the helper) instead of from
# node number `project_id`, which could pick an arbitrary node or run past
# the end of the node list.
project_entry = model_data['projects'][project_id]
project = project_entry['code']
p = make_protein_graphs(project, seqid=project_entry['title'])

In [79]:
# Show the metadata dictionary attached to the graph.
p.graph


Out[79]:
{'input_shape': (1, 36), 'project': 'rGXLEm', 'seqid': '259241-1'}

In [80]:
from graphfp.layers import GraphConvLayer, FingerprintLayer, LinearRegressionLayer
from graphfp.flatten import flatten
from graphfp.utils import batch_sample

In [81]:
input_shape = p.graph['input_shape']
n_features = input_shape[1]

# Layer stack, applied in order by the forward pass:
# graph convolution -> fingerprint -> linear regression.
layers = [
    GraphConvLayer(kernel_shape=(n_features, n_features)),
    FingerprintLayer(shape=input_shape),
    LinearRegressionLayer(shape=(input_shape, 1)),
]

In [82]:
# Sample a batch of one from a list holding only the graph built above.
batch_size = 1
graphs = [p]
samp_graphs, samp_inputs = batch_sample(graphs, input_shape, batch_size)

In [83]:
def predict(wb_struct, inputs, graphs, layer_stack=None):
    """
    Makes predictions by running the forward pass over all of the layers.

    Parameters:
    ===========
    - wb_struct: a dictionary of weights and biases stored for each layer,
      keyed by 'layer{index}_{layer}', where the layer object is rendered
      through str.format.
    - inputs: the input data matrix. should be one row per graph.
    - graphs: a list of all graphs.
    - layer_stack: optional ordered sequence of layer objects, each exposing
      forward_pass(wb, inputs, graphs). When omitted, the notebook-level
      `layers` list is used.

    Returns:
    ========
    The output of the final layer's forward pass (or `inputs` unchanged when
    the layer stack is empty).

    Raises:
    =======
    - KeyError: if wb_struct has no entry for one of the layers.
    """
    if layer_stack is None:
        # Fall back to the global layer list defined in an earlier cell.
        layer_stack = layers

    curr_inputs = inputs
    for i, layer in enumerate(layer_stack):
        layer_wb = wb_struct['layer{0}_{1}'.format(i, layer)]
        curr_inputs = layer.forward_pass(layer_wb, curr_inputs, graphs)
    return curr_inputs

In [84]:
# Run the forward pass on the sampled graph using the loaded weights.
preds = predict(wb, samp_inputs, samp_graphs)
preds


Out[84]:
array([[-0.42684438]])

In [85]:
# FPV value(s) recorded in the data table for this graph's sequence ID.
is_this_seq = data.seqid == p.graph['seqid']
data.loc[is_this_seq, 'FPV']


Out[85]:
6936    0.8
Name: FPV, dtype: float64

The prediction (about -0.43) is far from the FPV value recorded for this protease (0.8). Clearly there is overfitting going on.


In [ ]: