In [1]:
from autograd import grad
from memorytools import summarize_objects
import autograd.numpy as np
import pickle as pkl
import json
import pandas as pd
import matplotlib.pyplot as plt
import tracemalloc
import objgraph
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
tracemalloc.start()
In [3]:
# Load the HIV protease data, keeping only rows with an FPV resistance measurement.
df = pd.read_csv('../data/hiv_data/hiv-protease-data-expanded.csv', index_col=0)
df = df.dropna(subset=['FPV'])
df.head()
Out[3]:
In [4]:
# Open the numpy array of all graphs' data.
graph_arr = np.load('../data/feat_array.npy')
In [5]:
# Open the pickles that contain the graph information and node-neighbour information.
def unpickle_data(path):
    """Load and return a pickled object from `path`."""
    with open(path, 'rb') as f:
        data = pkl.load(f)
    return data

graph_idxs = unpickle_data('../data/graph_idxs.pkl')
graph_nodes = unpickle_data('../data/graph_nodes.pkl')
nodes_nbrs = unpickle_data('../data/nodes_nbrs.pkl')
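In [ ]:
# Added sanity check (a sketch, not part of the original run): every node index
# referenced by graph_idxs should point at a valid row of graph_arr.
max(idx for idxs in graph_idxs.values() for idx in idxs) < graph_arr.shape[0]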
In [6]:
list(graph_idxs.keys())[0:5]
# len(graph_idxs.keys())
Out[6]:
In [7]:
list(graph_nodes.items())[0]
Out[7]:
In [8]:
summarize_objects()
In [10]:
list(nodes_nbrs.items())[0]
Out[10]:
In [11]:
# Keep only the sequence IDs that appear both in graph_idxs and in df['seqid'].
intersect = set(df['seqid'].values).intersection(graph_idxs.keys())
len(intersect)
Out[11]:
In [12]:
# Reduce graph_idxs and graph_nodes to just the sequences in the intersection.
graph_idxs_red = dict()
graph_nodes_red = dict()
for g in intersect:
    graph_idxs_red[g] = graph_idxs[g]
    graph_nodes_red[g] = graph_nodes[g]
In [13]:
graph_idxs_red['46213-0']
Out[13]:
In [14]:
graph_arr.shape
Out[14]:
In [15]:
# Make one pass over the data to get the old/new index mapping, and
# build the final graph array that gets passed in as an input.
def reindex_data_matrix(graph_idxs_red, graph_arr):
    """
    Parameters:
    ===========
    - graph_idxs_red: reduced graph indices
    - graph_arr: the original matrix of (nodes by node_features)

    Returns:
    ========
    - graph_arr_fin: a reduced matrix of (nodes by node_features)
    - nodes_oldnew, nodes_newold: mappings between old and new indices
    """
    # Initialize a zero matrix sized to hold only the rows that are kept.
    all_idxs = np.concatenate(list(graph_idxs_red.values()))
    graph_arr_fin = np.zeros(shape=graph_arr[all_idxs].shape)

    # Initialize empty maps of node indices from the old to the new numbering.
    nodes_oldnew = dict()  # {old_idx: new_idx}
    nodes_newold = dict()  # {new_idx: old_idx}

    # Re-assign the rows of the reduced graphs to the zero matrix,
    # packing them densely in sorted-seqid order.
    curr_idx = 0
    for seqid, idxs in sorted(graph_idxs_red.items()):
        for idx in idxs:
            nodes_oldnew[idx] = curr_idx
            nodes_newold[curr_idx] = idx
            graph_arr_fin[curr_idx] = graph_arr[idx]
            curr_idx += 1
    return graph_arr_fin, nodes_oldnew, nodes_newold

graph_arr_fin, nodes_oldnew, nodes_newold = reindex_data_matrix(graph_idxs_red, graph_arr)
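In [ ]:
# Illustrative sketch (added; not part of the original run): a toy example of how
# `reindex_data_matrix` packs the kept rows densely and records the old/new index maps.
# The graph names 'g1'/'g2' and the 10x3 feature matrix are made up for illustration.
toy_arr = np.arange(30).reshape(10, 3)
toy_idxs = {'g1': np.array([5, 7]), 'g2': np.array([2])}
toy_fin, toy_oldnew, toy_newold = reindex_data_matrix(toy_idxs, toy_arr)
assert toy_fin.shape == (3, 3)
assert np.all(toy_fin[toy_oldnew[5]] == toy_arr[5])
toy_oldnew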
In [16]:
graph_arr_fin.shape
Out[16]:
In [17]:
len(nodes_oldnew)
Out[17]:
In [18]:
len(nodes_newold)
Out[18]:
In [19]:
# Check a random sample of the indices to make sure the rows were copied correctly.
from random import sample
n_samples = 10000
rnd_idxs = sample(range(graph_arr_fin.shape[0]), n_samples)
for new_idx in rnd_idxs:
    assert np.all(np.equal(graph_arr_fin[new_idx], graph_arr[nodes_newold[new_idx]]))
In [20]:
objgraph.most_common_types(limit=5)
Out[20]:
In [21]:
graph_arr_fin.shape
Out[21]:
In [22]:
# Finally, rework the nodes_nbrs, graph_idxs, and graph_nodes dictionaries with the corrected idxs.
# This is the key step; it is encapsulated in functions so it can be reused (e.g. for batch sampling).
from collections import defaultdict

def filter_and_reindex_nodes_and_neighbors(nodes_nbrs, nodes_oldnew):
    """
    - nodes_nbrs: a dictionary of nodes and their neighbors.
    - nodes_oldnew: a dictionary mapping old node indices to their new node indices.
    """
    nodes_nbrs_fin = defaultdict(list)
    for node, nbrs in sorted(nodes_nbrs.items()):
        if node in nodes_oldnew:
            for nbr in nbrs:
                nodes_nbrs_fin[nodes_oldnew[node]].append(nodes_oldnew[nbr])
    return nodes_nbrs_fin

nodes_nbrs_fin = filter_and_reindex_nodes_and_neighbors(nodes_nbrs, nodes_oldnew)
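In [ ]:
# Added sanity check (a sketch, not from the original run): every reindexed node should
# keep the same number of neighbours it had under its old index.
for new_node, new_nbrs in list(nodes_nbrs_fin.items())[:100]:
    assert len(new_nbrs) == len(nodes_nbrs[nodes_newold[new_node]])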
In [23]:
objgraph.most_common_types(limit=5)
Out[23]:
In [24]:
def filter_and_reindex_graph_idxs(graph_idxs, nodes_oldnew):
    """
    - graph_idxs: a dictionary of graphs and their original node indices.
    - nodes_oldnew: a dictionary mapping old node indices to their new node indices.
    """
    graph_idxs_fin = defaultdict(list)
    for seqid, nodes in sorted(graph_idxs.items()):
        for node in nodes:
            if node in nodes_oldnew:
                graph_idxs_fin[seqid].append(nodes_oldnew[node])
    return graph_idxs_fin

graph_idxs_fin = filter_and_reindex_graph_idxs(graph_idxs, nodes_oldnew)
In [25]:
def filter_and_reindex_graph_nodes(graph_nodes, nodes_oldnew):
    """
    - graph_nodes: a dictionary mapping each graph to a dictionary of node indices to node names.
    - nodes_oldnew: a dictionary mapping old node indices to their new node indices.
    """
    graph_nodes_fin = defaultdict(dict)
    for seqid, idx_node in sorted(graph_nodes.items()):
        for old_idx, node_name in idx_node.items():
            if old_idx in nodes_oldnew:
                graph_nodes_fin[seqid][nodes_oldnew[old_idx]] = node_name
    return graph_nodes_fin

graph_nodes_fin = filter_and_reindex_graph_nodes(graph_nodes, nodes_oldnew)
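In [ ]:
# Quick look (added sketch): the filtered-and-reindexed dictionaries should cover the
# same sequences that survived the dataframe/graph intersection.
len(intersect), len(graph_idxs_fin), len(graph_nodes_fin)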
In [26]:
objgraph.most_common_types(limit=5)
Out[26]:
In [27]:
from graphfp.layers import FingerprintLayer, LinearRegressionLayer, GraphConvLayer
from graphfp.utils import initialize_network
from pyflatten import flatten

# Stack a graph convolution layer, a fingerprint layer, and a linear regression layer,
# then flatten the initialized weights/biases into a single parameter vector.
layers = [GraphConvLayer(weights_shape=(36, 36), biases_shape=(1, 36)),
          FingerprintLayer(weights_shape=(36, 36), biases_shape=(1, 36)),
          LinearRegressionLayer(weights_shape=(36, 1), biases_shape=(1, 1)),
          ]
wb = initialize_network(layers_spec=layers)
wb_vect, unflattener = flatten(wb)
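In [ ]:
# Illustrative note (added): the optimizer works on the flat parameter vector `wb_vect`,
# while the layers index into the nested structure; `unflattener` maps the former back to
# the latter. A round trip should reproduce the per-layer keys (assumption: the structure
# returned by `initialize_network` is a dict keyed by layer).
list(unflattener(wb_vect).keys())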
In [28]:
objgraph.most_common_types(limit=5)
Out[28]:
In [29]:
# from random import sample

# def batch_sample(inputs, nodes_nbrs, graph_idxs, n_graphs):
#     """
#     Randomly samples n_graphs from all of the graphs, returns new inputs,
#     node_nbr dictionary, and graph_idx dictionary.
#     """
#     samp_graph_idxs = dict(sample(graph_idxs.items(), n_graphs))
#     assert len(samp_graph_idxs) == n_graphs, "There was an error in sampling."
#     concat_samp_idxs = np.concatenate([v for k, v in sorted(samp_graph_idxs.items())])
#     # print('Samp Idxs Shape')
#     # print(concat_samp_idxs.shape)
#     samp_nodes_nbrs = {i: nodes_nbrs[i] for i in concat_samp_idxs}
#     assert len(samp_nodes_nbrs) == len(concat_samp_idxs)
#     samp_inputs, samp_nodes_oldnew, samp_nodes_newold = reindex_data_matrix(samp_graph_idxs, inputs)
#     samp_nodes_nbrs = filter_and_reindex_nodes_and_neighbors(samp_nodes_nbrs, samp_nodes_oldnew)
#     samp_graph_idxs = filter_and_reindex_graph_idxs(samp_graph_idxs, samp_nodes_oldnew)
#     return samp_inputs, samp_nodes_nbrs, samp_graph_idxs

# n_sampled_graphs = 100
# samp_inputs, samp_nodes_nbrs, samp_graph_idxs = batch_sample(graph_arr_fin, nodes_nbrs_fin, graph_idxs_fin, n_sampled_graphs)
# assert samp_inputs.shape[1] == 36
# assert len(samp_nodes_nbrs) == samp_inputs.shape[0]
# assert len(samp_graph_idxs) == n_sampled_graphs
In [30]:
# # Scratchpad cell
# samp_graph_idxs = dict(sample(graph_idxs.items(), 10))
# assert len(samp_graph_idxs) == 10
# concat_samp_idxs = np.concatenate([i for i in samp_graph_idxs.values()])
# samp_nodes_nbrs = {i: nodes_nbrs[i] for i in concat_samp_idxs}
# assert len(samp_nodes_nbrs) == len(concat_samp_idxs)
In [31]:
# Used in conjunction with train_loss function in cell below.
from graphfp.binary_matrix_utils import to_sparse_format, to_scipy_csr_matrix
# samp_graph_arr, samp_node_nbrs, samp_graph_idx = batch_sample(graph_arr_fin, nodes_nbrs_fin, graph_idxs_fin, 10)
node_rows, node_cols, ones = to_sparse_format(nodes_nbrs_fin)
# nodes_nbrs_sparse = to_scipy_csr_matrix(nodes_nbrs_fin)
In [32]:
objgraph.most_common_types(limit=5)
Out[32]:
In [33]:
len(nodes_nbrs_fin)
Out[33]:
In [34]:
from scipy.sparse import csr_matrix

# Build a sparse (n_nodes x n_nodes) adjacency matrix from the node/neighbour pairs.
nodes_nbrs_compressed = csr_matrix((ones, (node_rows, node_cols)),
                                   shape=(len(nodes_nbrs_fin), len(nodes_nbrs_fin)))
nodes_nbrs_compressed
Out[34]:
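In [ ]:
# Toy sketch (added): a CSR adjacency matrix like `nodes_nbrs_compressed` lets neighbour
# features be summed with a single sparse matrix product; presumably this is why the
# node/neighbour dictionary is converted to this form before the forward pass.
toy_adj = csr_matrix((np.ones(4), ([0, 0, 1, 2], [1, 2, 0, 0])), shape=(3, 3))
toy_feats = np.arange(6.).reshape(3, 2)
toy_adj.dot(toy_feats)  # row i is the sum of the feature rows of node i's neighbours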
In [35]:
objgraph.most_common_types(limit=5)
Out[35]:
In [36]:
# %%prun
def predict(wb_struct, inputs, nodes_nbrs_compressed, graph_idxs, layers):
    """Feed the inputs forward through each layer, returning the final layer's outputs."""
    curr_inputs = inputs
    for i, layer in enumerate(layers):
        wb = wb_struct['layer{0}_{1}'.format(i, layer)]
        curr_inputs = layer.forward_pass(wb, curr_inputs, nodes_nbrs_compressed, graph_idxs)
    return curr_inputs

predict(wb, graph_arr_fin, nodes_nbrs_compressed, graph_idxs_fin, layers).shape
Out[36]:
In [37]:
# %%prun
# Prototype train_loss function.
wb_vect, unflattener = flatten(wb)

def get_actual(graph_idxs, df, preds):
    """Return the measured FPV resistances, sorted by seqid and shaped like `preds`."""
    sorted_graphs = sorted(graph_idxs.keys())
    # print(sorted_graphs)
    sorted_resistances = df[df['seqid'].isin(sorted_graphs)].set_index('seqid').loc[sorted_graphs]['FPV'].values
    # print(sorted_resistances)
    actual = sorted_resistances.reshape(preds.shape)
    return actual

train_losses = []
preds_iter = []
actual_iter = []

def train_loss(wb_vect, unflattener):
    # Old version - sample one random graph each time.
    # ------------------------------------------------
    # samp_graph_arr, samp_nodes_nbrs, samp_graph_idxs = batch_sample(graph_arr_fin, nodes_nbrs_fin, graph_idxs_fin, 1)
    # wb_struct = unflattener(wb_vect)
    # preds = predict(wb_struct, samp_graph_arr, samp_nodes_nbrs, samp_graph_idxs, layers)

    # New version - train on the full reduced dataset (no batch sampling).
    # Uses code in the cells above.
    # --------------------------------------------------------------------
    wb_struct = unflattener(wb_vect)
    preds = predict(wb_struct, graph_arr_fin, nodes_nbrs_compressed, graph_idxs_fin, layers)
    graph_scores = get_actual(graph_idxs_fin, df, preds)
    mse = np.mean(np.power(preds - graph_scores, 2))
    # train_losses.append(mse)
    # preds_iter.append(preds)
    # actual_iter.append(graph_scores)
    # print(mse)
    return mse

train_loss(wb_vect, unflattener)
Out[37]:
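In [ ]:
# Added alignment check (a sketch): `get_actual` should return one FPV value per surviving
# graph, in the same sorted-seqid order used by `predict`. The zero array here is only a
# shape stand-in for the predictions.
actual_check = get_actual(graph_idxs_fin, df, np.zeros((len(graph_idxs_fin), 1)))
actual_check.shape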
In [38]:
objgraph.most_common_types(limit=5)
Out[38]:
In [39]:
gradfunc = grad(train_loss)
In [40]:
# Take a memory snapshot before the gradient call, for the tracemalloc comparison below.
pre_grad = tracemalloc.take_snapshot()
gradfunc(wb_vect, unflattener)
Out[40]:
In [41]:
# Memory snapshot after the gradient call, for the tracemalloc comparison below.
post_grad = tracemalloc.take_snapshot()
objgraph.most_common_types(limit=5)
Out[41]:
In [44]:
from pympler import muppy, summary
In [43]:
all_objects = muppy.get_objects()
In [45]:
sum1 = summary.summarize(all_objects)
summary.print_(sum1)
In [44]:
# Compare allocations before and after the gradient computation, grouped by traceback.
top_stats = post_grad.compare_to(pre_grad, 'traceback')
In [45]:
print("[ Top 10 differences ]")
for stat in top_stats[:10]:
print(stat)
In [31]:
from graphfp.optimizers import adam
In [32]:
import gc
from time import time

training_losses = []

def callback(wb, i):
    """Record the training loss (and wall-clock time per evaluation) at each iteration."""
    start = time()
    tl = train_loss(*flatten(wb))
    if i % 1 == 0:  # modulus of 1 means: print on every iteration
        print(tl, time() - start)
    training_losses.append(tl)
    gc.collect()
In [ ]:
# adam(gradfunc, wb, callback=callback, num_iters=10)
wb_vect, unflattener = adam(gradfunc, wb, callback=callback, num_iters=5)
In [ ]:
# %matplotlib inline
import matplotlib.pyplot as plt
from autograd.core import getval

plt.plot([getval(i) for i in training_losses])
plt.yscale('log')