In [278]:
import tensorflow as tf
import numpy as np
import csv
import pandas as pd
from sklearn.cross_validation import KFold
if 'session' in locals() and session is not None:
print('Close interactive session')
session.close()
DATA = r'C:\Users\Ekpyrotic\Documents\Tensorflow stuff\binding affinity predictor\one_hot_encoded_dat.csv'
HLA_distances = r'C:\Users\Ekpyrotic\Documents\Tensorflow stuff\binding affinity predictor\crystal structures\HLAdistmatrix.csv'
HLA_dist = np.genfromtxt(HLA_distances, delimiter=',')
#1: HLA-A*:0201
#2: HLA-B*58:01
#3: HLA-B*57:01
newrows = []
with open(DATA) as csvfile:
i = 0
filereader = csv.reader(csvfile)
for row in filereader:
newrow = row
if row[0] == 'HLA-A*02:01':
i = i + 1
newrow[181] = HLA_dist[0,0]
elif row[0] == 'HLA-B*57:01':
newrow[181] = HLA_dist[1,0]
elif row[0] == 'HLA-B*57:01':
newrow[181] = HLA_dist[2,0]
else:
newrow = [] #get rid of other HLA types
#newrow[181] = 999 #keep other HLA type rows
if newrow:
newrows.append(newrow)
j = int(np.floor(i/5))
#column 0: HLA type
#column 1-181: one hot encoding of aa values
#column 182: HLA PH distance calc
#column 183: observed binding affinity
arrayform = np.asarray(newrows)
train = arrayform[j:,1:].astype(np.float)
np.random.shuffle(train)
train_x = train[:, 0:181]
train_y = train[:,-1]
test = arrayform[0:j,1:].astype(np.float)
test_set_data = test[:, 0:181]
test_set_target = test[:,-1]
folds = KFold(train_x.shape[0], n_folds= 5)
foldvals = list(folds)
def get_train_inputs(i):
    """Return (features, targets) tf constants for fold i's training rows."""
    train_idx, _ = foldvals[i]
    features = tf.constant(train_x[train_idx, :])
    targets = tf.constant(train_y[train_idx])
    return features, targets
def get_val_inputs(i):
    """Return (features, targets) tf constants for fold i's validation rows."""
    _, val_idx = foldvals[i]
    features = tf.constant(train_x[val_idx, :])
    targets = tf.constant(train_y[val_idx])
    return features, targets
def get_test_inputs():
    """Return (features, targets) tf constants for the held-out test split."""
    return tf.constant(test_set_data), tf.constant(test_set_target)
# now normalize our inputs and predictors
def maxminNormalize(array):
    """Linearly rescale `array` so its global min maps to 0 and max to 1.

    Note: normalizes over the whole array (not per column).
    """
    lo = np.amin(array)
    hi = np.amax(array)
    return (array - lo) / (hi - lo)
# Rescale features and targets to [0, 1].
# NOTE(review): the train and test splits are each normalized with their OWN
# min/max rather than reusing the training statistics, so the two splits are
# not guaranteed to be on a common scale — confirm this is intended.
train_x = maxminNormalize(train_x) #topology
#train_x = maxminNormalize(train_x[:,:-1]) #no topology
train_y = maxminNormalize(train_y)
test_set_data = maxminNormalize(test_set_data) #topology
#test_set_data = maxminNormalize(test_set_data[:,:-1]) #no topology
test_set_target = maxminNormalize(test_set_target)
In [268]:
Out[268]:
In [285]:
features = [tf.contrib.layers.real_valued_column("",dimension = 181)] #9mer data + distance from HLA a0201
#features = [tf.contrib.layers.real_valued_column("",dimension = 180)] #9mer data only

# Per-fold evaluation loss; despite the name this holds the regression 'loss'
# metric (MSE) from evaluate(), not a classification accuracy.
accuracy_score = np.zeros(5)
tf.logging.set_verbosity(tf.logging.ERROR)
#tf.logging.set_verbosity(tf.logging.INFO)

# Monitor performance on a validation set --> early stopping if we don't improve performance after 200 steps
# NOTE(review): this dict is currently unused — the `metrics=` argument to the
# ValidationMonitor below is commented out.
validation_metrics = {
    "accuracy":
        tf.contrib.learn.MetricSpec(
            metric_fn=tf.contrib.metrics.streaming_accuracy,
            prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
    "precision":
        tf.contrib.learn.MetricSpec(
            metric_fn=tf.contrib.metrics.streaming_precision,
            prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
    "recall":
        tf.contrib.learn.MetricSpec(
            metric_fn=tf.contrib.metrics.streaming_recall,
            prediction_key=tf.contrib.learn.PredictionKey.CLASSES)
}

# Train one DNN regressor per cross-validation fold.
for i in range(0,5):
    # Evaluate the validation fold every 50 steps; stop early if 'loss' has
    # not improved in the last 200 steps.
    validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        input_fn = lambda:get_val_inputs(i),
        eval_steps=1, # fix an issue
        every_n_steps=50,
        #metrics=validation_metrics,
        early_stopping_metric='loss',
        early_stopping_metric_minimize=True,
        early_stopping_rounds=200)
    classifier = tf.contrib.learn.DNNRegressor(hidden_units = [55,20,10],feature_columns=features,
        optimizer=tf.train.ProximalAdagradOptimizer(
            learning_rate=0.01,
        ),
        #optimizer = tf.train.AdadeltaOptimizer(learning_rate = alpha_vals),
        # Checkpoint every 50 steps so the ValidationMonitor (which evaluates
        # from the most recent checkpoint) sees up-to-date weights.
        config = tf.contrib.learn.RunConfig(save_checkpoints_steps = 50, save_checkpoints_secs = None)
    )
    classifier.fit(input_fn = lambda:get_train_inputs(i), steps = 20000, monitors = [validation_monitor])
    #accuracy_score[i] = classifier.evaluate(input_fn=get_test_inputs, steps = 500)['loss']
    # Record this fold's validation loss.
    accuracy_score[i] = classifier.evaluate(input_fn=lambda:get_val_inputs(i), steps = 500)['loss']
    #accuracy_score[i] = classifier.evaluate(x = test_set_data, y = test_set_target, steps = 500)['loss']
    print(i)

# NOTE(review): this is the average of the per-fold *validation* losses, not
# the held-out test set, despite the "Test MSE" label — confirm the wording.
print("\nTest MSE: {0:f}\n".format(np.average(accuracy_score)))
In [291]:
# Re-evaluate the most recently trained fold's validation loss.
# NOTE(review): relies on `i` and `classifier` leaking from the training loop
# above — only valid immediately after that cell has run.
accuracy_score[i] = classifier.evaluate(input_fn=lambda:get_val_inputs(i), steps = 500)['loss']
print(accuracy_score[0])
In [130]:
classifier.evaluate(input_fn=get_test_inputs,steps = 20)
Out[130]:
In [274]:
accuracy_score  # per-fold losses from the "no topology" run
Out[274]: