In [278]:
import tensorflow as tf
import numpy as np
import csv
import pandas as pd
from sklearn.cross_validation import KFold

if 'session' in locals() and session is not None:
    print('Close interactive session')
    session.close()
    

DATA      = r'C:\Users\Ekpyrotic\Documents\Tensorflow stuff\binding affinity predictor\one_hot_encoded_dat.csv'
HLA_distances  = r'C:\Users\Ekpyrotic\Documents\Tensorflow stuff\binding affinity predictor\crystal structures\HLAdistmatrix.csv'
HLA_dist = np.genfromtxt(HLA_distances, delimiter=',')
#1: HLA-A*:0201
#2: HLA-B*58:01
#3: HLA-B*57:01

newrows   = []
with open(DATA) as csvfile:
    i = 0
    filereader = csv.reader(csvfile)
    for row in filereader:
        newrow = row
        if row[0] == 'HLA-A*02:01':
            i = i + 1
            newrow[181] = HLA_dist[0,0]
        elif row[0] == 'HLA-B*57:01':
            newrow[181] = HLA_dist[1,0]
        elif row[0] == 'HLA-B*57:01':
            newrow[181] = HLA_dist[2,0]
        else:
            newrow = []         #get rid of other HLA types
            #newrow[181] = 999   #keep other HLA type rows
        if newrow:
            newrows.append(newrow)

j = int(np.floor(i/5))

#column 0: HLA type
#column 1-181: one hot encoding of aa values
#column 182: HLA PH distance calc
#column 183: observed binding affinity

arrayform = np.asarray(newrows)

train     = arrayform[j:,1:].astype(np.float)
np.random.shuffle(train)
train_x   =  train[:, 0:181]
train_y   =  train[:,-1]

test      = arrayform[0:j,1:].astype(np.float)
test_set_data   =  test[:, 0:181]
test_set_target =  test[:,-1]


folds = KFold(train_x.shape[0], n_folds= 5)
foldvals = list(folds)


def get_train_inputs(i):
    train, test = foldvals[i]
    x = tf.constant(train_x[train,:])    
    y = tf.constant(train_y[train])
    return x, y

def get_val_inputs(i):
    train, test = foldvals[i]
    x = tf.constant(train_x[test,:])
    y = tf.constant(train_y[test])
    return x, y

def get_test_inputs():
    x = tf.constant(test_set_data)          
    y = tf.constant(test_set_target)
    return x, y

#now normalize our inputs and predictors 
def maxminNormalize(array):
    normalized = np.divide(np.subtract(array, np.amin(array)), 
                           np.subtract(np.amax(array), np.amin(array)))
    return normalized

train_x = maxminNormalize(train_x) #topology
#train_x = maxminNormalize(train_x[:,:-1]) #no topology
train_y = maxminNormalize(train_y)

test_set_data   = maxminNormalize(test_set_data)  #topology
#test_set_data   = maxminNormalize(test_set_data[:,:-1])  #no topology
test_set_target = maxminNormalize(test_set_target)

In [268]:



Out[268]:
18843

In [285]:
features = [tf.contrib.layers.real_valued_column("",dimension = 181)]       #9mer data + distance from HLA a0201 
#features = [tf.contrib.layers.real_valued_column("",dimension = 180)]      #9mer data only

accuracy_score = np.zeros(5)  
tf.logging.set_verbosity(tf.logging.ERROR)
#tf.logging.set_verbosity(tf.logging.INFO)

# Monitor performance on a validation set --> early stopping if we don't improve performance after 200 steps
validation_metrics = {
    "accuracy":
        tf.contrib.learn.MetricSpec(
            metric_fn=tf.contrib.metrics.streaming_accuracy,
            prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
    "precision":
        tf.contrib.learn.MetricSpec(
            metric_fn=tf.contrib.metrics.streaming_precision,
            prediction_key=tf.contrib.learn.PredictionKey.CLASSES),
    "recall":
        tf.contrib.learn.MetricSpec(
            metric_fn=tf.contrib.metrics.streaming_recall,
            prediction_key=tf.contrib.learn.PredictionKey.CLASSES)
}

    
# train our networks
for i in range(0,5):
    
    validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
                    input_fn = lambda:get_val_inputs(i),
                    eval_steps=1,  # fix an issue
                    every_n_steps=50,
                    #metrics=validation_metrics,
                    early_stopping_metric='loss',
                    early_stopping_metric_minimize=True,
                    early_stopping_rounds=200)
    
    classifier = tf.contrib.learn.DNNRegressor(hidden_units = [55,20,10],feature_columns=features,
                                            optimizer=tf.train.ProximalAdagradOptimizer(
                                            learning_rate=0.01,
                                            ),
                                            #optimizer = tf.train.AdadeltaOptimizer(learning_rate = alpha_vals),
                                            config = tf.contrib.learn.RunConfig(save_checkpoints_steps = 50, save_checkpoints_secs = None)
                                            )
    
    classifier.fit(input_fn = lambda:get_train_inputs(i), steps = 20000, monitors = [validation_monitor])
    
    #accuracy_score[i] = classifier.evaluate(input_fn=get_test_inputs, steps = 500)['loss']
    accuracy_score[i] = classifier.evaluate(input_fn=lambda:get_val_inputs(i), steps = 500)['loss']
    #accuracy_score[i] = classifier.evaluate(x = test_set_data, y = test_set_target, steps = 500)['loss']
    print(i)

print("\nTest MSE: {0:f}\n".format(np.average(accuracy_score)))


0

Test MSE: 0.000229


In [291]:
accuracy_score[i] = classifier.evaluate(input_fn=lambda:get_val_inputs(i), steps = 500)['loss']
print(accuracy_score[0])


4.82698487758e-05

In [130]:
classifier.evaluate(input_fn=get_test_inputs,steps = 20)


Out[130]:
{'global_step': 20, 'loss': 3.4737761e+09}

In [274]:
accuracy_score (no top)


Out[274]:
array([ 0.0011428 ,  0.00114273,  0.00114245,  0.00114324,  0.00114224,
        0.0011427 ,  0.00114226,  0.00114199,  0.00114431,  0.0011426 ])