In [1]:
#Single task network to compare with Vlado
import numpy as np
import pandas as pd
import torch
import scipy as sp
import scipy.stats  #make sp.stats available for spearmanr below
# from bayes_opt import BayesianOptimization
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from math import sqrt
import random
import itertools
In [2]:
#Load some of the data
exp_data = pd.read_csv('../exp.tab', sep='\t', index_col=0)
cnv_data = pd.read_csv('../cnv.tab', sep='\t', index_col=0)
ydat = pd.read_csv('../labels.tab', sep='\t', index_col=0)
train_activity_data = pd.read_csv('../train_activity.tab', sep='\t')
test_activity_data = pd.read_csv('../test_activity.tab', sep='\t')
#labels
traininglabels = train_activity_data.columns[1:]
testinglabels = test_activity_data.columns[1:]
#concatenate two data frames
frames = [exp_data, cnv_data]
xdatw = pd.concat(frames)
# traininglabels
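#Sanity-check sketch (assumes genes are rows and cell lines are columns,
#which the xdatw[labels].transpose() indexing in later cells relies on):
#the two frames must share the same cell-line columns before row-stacking.
assert list(exp_data.columns) == list(cnv_data.columns), \
    "exp and cnv must cover the same cell lines"
print(exp_data.shape, cnv_data.shape, xdatw.shape)  #xdatw rows = exp rows + cnv rows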
In [12]:
#Deep Learning Net Class
class EssentialityNet:
def __init__(self):
self.inputnum = xdatw.shape[0]
self.trainscores = []
self.testscoreslist = []
self.learning_rate = 0.00009
self.H = 100
self.n_iter = 300 #training iterations
        self.minimum = float('inf')  #best test loss seen so far (early stopping)
        self.stopcounter = 3  #early-stopping patience
self.layernum = 1
self.layers = []
#model
self.model = torch.nn.Sequential(
torch.nn.Linear(self.inputnum, self.H),
torch.nn.ReLU(),
torch.nn.Linear(self.H, 1),
)
#set loss function and optimizer
self.loss = torch.nn.MSELoss()
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
    #plot train vs. test loss curves
    def plot(self, trainscores, testscores):
        x = np.arange(len(trainscores))
        plt.plot(x, trainscores, label='Train')
        plt.plot(x, np.asarray(testscores), label='Test')
        plt.title('Training vs Test Loss')
        plt.xlabel('NN Training Iterations')
        plt.ylabel('MSE Loss')
        plt.legend()
    #build the network with the requested number of layers and neurons
def setModel(self, Layernum, Neuronnum):
self.layernum = int(round(Layernum))
self.H = int(round(Neuronnum))
#initial input layer
self.layers.append(torch.nn.Linear(self.inputnum, self.H))
for n in range(self.layernum):
if n != 0:
self.layers.append(torch.nn.Linear(self.H, self.H))
self.layers.append(torch.nn.ReLU())
self.layers.append(torch.nn.Linear(self.H, 1))
        #replace the default model with the chosen layer stack
self.model = torch.nn.Sequential(*self.layers)
    #L2 regularization via Adam's weight_decay
    def setRegularization(self, L2Reg):
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate, weight_decay=L2Reg)
    def fit(self, xtrain, ytrain, xtest, ytest):
        #convert to float tensors; targets get an explicit column so their
        #shape matches the (n, 1) network output inside MSELoss
        xtrain_var = Variable(torch.FloatTensor(xtrain))
        xtest_var = Variable(torch.FloatTensor(xtest))
        ytrain_var = Variable(torch.FloatTensor(ytrain)).view(-1, 1)
        ytest_var = Variable(torch.FloatTensor(ytest)).view(-1, 1)
        for t in range(self.n_iter):
            #calculate training loss
            ypred = self.model(xtrain_var)
            diff = self.loss(ypred, ytrain_var)
            self.trainscores.append(diff.item())
            #test performance
            ypredtest = self.model(xtest_var)
            difftest = self.loss(ypredtest, ytest_var)
            #early stopping: remember the best test loss and stop once it
            #fails to improve for three consecutive checks
            if difftest.item() < self.minimum:
                self.minimum = difftest.item()
                self.stopcounter = 3
            elif t > 10:
                self.stopcounter -= 1
                if self.stopcounter == 0:
                    self.n_iter = t
                    self.trainscores.pop()
                    break
            self.testscoreslist.append(difftest.item())
            #zero gradients
            self.optimizer.zero_grad()
            #backpropagate
            diff.backward()
            #update weights
            self.optimizer.step()
    #predict for a matrix of inputs; returns an (n, 1) numpy array
def predict(self, X):
X_var = Variable(torch.FloatTensor(X))
predicts = self.model(X_var)
return predicts.data.numpy()
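#Quick illustration (a sketch, not part of the original runs): print the
#layer stack setModel builds for a 3-hidden-layer, 100-neuron network.
demo_net = EssentialityNet()
demo_net.setModel(3, 100)
print(demo_net.model)  #Linear -> ReLU -> (Linear -> ReLU) x 2 -> Linear(H, 1)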
#other functions for running the nn
#note: figureoutnetwork reads xtrain_val/ytrain_val/xtest_val/ytest_val and
#xtest/ytest from the notebook's global scope, so the split cells below must
#run first
def figureoutnetwork(layernum, neuronnum, l2reg):
    n = EssentialityNet()
    n.setModel(layernum, neuronnum)
    n.setRegularization(l2reg)
    n.fit(xtrain_val, ytrain_val, xtest_val, ytest_val)
    predictions = n.predict(xtest)
    rmse = calculateRMSE_single(predictions, ytest)
    cor = calculateCor_single(predictions, ytest)
    return [rmse, cor]
#calculate RMSE for a single task
def calculateRMSE_single(predicts, actuals):
    #flatten the (n, 1) prediction array into a 1-D vector
    predicts2 = np.asarray([val for sublist in predicts for val in sublist])
    return sqrt(((predicts2 - actuals)**2).mean())
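#Toy check of calculateRMSE_single (illustrative numbers, not project data);
#the nested list mimics the (n, 1) array that predict() returns:
print(calculateRMSE_single([[1.0], [2.0], [4.0]], np.array([1.0, 2.0, 2.0])))
#sqrt((0 + 0 + 4) / 3) ~ 1.155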
#calculate RMSE for multitask output: average the per-task RMSEs
def calculateRMSE(predicts, actuals):
    mses = []
    multitaskrmses = []
    preds = predicts.data.numpy()
    for i in range(preds.shape[1]):
        mses.append(((preds[:, i] - actuals[:, i])**2).mean())
        multitaskrmses.append(sqrt(mses[i]))
    print(len(multitaskrmses))  #sanity check: one RMSE per task
    return np.mean(multitaskrmses)
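#Toy check of calculateRMSE (illustrative numbers): one column per task; the
#function averages the per-task RMSEs, here (0 + 1) / 2 = 0.5.
toy_preds = Variable(torch.FloatTensor([[1.0, 0.0], [2.0, 2.0]]))
toy_actuals = np.array([[1.0, 1.0], [2.0, 1.0]])
print(calculateRMSE(toy_preds, toy_actuals))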
#Spearman correlation between predictions and actuals for a single task
def calculateCor_single(predicts, actuals):
    predicts2 = [val for sublist in predicts for val in sublist]
    #spearmanr returns (correlation, pvalue); take the coefficient directly
    #rather than slicing it out of the result's string representation
    cor, _ = sp.stats.spearmanr(actuals, predicts2)
    return cor
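#sp.stats.spearmanr returns a (correlation, pvalue) pair, so the coefficient
#can be unpacked directly (toy example; rho here is 0.8):
rho, pval = sp.stats.spearmanr([1, 2, 3, 4], [1, 3, 2, 4])
print(rho)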
#compute per-task RMSEs and write them to disk, one value per line
def saveRMSE(predicts, actuals, fileno):
    #fileno was read from the global scope in the original; passing it in
    #explicitly avoids a NameError when the cell runs fresh
    mses = []
    multitaskrmses = []
    preds = predicts.data.numpy()
    for i in range(preds.shape[1]):
        mses.append(((preds[:, i] - actuals[:, i])**2).mean())
        multitaskrmses.append(sqrt(mses[i]))
    #write the rmses and close the file automatically
    with open('rmses_single_' + str(fileno) + '.tab', 'w') as rmses_file:
        for item in multitaskrmses:
            rmses_file.write("%s\n" % item)
In [38]:
#run a single-task network for each of the 1138 selected tasks
mselist = {}
#best tasks that vlado tested with
topTasks = pd.read_csv("../combined_stats.tab", sep='\t')
tasks = topTasks.iloc[:,0].values
#keep the tasks we want
goodydat = ydat.transpose()[tasks]
goodydat = goodydat.transpose()
goodydat
Out[38]:
(dataframe display of goodydat omitted)
In [46]:
#open file to save the rmses and correlations
rmses_file = open('rmses_singles.tab', 'w')
for i in range(1138):  #one model per task; 1138 = number of selected tasks
#initialize
n = EssentialityNet()
#get name
name = goodydat.index[i]
#get training and testing labels that vlado used
traininglabelsdat = pd.read_csv('../train_test_splits/train_test_splits/' + name + '_train_idx', header = None)
testinglabelsdat = pd.read_csv('../train_test_splits/train_test_splits/' + name + '_test_idx', header = None)
#convert them to lists
traininglabels = traininglabelsdat.iloc[:,0].values
testinglabels = testinglabelsdat.iloc[:,0].values
#index the data
xtraindat = xdatw[traininglabels].transpose()
xtestdat = xdatw[testinglabels].transpose()
ytraindat = goodydat[traininglabels].transpose()
ytestdat = goodydat[testinglabels].transpose()
    #normalize inputs (preprocessing.scale standardizes each set with its own
    #statistics; a leakage-safer variant is sketched after this cell)
    xtrain = preprocessing.scale(xtraindat)
    xtest = preprocessing.scale(xtestdat)
#get column (task)
ytrain = np.array(ytraindat.iloc[:,i])
ytest = np.array(ytestdat.iloc[:,i])
#split for validation
xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(xtrain, ytrain, test_size=0.2, random_state=434)
print(i)
#fit and predict
rmse, cor = figureoutnetwork(3, 100, 0.012)
rmses_file.write(name + '\t' + str(rmse) + '\t' + str(cor) + '\n')
#close the file
rmses_file.close()
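#A leakage-safer alternative to the per-set preprocessing.scale calls above
#(a sketch, not what the run above used; it reuses the xtraindat/xtestdat
#frames left over from the loop's last iteration): fit the scaler on the
#training split only, then apply the same transform to the test split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(xtraindat)
xtrain_scaled = scaler.transform(xtraindat)
xtest_scaled = scaler.transform(xtestdat)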
In [44]:
rmses_file.close()  #re-close in case the loop above was interrupted
In [66]:
#try different architectures on random tasks (a small manual grid search)
for _ in range(10):
    #pick a random task index
    i = random.randint(0, 1137)
#initialize
n = EssentialityNet()
#best tasks
topTasks = pd.read_csv("../combined_stats.tab", sep='\t')
tasks = topTasks.iloc[:,0].values
goodydat = ydat.transpose()[tasks]
goodydat = goodydat.transpose()
#get name
name = goodydat.index[i]
#get training and testing labels that vlado used
traininglabelsdat = pd.read_csv('../train_test_splits/train_test_splits/' + name + '_train_idx', header = None)
testinglabelsdat = pd.read_csv('../train_test_splits/train_test_splits/' + name + '_test_idx', header = None)
#convert them to lists
traininglabels = traininglabelsdat.iloc[:,0].values
testinglabels = testinglabelsdat.iloc[:,0].values
#index the data
xtraindat = xdatw[traininglabels].transpose()
xtestdat = xdatw[testinglabels].transpose()
ytraindat = goodydat[traininglabels].transpose()
ytestdat = goodydat[testinglabels].transpose()
#normalize inputs
xtrain = preprocessing.scale(xtraindat)
xtest = preprocessing.scale(xtestdat)
#get column (task)
ytrain = np.array(ytraindat.iloc[:,i])
ytest = np.array(ytestdat.iloc[:,i])
#split for validation
xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(xtrain, ytrain, test_size=0.2, random_state=434)
# print(i)
print(name)
#fit and predict
# print("RMSE: ")
# print(figureoutnetwork(1, 100, 0.012))
# print(figureoutnetwork(1, 356, 0.012))
# print(figureoutnetwork(2, 100, 0.012))
# print(figureoutnetwork(2, 356, 0.012))
r1,c1 = figureoutnetwork(3, 100, 0.012)
# print(figureoutnetwork(3, 356, 0.012))
r2,c2 = figureoutnetwork(4, 100, 0.012)
# print(figureoutnetwork(4, 356, 0.012))
    print(r1 - r2)  #positive means the 4-layer net had the lower RMSE
#3 layers, 356 neurons looked best in earlier runs
# torch.save(n, "/home/user/Documents/stuart_research/data/data/exp_cnv_activity/single_models/" + name + ".txt")
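#A fuller architecture sweep (a sketch using the itertools import from the
#top of the notebook; the grid values are illustrative, not tuned):
for layers, neurons in itertools.product([1, 2, 3, 4], [100, 356]):
    sweep_rmse, sweep_cor = figureoutnetwork(layers, neurons, 0.012)
    print(layers, neurons, sweep_rmse, sweep_cor)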