MULTI-TASK DEEP LEARNING NEURAL NETWORK

This notebook repeats a subset of the code from the other multi-task file: only what is required to run the multi-task neural net multiple times over random train/test splits.


In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import torch
from bayes_opt import BayesianOptimization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from math import sqrt
import random
import itertools

#Load some of the data
exp_data = pd.read_csv('../exp.tab', sep='\t', index_col=0)
cnv_data = pd.read_csv('../cnv.tab', sep='\t', index_col=0)
ydat = pd.read_csv('../labels.tab', sep='\t', index_col=0)
train_activity_data = pd.read_csv('../train_activity.tab', sep='\t')
test_activity_data = pd.read_csv('../test_activity.tab', sep='\t')

#best 1138 tasks (one row per task in combined_stats.tab)
top_tasks = pd.read_csv("../combined_stats.tab", sep='\t')
tasks = top_tasks.iloc[:,0].values
ydat_best = ydat.transpose()[tasks]
ydat_best = ydat_best.transpose()

#concatenate the expression and copy-number frames into one feature matrix (rows = features)
frames = [exp_data, cnv_data]

xdatw = pd.concat(frames)
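
A quick shape check can catch loading problems early. This is an illustrative sketch, not part of the original run; the exact dimensions depend on your copies of the .tab files, but in this notebook the features sit in rows and the 206 cell lines in columns.

print(xdatw.shape)      #(n_features, n_cell_lines): expression and copy-number rows stacked
print(ydat_best.shape)  #(1138, n_cell_lines): one row per selected task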

In [2]:
#Deep Learning Net Class

class EssentialityNet:

    def __init__(self):
        self.inputnum = xdatw.shape[0]   #number of input features (rows of xdatw)
        self.trainscores = []            #training loss per iteration
        self.testscoreslist = []         #test loss per iteration
        self.learning_rate = 0.00009
        self.H = 100                     #hidden layer width
        self.n_iter = 300                #training iterations
        self.minimum = 100000            #test loss at the previous check (for early stopping)
        self.stopcounter = 3             #early-stopping patience
        self.layernum = 1                #number of hidden layers
        self.layers = []
                
        #model (default: one hidden layer)
        self.model = torch.nn.Sequential(
            torch.nn.Linear(self.inputnum, self.H),
            torch.nn.ReLU(),
            torch.nn.Linear(self.H, 1138),
        )
        
        #set loss function and optimizer
        self.loss = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
    
    #plot train vs test loss curves
    def plot(self, trainscores, testscores):
        x = np.arange(len(trainscores))
        plt.plot(x, trainscores, label='Train')
        plt.title('Training vs Test Loss')
        plt.xlabel('NN Training Iterations')
        plt.ylabel('MSE Loss')

        plt.plot(np.arange(len(testscores)), np.asarray(testscores), label='Test')
        plt.legend()
        
    #rebuild self.model with the requested depth and width
    def setModel(self, Layernum, Neuronnum):

        self.layernum = int(round(Layernum))
        self.H = int(round(Neuronnum))
        self.layers = []  #reset so repeated calls do not accumulate layers

        #initial input layer
        self.layers.append(torch.nn.Linear(self.inputnum, self.H))

        for n in range(self.layernum):
            if n != 0:
                self.layers.append(torch.nn.Linear(self.H, self.H))
            self.layers.append(torch.nn.ReLU())

        self.layers.append(torch.nn.Linear(self.H, 1138))

        #swap in the chosen layers (call setRegularization afterwards so the
        #optimizer tracks the new parameters)
        self.model = torch.nn.Sequential(*self.layers)
    
    #recreate the optimizer with L2 regularization (Adam weight decay)
    def setRegularization(self, L2Reg):
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate, weight_decay=L2Reg)

    def fit(self, xtrain, ytrain, xtest, ytest):

        #convert to float tensors (Variable wrappers and .data[0] are
        #pre-0.4 PyTorch idioms; .item() works on current versions)
        xtrain_var = torch.FloatTensor(xtrain)
        xtest_var = torch.FloatTensor(xtest)
        ytrain_var = torch.FloatTensor(ytrain)
        ytest_var = torch.FloatTensor(ytest)

        for t in range(self.n_iter):

            #calculate training loss
            ypred = self.model(xtrain_var)
            diff = self.loss(ypred, ytrain_var)
            self.trainscores.append(diff.item())

            #test performance (no gradients needed)
            with torch.no_grad():
                ypredtest = self.model(xtest_var)
                difftest = self.loss(ypredtest, ytest_var)

            #early stopping: self.minimum holds the test loss from the previous
            #iteration; lose one unit of patience each time the loss rises,
            #regain one when it falls again
            if t > 10 and self.minimum < difftest.item():
                self.stopcounter -= 1

                if self.stopcounter == 0:
                    self.n_iter = t
                    self.trainscores.pop()
                    break
            elif t > 10 and self.stopcounter < 3:
                self.stopcounter += 1

            self.minimum = difftest.item()

            self.testscoreslist.append(difftest.item())

            #zero gradients
            self.optimizer.zero_grad()
            #backpropagate
            diff.backward()
            #update weights
            self.optimizer.step()

    # predict on new data (no gradients needed at inference time)
    def predict(self, X):

        with torch.no_grad():
            return self.model(torch.FloatTensor(X))
    
#helper functions for running the nn; they rely on the globals defined in the
#loop below (xtrain_val, ytrain_val, xtest_val, ytest_val, xtest, ytest, fileno)

def figureoutnetwork(layernum, neuronnum, l2reg):
    n = EssentialityNet()
    n.setModel(layernum, neuronnum)
    n.setRegularization(l2reg)
            
    n.fit(xtrain_val, ytrain_val, xtest_val, ytest_val)
    predictions = n.predict(xtest)
#     return(calculateRMSE(predictions, ytest))
    saveRMSE(predictions, ytest)

#same as figureoutnetwork, but with the depth fixed at 3 and the mean RMSE returned
def figureoutnetwork3(neuronnum, l2reg):
    n = EssentialityNet()
    n.setModel(3, neuronnum)
    n.setRegularization(l2reg)
            
    n.fit(xtrain_val, ytrain_val, xtest_val, ytest_val)
    predictions = n.predict(xtest)
    return(calculateRMSE(predictions, ytest))
    
#per-task RMSEs, averaged over all tasks
def calculateRMSE(predicts, actuals):
    mses = []  
    multitaskrmses = []
    preds = predicts.data.numpy()

    for i in range(preds.shape[1]):
        mses.append(((preds[:,i] - actuals[:,i])**2).mean())
        multitaskrmses.append(sqrt(mses[i]))

    print(len(multitaskrmses))       
    return(np.mean(multitaskrmses))

#compute per-task RMSEs and write them to rmses_<fileno>.tab, one per line
def saveRMSE(predicts, actuals):
    mses = []
    multitaskrmses = []
    preds = predicts.data.numpy()

    for i in range(preds.shape[1]):
        mses.append(((preds[:,i] - actuals[:,i])**2).mean())
        multitaskrmses.append(sqrt(mses[i]))

    #the with-block closes the file once everything is written
    with open('rmses_' + str(fileno) + ".tab", 'w') as rmses_file:
        for item in multitaskrmses:
            rmses_file.write("%s\n" % item)
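
Before launching the full loop below, it can help to confirm what setModel actually builds. A minimal sketch, not part of the original run; the hyperparameters mirror the commented-out figureoutnetwork(3, 356, 0.012) call below:

net = EssentialityNet()
net.setModel(3, 356)          #Linear(in, 356) -> ReLU -> [Linear(356, 356) -> ReLU] x 2 -> Linear(356, 1138)
net.setRegularization(0.012)  #Adam with weight_decay=0.012
print(net.model)              #inspect the Sequential stack before training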

In [5]:
for fileno in range(10):

    #random split: 142 cell lines for training, the remaining 64 for testing
    traininglabels = random.sample(range(0, 206), 142)
    traininglabels.sort()
    testinglabels = [x for x in range(206) if x not in traininglabels]

    #index the data with the proper labels
    xtrain_not_norm = xdatw.iloc[:,traininglabels].transpose()
    xtest_not_norm = xdatw.iloc[:,testinglabels].transpose()
    ytrain = ydat_best.iloc[:,traininglabels].transpose().values
    ytest = ydat_best.iloc[:,testinglabels].transpose().values

    #normalize inputs
    xtrain = preprocessing.scale(xtrain_not_norm)
    xtest = preprocessing.scale(xtest_not_norm)

    #create validation set
    xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(xtrain, ytrain, test_size=0.2, random_state=434)

    #uncomment to retrain; this rewrites the saved rmses_<fileno>.tab files
    # figureoutnetwork(3, 356, 0.012)
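
If you are modifying the split logic, a couple of assertions make the intent explicit. A sketch, assuming the 206-cell-line setup above:

assert len(set(traininglabels) & set(testinglabels)) == 0  #train and test are disjoint
assert len(traininglabels) + len(testinglabels) == 206     #together they cover all 206 cell lines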

In [22]:
#read the ten per-run RMSE files and concatenate them column-wise
myrmses = [pd.read_csv('rmses_{}.tab'.format(i), header=None) for i in range(10)]

rmses_total = pd.concat(myrmses, axis=1)
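
If a run failed part-way, the concatenated frame will be the wrong shape, so a quick check is worthwhile. Illustrative, assuming the 1138-task, 10-run setup above:

assert rmses_total.shape == (1138, 10)  #one row per task, one column per random split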

In [49]:
#per-task means across the ten trials
rmse_means = rmses_total.mean(axis=1)
dnnlist = list(rmse_means.values)
print(np.mean(dnnlist))

#save to file (uncomment to write mean_rmses.tab)
# with open('mean_rmses.tab', 'w') as mean_rmses_file:
#     for item in rmse_means.values:
#         mean_rmses_file.write("%s\n" % item)


1.01065570239

In [46]:
#compare the DNN against the other methods, task by task
top_tasks['dnn.rmse'] = dnnlist

top_tasks = top_tasks[['jklm.rmse', 'ranger.rmse', 'mkl.d9.rmse', 'rf.d9.rmse', 'glmm.dense.rmse', 'glmm.sparse.rmse', 'dnn.rmse']]
top_tasks


Out[46]:
jklm.rmse ranger.rmse mkl.d9.rmse rf.d9.rmse glmm.dense.rmse glmm.sparse.rmse dnn.rmse
0 0.952833 0.960496 0.948655 0.961768 0.962843 0.962658 0.882877
1 1.060365 1.074067 1.058601 1.067794 1.032906 1.124052 1.007232
2 0.946342 0.931658 0.975606 0.941134 0.934963 0.973344 0.881774
3 1.074409 1.053793 1.068510 1.056220 1.171792 1.019199 1.017913
4 1.298735 1.284004 1.276571 1.346348 1.254156 1.316945 1.292043
5 1.019320 1.018533 1.026431 1.039957 1.031455 1.032844 0.968281
6 0.903709 0.906174 0.892708 0.901269 0.897027 0.939720 0.954082
7 0.987778 0.990707 1.014963 1.050599 1.015290 1.056278 0.916968
8 0.962102 0.944543 0.925711 0.934242 0.961486 0.963235 0.937306
9 1.025180 1.034207 1.028851 1.061582 1.021382 0.997408 0.945645
10 0.864839 0.879444 0.866002 0.860268 0.882059 0.903176 0.877987
11 1.026121 1.016678 1.006676 1.025923 1.029813 1.054445 0.974374
12 1.066715 1.078802 1.086855 1.074777 1.098704 1.082767 0.962467
13 0.912657 0.927705 0.919456 0.933019 0.914373 0.892304 1.049915
14 0.960091 0.955081 0.952893 0.968033 0.971832 0.971949 0.898844
15 1.014610 1.005765 0.995154 1.026589 1.030348 1.007847 1.027795
16 0.936305 0.960090 0.969868 0.978125 0.932842 0.953257 0.849989
17 0.968715 0.978110 0.968780 1.003761 0.963905 1.009469 0.984028
18 1.026594 0.972322 0.963828 0.990733 0.992096 0.951621 0.983280
19 0.906035 0.934353 0.924058 0.931046 0.944596 0.927455 0.931251
20 1.153014 1.157117 1.152325 1.167979 1.161337 1.186105 1.182163
21 1.018916 1.042071 1.102796 1.184638 1.059646 1.030637 1.055953
22 0.998337 1.010587 1.002333 1.036592 1.013084 1.065147 1.040864
23 0.970767 0.990813 0.993429 0.999777 1.022317 0.960106 0.912818
24 0.898563 0.910198 0.928346 0.931359 0.971286 0.971286 0.899371
25 1.101058 1.122674 1.122188 1.147670 1.131612 1.133874 1.182630
26 0.905662 0.927240 0.929007 0.974975 0.926293 0.924893 0.931853
27 1.076554 1.078623 1.086000 1.107094 1.092432 1.080861 0.975462
28 0.911745 0.899485 0.933249 0.954863 0.944321 0.937255 0.887756
29 1.278873 1.300430 1.264563 1.287667 1.264107 1.288997 1.319286
... ... ... ... ... ... ... ...
1108 0.824665 0.818785 0.828190 0.857140 0.851707 0.826038 0.893217
1109 0.877833 0.871966 0.900916 0.908297 0.917863 0.917566 0.888731
1110 0.913307 0.895742 0.907654 0.924261 0.911022 0.939010 0.889690
1111 0.890177 0.909035 0.920792 0.966972 0.954292 0.997018 0.816057
1112 0.869413 0.862565 0.894360 0.912791 0.899760 0.890375 0.889486
1113 0.910647 0.934896 0.941350 0.945469 0.970018 0.981674 0.996912
1114 0.941791 0.946309 0.974123 1.014387 0.990833 0.994785 0.913325
1115 0.880239 0.892297 0.877591 0.911228 0.886479 0.896507 0.880617
1116 0.935164 0.938772 0.969023 0.990525 0.973272 0.968305 0.936234
1117 0.804401 0.796536 0.812495 0.803534 0.808039 0.779075 0.891616
1118 1.003451 1.005641 0.971408 0.997005 0.995884 0.999130 1.016141
1119 1.109512 1.113757 1.089068 1.113132 1.180525 1.114830 1.049505
1120 0.853283 0.842644 0.857619 0.882108 0.883380 0.876703 0.896816
1121 1.053327 1.055018 1.047172 1.058848 1.046278 1.070476 1.042978
1122 0.839807 0.825289 0.864418 0.882356 0.866215 0.843092 0.890023
1123 0.810731 0.793734 0.781794 0.801653 0.816091 0.794254 0.912470
1124 1.016015 1.016341 1.012411 1.026898 1.007053 1.006702 1.053143
1125 1.006789 1.009246 0.987176 1.019357 1.035754 1.017252 1.012933
1126 0.900048 0.887136 0.923358 0.924773 0.995944 0.906718 0.887253
1127 0.866667 0.873464 0.842159 0.861137 0.864173 1.004010 0.921179
1128 1.124805 1.139015 1.148699 1.161558 1.185100 1.161221 1.026753
1129 0.781559 0.798718 0.808136 0.842453 0.816329 0.814878 0.815916
1130 0.923668 0.927178 0.956415 0.954501 0.976227 0.994896 0.909458
1131 0.887437 0.885006 0.895667 0.902937 0.895119 0.903921 0.949747
1132 0.868581 0.863576 0.876061 0.867810 0.904657 0.911519 1.015141
1133 1.269603 1.269768 1.266867 1.300112 1.267734 1.304537 1.370521
1134 0.859855 0.840592 0.885673 0.909632 0.899399 0.879226 0.896755
1135 0.822099 0.806193 0.823133 0.826042 0.821425 0.919244 0.888820
1136 0.935305 0.925420 0.940226 0.955089 0.946809 1.058379 0.981741
1137 0.948006 0.957404 0.961448 0.966228 0.941657 0.945153 0.905251

1138 rows × 7 columns


In [47]:
#count, for each task, which method achieved the lowest RMSE
best_rmse = {'dnn.rmse':0, 'glmm.dense.rmse':0, 'mkl.d9.rmse':0, 'rf.d9.rmse':0, 'jklm.rmse':0, 'glmm.sparse.rmse':0, 'ranger.rmse':0}

for i in range(1138):
    best_rmse[top_tasks.iloc[i].idxmin()] += 1

best_rmse


Out[47]:
{'dnn.rmse': 393,
 'glmm.dense.rmse': 115,
 'glmm.sparse.rmse': 113,
 'jklm.rmse': 213,
 'mkl.d9.rmse': 112,
 'ranger.rmse': 163,
 'rf.d9.rmse': 29}
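
For reference, pandas can produce the same tally in one line; this should agree with best_rmse up to ordering (a sketch, not part of the original run):

top_tasks.idxmin(axis=1).value_counts()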

In [48]:
#bar chart of which method predicted best, per task
colors = 'rgbkymc'

keys = [s.replace(".rmse", "").upper() for s in best_rmse.keys()]

plt.bar(range(len(best_rmse)), list(best_rmse.values()), align='center', color=colors)
plt.xticks(range(len(best_rmse)), keys, rotation='vertical')
plt.title('Number of Essentiality Scores Predicted Best for Each Method')

plt.show()