In [7]:
#'ridge,poly,3'.split(',')
import pandas as pd

In [9]:
hu = pd.Series()

In [13]:
hu.append(pd.Series([1,2,3]))


Out[13]:
0    1
1    2
2    3
dtype: int64

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [1]:
import matplotlib
import numpy as np
import math
import itertools
import sklearn
import warnings
import pandas as pd
import sklearn.linear_model
import matplotlib.pyplot as plt
from prompred import *
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-fcb638e39843> in <module>()
      8 import sklearn.linear_model
      9 import matplotlib.pyplot as plt
---> 10 from prompred import *
     11 import warnings
     12 warnings.filterwarnings('ignore')

ImportError: No module named 'prompred'

Using ordinary mutalik dataset

promoter sequences range = $[-42,1]$

Import data

In [7]:
# Import data for model
dfDatasetOrder = pd.read_csv("data/mut_rand_mod_lib.csv")
# Shuffle
dfDataset = dfDatasetOrder.reindex(np.random.permutation(dfDatasetOrder.index))
Determine functions

Compatible Models:

  • Ordinary Least Squares: OLS
      Parameters: coef0
  • Random Forests (Classification + Regression) : forestReg, forestClass
      Parameters: max_depth, max_features, min_samples
  • Ridge Regressen: ridge
      Parameters: alpha, coef0
  • Lasso Regression: lasso
      Parameters: alpha, coef0
  • Support vectors (Classification + Regression) : SVR, SVC
      Parameters: alpha, gamma, coef0
      Kernels: poly, ...
  • Ridge kernels:
      Parameters: alpha, gamma, coef0
      Kernels: poly, ...

In [8]:
plt.close('all')


# sequence region
seqRange = [-42,1]
ROI =  [[-5,14],[-8,12]]


#model selection and hyperparameters
regType = 'ridge'
kernel = 'poly'
poly= 3

# To be evaluated parameter(s)
parLabel = ['alpha']
parRange = [20] 



## OPTIONAL ##

testSize = 0.2 
k = 5 
kInner = 5
pdf = None

treeCount = 10
gamma = 0.1
alpha = 0.1
coef0 = 0.00
penalty = 0.1
epsilon = 1.95


parModel = {"regType":regType, "poly":poly, "kernel":kernel, "treeCount":treeCount, "gamma":gamma, "coef0":coef0}


###############################################
####### DO NOT TOUCH ##########################

labels, positionBox, spacer = regionSelect(dfDataset, ROI , seqRange)
positionMatrix = positionBox.values

Xdf = positionBox
X = positionMatrix
y = [math.sqrt(math.sqrt(u)) for u in labels]

if len(parLabel) is 1: 
    scoresParCV, optimalParCV = KfoldCV(X,y,k,parModel,parLabel[0],parRange[0]) 
    scoresParNCV, optimalParNCV, scoresNCV = nestedKfoldCV(X,y,k,kInner,parModel,parLabel[0],parRange[0])    
    meanScores = np.mean(np.ndarray.max(scoresParCV,axis=1))
    print("K FOLD CV \n---------- \n\n Maximum Score: ",np.max(scoresParCV), "\n Mean optimal score: ", meanScores ,"\n sd optimal scores: ", math.sqrt(np.sum(np.power((np.ndarray.max(scoresParCV,axis=1)-meanScores),2)))   , "\n Optimal parEval:\n", optimalParCV, "\n parEval Scores:\n", scoresParCV,"\n\n\n")
    print("NESTED K FOLD CV \n----------------- \n\n Maximum Score: ",np.max(scoresParNCV), "\n Mean optimal score: ", np.mean(scoresParNCV) ,"\n sd optimal scores: ", math.sqrt(np.sum(np.power((np.ndarray.max(scoresParCV,axis=1)-np.mean(scoresParNCV)),2))) , "\n Optimal parEval:\n", optimalParNCV, "\n parEval Scores:\n", scoresParNCV,"\n\n\n")

if len(parLabel) is 2:
    evaluateMultiPar(X, y, parModel, parLabel, parRange)
    
    
if (regType in ["ridge","lasso","OLS"] and poly is None): 
    reg = selectRegression(regType, alpha=np.median(optimalParCV), poly=poly, kernel=kernel) 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize) 
    reg.fit(X_train,y_train) 
    plotPositionCoefficients(reg, positionBox, posRange)


K FOLD CV 
---------- 

 Maximum Score:  0.810860649274 
 Mean optimal score:  0.690810757593 
 sd optimal scores:  0.2624868796385607 
 Optimal parEval:
 [ 10.  10.  10.  10.  10.] 
 parEval Scores:
 [[ 0.71611078  0.71611078  0.71611078  0.7161108   0.71611106  0.71611358
   0.71613886  0.71639082  0.71883945  0.73801897  0.79532796  0.81086065
   0.60076835 -0.63133802 -3.31943219 -4.7210334  -4.93421924 -4.95666493
  -4.95892138 -4.95914714]
 [ 0.56990507  0.56990507  0.56990507  0.56990511  0.56990545  0.56990889
   0.56994328  0.57028575  0.57357545  0.59702223  0.64896171  0.67791996
   0.60281545 -0.08441013 -2.09341978 -3.3491861  -3.54739822 -3.56835177
  -3.57045906 -3.57066991]
 [ 0.65402488  0.65402488  0.65402488  0.65402489  0.65402498  0.6540259
   0.65403513  0.65412707  0.65501776  0.66213235  0.68827192  0.70847576
   0.5390123  -0.67374781 -3.51495945 -5.05676838 -5.29104575 -5.31569659
  -5.31817455 -5.31842247]
 [ 0.34493072  0.34493072  0.34493072  0.34493073  0.3449308   0.34493155
   0.34493903  0.3450134   0.34571907  0.35064616  0.37579747  0.47638122
   0.46700151 -0.3625758  -3.0317792  -4.61485602 -4.85966051 -4.88547231
  -4.88806751 -4.88832717]
 [ 0.54512492  0.54512492  0.54512492  0.54512496  0.54512535  0.54512921
   0.54516781  0.54555267  0.54928485  0.57820361  0.67383874  0.7804162
   0.61635972 -0.73406701 -3.79786622 -5.46185919 -5.71627118 -5.7430645
  -5.74575807 -5.74602757]] 



NESTED K FOLD CV 
----------------- 

 Maximum Score:  0.810860649274 
 Mean optimal score:  0.690810757593 
 sd optimal scores:  0.2624868796385607 
 Optimal parEval:
 [[ 10.  10.  10.  10.  10.]
 [ 10.  10.  10.  10.  10.]
 [ 10.  10.  10.  10.  10.]
 [ 10.  10.  10.  10.  10.]
 [ 10.  10.  10.  10.  10.]] 
 parEval Scores:
 [ 0.81086065  0.67791996  0.70847576  0.47638122  0.7804162 ] 



Evaluate datasets using ranking scoring

Pearson's ranking


In [10]:
##################################
### TEST SETS
plt.close('all')
###################################


#data input
dfDatasetTest = pd.read_csv("data/anderson_lib.csv")

#sequence range
seqRange = [-42,1]

#regions of interest (wrt 35- and 10-box)
ROI =  [[0,12],[-12,12]]

#model selection and hyperparameters
regType = 'ridge'
kernel = 'poly'
poly= '3'

#parameters to be evaluated
parLabel = ['alpha']
parRange = [20] 

treeCount = 100
gamma = 0.1

manInput = True

parInput = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.1, "treeCount":treeCount, "alpha": 10000, "coef0":1} 


#####################################################################################################
#####################################################################################################

# Import data for model

dfDatasetTrain = pd.read_csv("data/mut_rand_mod_lib.csv")

dfDatasetTest['sequence'] = dfDatasetTest['sequence'].str.upper()
dfDatasetTest = dfDatasetTest.sort(columns='mean_score',ascending=False)

labelsTrain, positionBoxTrain, spacerTrain = regionSelect(dfDatasetTrain, ROI, seqRange)
labelsTest, positionBoxTest, spacerTest = regionSelect(dfDatasetTest, ROI, seqRange)

positionMatrixTrain = positionBoxTrain.values
positionMatrixTest = positionBoxTest.values


Xdf = positionBoxTrain
X = positionMatrixTrain
y = [math.sqrt(math.sqrt(u)) for u in labelsTrain]


Xtest = positionMatrixTest
ytest = labelsTest

#############################


if manInput == False:
    
    parModel = {"regType":regType, "poly":poly, "kernel":kernel, "treeCount":treeCount, "gamma":gamma }

    parEval = getParameters(parLabel,parRange)
    reg = selectRegression(**parModel)
    GS = GridSearchCV(reg, parEval)
    GS.fit(X,y)
    reg = GS.best_estimator_
    print(GS.best_estimator_)
    
else:
    reg = selectRegression(**parInput)
    
    
reg.fit(X,y)
rankPredict = reg.predict(Xtest)

print(np.transpose(np.vstack((dfDatasetTest['sequence'].values,dfDatasetTest['mean_score'].values,rankPredict))))
print(stats.spearmanr(dfDatasetTest['mean_score'].values,rankPredict))
plt.plot(dfDatasetTest['mean_score'].values,rankPredict, 'ro')


[['TTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGC' 1.0 0.6773006547906902]
 ['TTGACAGCTAGCTCAGTCCTAGGTACTGTGCTAGC' 0.86 0.6637173864464466]
 ['TTGACAGCTAGCTCAGTCCTAGGTATTGTGCTAGC' 0.72 0.7172305992508803]
 ['TTTACAGCTAGCTCAGTCCTAGGTATTATGCTAGC' 0.7 0.6948214680387206]
 ['TTGACGGCTAGCTCAGTCCTAGGTATAGTGCTAGC' 0.58 0.7330175282302626]
 ['TTGACGGCTAGCTCAGTCCTAGGTATTGTGCTAGC' 0.56 0.6706066745564058]
 ['CTGACAGCTAGCTCAGTCCTAGGTATAATGCTAGC' 0.51 0.748994095544717]
 ['TTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGC' 0.47 0.6588864157596785]
 ['TTTACGGCTAGCTCAGCCCTAGGTATTATGCTAGC' 0.36 0.6735205990348696]
 ['TTTACGGCTAGCTCAGTCCTAGGTACAATGCTAGC' 0.33 0.6560475111066038]
 ['TTTACGGCTAGCTCAGTCCTAGGTACTATGCTAGC' 0.24 0.5977600425149104]
 ['TTGACAGCTAGCTCAGTCCTAGGGACTATGCTAGC' 0.16 0.6529016547220068]
 ['TTTATAGCTAGCTCAGCCCTTGGTACAATGCTAGC' 0.15 0.6839614060584718]
 ['TTTATGGCTAGCTCAGTCCTAGGTACAATGCTAGC' 0.1 0.6115189768644659]
 ['TTGACAGCTAGCTCAGTCCTAGGGATTGTGCTAGC' 0.06 0.6558067439921897]
 ['TTTACAGCTAGCTCAGTCCTAGGGACTGTGCTAGC' 0.04 0.5394713415471155]
 ['CTGATAGCTAGCTCAGTCCTAGGGATTATGCTAGC' 0.01 0.5810662553440638]
 ['CTGATGGCTAGCTCAGTCCTAGGGATTATGCTAGC' 0.01 0.5424169758140124]
 ['CTGATAGCTAGCTCAGTCCTAGGGATTATGCTAGC' 0.0 0.5810662553440636]]
SpearmanrResult(correlation=0.79157532850691492, pvalue=5.4029213121474631e-05)
Out[10]:
[<matplotlib.lines.Line2D at 0x7f1c41fa35c0>]

In [ ]:
##################################
### TEST SETS
plt.close('all')
###################################

#data input
dfDatasetTest = pd.read_csv("data/brewster_lib.csv")


#sequence range
seqRange = [-42,1]

#regions of interest (wrt 35- and 10-box)
ROI =  [[-5,13],[-8,12]]

#model selection and hyperparameters
regType = 'ridge'
kernel = 'poly'
poly= '3'

#parameters to be evaluated
parLabel = ['alpha']
parRange = [20] 

treeCount = 100
gamma = 0.1


# Evaluate for best hyperparameters <-> Manual input
manInput = True

parInput =  {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.1, "treeCount":treeCount, "alpha": 0.000001, "coef0":1 } 
    

#####################################################################################################
#####################################################################################################

# Import data for model

dfDatasetTrain = pd.read_csv("data/mut_rand_mod_lib.csv")

dfDatasetTest['sequence'] = dfDatasetTest['sequence'].str.upper()
dfDatasetTest = dfDatasetTest.sort(columns='mean_score',ascending=False)

labelsTrain, positionBoxTrain, spacerTrain = regionSelect(dfDatasetTrain, ROI, seqRange)
labelsTest, positionBoxTest, spacerTest = regionSelect(dfDatasetTest, ROI, seqRange)

positionMatrixTrain = positionBoxTrain.values
positionMatrixTest = positionBoxTest.values


Xdf = positionBoxTrain
X = positionMatrixTrain
y = [math.sqrt(math.sqrt(u)) for u in labelsTrain]


Xtest = positionMatrixTest
ytest = labelsTest

#############################


if manInput == False:
    
    parModel = {"regType":regType, "poly":poly, "kernel":kernel, "treeCount":treeCount, "gamma":gamma }

    parEval = getParameters(parLabel,parRange)
    reg = selectRegression(**parModel)
    GS = GridSearchCV(reg, parEval)
    GS.fit(X,y)
    reg = GS.best_estimator_
    print(GS.best_estimator_)
    
else:
    reg = selectRegression(**parInput)
    
reg.fit(X,y)
rankPredict = reg.predict(Xtest)

print(np.transpose(np.vstack((dfDatasetTest['sequence'].values,dfDatasetTest['mean_score'].values,rankPredict))))
print(stats.spearmanr(dfDatasetTest['mean_score'].values,rankPredict))
plt.plot(dfDatasetTest['mean_score'].values,rankPredict, 'ro')

In [1]:
dfDatasetTest.style


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-8d224b371336> in <module>()
----> 1 dfDatasetTest.style

NameError: name 'dfDatasetTest' is not defined

In [68]:
##################################
### TEST SETS
plt.close('all')
###################################

#data input
dfDatasetTest = pd.read_csv("data/inbio_lib.csv")


#sequence range
seqRange = [-45,1]

#regions of interest (wrt 35- and 10-box)
ROI =  [[-7,12],[-8,11]]

#model selection and hyperparameters
regType = 'forestReg'
kernel = None
poly= None

#parameters to be evaluated
parLabel = ['max_depth']
parRange = [50] 

treeCount = 100
gamma = 0.1


parModelTweak =  {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.0001, "treeCount":treeCount, "alpha": 1.0, "coef0":0.1 } 

evaluateMutalik(dfDatasetTest, ROI, seqRange, parInput)


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15.0,
           max_features=None, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
[[ 'CTTCATTCTATAAGTTTCTTGACATCTTGGCCGGCATATGGTATAATAGGGAAATTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  7606.83 4.862701708999151]
 [ 'CTTACATGAAAAAGGTTCTTGACATTTTAAATCCATGTGGTATATGTCATTTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  7176.96 3.273334465022138]
 [ 'CTTTGTCGGAAAACATTCTTACATTTTCAGCCAATATGATATAATTCGCGGGTTACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  6852.0 3.610108296685446]
 [ 'GCATTGACGAAAACGTTCTTGACATGTTTTGCAGTTTATGGTATAATTTCGGAATTACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  5422.75 4.924092811617408]
 [ 'CTTTAGCCTAAATGGTTCTTGACATGTTTAGGCGGCTTGTGGTATAATTTTGGCGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  4857.16 4.91149335862847]
 [ 'GCTTGAGGTATAAAATTCTTGACATATTCCGTCGTTATGGTATAATGGTTCCATATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  4783.13 4.576701449968292]
 [ 'CTTGTTTATATATGGTTCTTGACATATTTCATCGCCTATGATATAATTCGGATATACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  4769.97 4.540532435447666]
 [ 'CTAGTGTCCAAATCTTTCTTGACATGTTGCTCACAATATGATATAATATGATTGTTACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  4702.94 5.024606740863461]
 [ 'CTAGTCTGGATTTACTTCTTGACATGTTGTCATCATTATGGTATAATGTTTAGGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  4338.84 5.044077271389852]
 [ 'CTTTCACTTAAAACTTTCTTGACATCTTTCGTTACATGTGCTATAATTTAGGCGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  4122.0 4.837097099949872]
 [ 'CTTTTATGTATATGCTTCTTACATATTACAGTTCATCTGGTATAATGTACCTGTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  4003.84 3.667891412701752]
 [ 'TTTGAGGACAAAAGGTTCTTGACATGTTCTAATGTTTATGCTATAATTTTTCGGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  3818.07 4.943041361264005]
 [ 'TTAATCAATATTAGTTTCTTGACATCTTCTCCCGCATATGATATAATTAGTGAGTTACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  3779.86 4.959996832152586]
 [ 'CATTGAAATATAACTTTCTTGACATCTTGCTCCTCATGTGATATAATGCATTGGTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  3535.71 4.509431507900614]
 [ 'CTTAGAAGGAATTTGTTCTTGACATGTTGGTGTTGATGTGGTATAATTTTTAGGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  3154.04 4.888812941822642]
 [ 'CATACAGTTAAATTCTTCTTGACATATTCGGGTAAATATGGTATAATGGGAGCATATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  3149.04 4.918740875509253]
 [ 'GCTTCATTTATAAATTTCTTGACATTTTGGAATAGATGTGATATAATGTGTACATATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  3129.22 5.049139224527675]
 [ 'ATTTTATGCAAATATTTCTTGACATTTTAACGAGAATGTGGTATAATTGGGTCGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  2996.55 4.87001337148525]
 [ 'CCTAGTCGGAATTCGTTCTTACATATTCGTTAGCTATGATATAATGCTTTTGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  2796.75 3.6988185060127545]
 [ 'GCTAGAGGTAAAAATTTCTTGACATGTTTGSGACGATATGTTATAATTATAAGGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  2546.92 4.88645438737119]
 [ 'CTTTTTGGAAATATTTTCTTGACATTTTGCCATGCATGTGATATAATGCGTGTGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  2498.88 5.01807416856084]
 [ 'TTAATGGGGAAAAGGTTCTTGACATTTTGGGCAAAATGTGATATAATTTATTTGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  2435.19 4.760141214949761]
 [ 'ATTTTATACAAATTTTTCTTGACATGTTGTGGGCTATGTGGTATAAGCGGAAGTAACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  2226.37 4.598316625003322]
 [ 'CTAGTGCGTAATAAGTTCTTGACATGTTATCGTGAATGTGATATAATTACGTTGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1958.75 4.838408601895756]
 [ 'ATATTTTAAATAAGATTCTTGACATCTTAGCTCGTATATGGTATAATACTGTAATATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1929.78 4.851341743406877]
 [ 'GCTTGAAGTAAATTATTCTTGACATATTATCCTATATGGTATAATAGTGCGGTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1925.57 4.5899365278039825]
 [ 'CGTTATTTTAAAATGTTCTTGACATCTTCGGGACGTTGTGATATAATGAACGGATTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1863.15 4.7810984686138776]
 [ 'GTAAAGTGCATCAGGTTCTTGACATGTTCCGCACTTTATGATATAATAGATCTGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1862.49 4.719209786271668]
 [ 'GCTAAAGTGATTAGGTTCTTGACATATTCTCGTGATGTGATATAATAGAAGAGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1320.98 4.372812375285003]
 [ 'GCTACCATGAATTATTTCTTGACATTTTGGGCCGTTGTGATATAATGGAGACATACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1288.43 4.5720501267681115]
 [ 'GCATAGTTTAATATTTTCTTGACATGTTTCTTGTAATGTGATATAATGAGCTGGTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1254.07 4.532677774507183]
 [ 'CTAGAAAGGAATTGTTTCTTGACATTTTCACAATTTATGGTATAATGGTCCAATAACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1225.65 4.510197035344558]
 [ 'GCTTATATGATTATGTTCTTGACATTTTCCACCTCATATGATATAATTGAAAAATACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1207.79 4.518972824099849]
 [ 'ATTCAATGCAAATCATTCTTGACATCTTCCGTTTATTGTGGTATAATGTAGGTGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  1158.7 4.784965359194024]
 [ 'ATTGAGTAGATAAGCTTCTTGACATCTTGGATGGATATGATATAATGTATGTGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  854.61 4.528912848408678]
 [ 'CTTTTTGGTAAAATTTTCTTGACATTTTCAACAGATGTGATATAATTGTCCTATAACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  826.21 4.517085059999931]
 [ 'TTACTGGACAAAATTTTCTTGACATATTTTTCGTTATATGATATAATGATGAGGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  787.37 4.860934669154106]
 [ 'TTACAAGGAATATTGTTCTTGACATCTTATCCTCTATGTGATATAATAACGACGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  784.45 4.853155174718352]
 [ 'TTAGAGTTGAACTAATTCTTGACATGTTGTACTGAATATGGTATAATGGTTATGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  740.84 4.945258472670968]
 [ 'CTTTTTTGGAATTGTTTCTTGACATGTTACACTATATCTGGTATAATGAGATGGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  710.97 4.847685245158881]
 [ 'TTTTTCAGCAAAATTTTCTTGACATCTTCGGGGTAATGTGGTATAATAGGACCATATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  659.77 4.812156025201406]
 [ 'TTATATGGTATAAGGTTCTTGACATGTTTCATGCAAATGTGATATAATGGTACGGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  647.6 4.784430105596471]
 [ 'CTTAATTGTATTTGTTTCTTGACATCTTCGTGACCATGTGATATAATGTATCAGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  611.05 4.91182238772428]
 [ 'GCTACTTCGATAACTTTCTTGACATGTTAAGCCCTTTGTGGTATAATAGCAAGGTAACCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  502.07 4.809690401754526]
 [ 'CTTTGTACTAAAAGTTTCTTGACATATTAACCCCTATGTGATATAATTTGTAGGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  487.24 4.963189043983071]
 [ 'CCCTATTAAATATAATTCTTGACATGTTGAGCCTCTTGTGGTATAATGGACGAGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  485.06 4.70158650505285]
 [ 'CGCTTGTGGAATATCTTCTTGACATATTCGAGTAGTATGTTATAATAATGGTGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  359.81 4.419475905081916]
 [ 'TTTGAGGACAAAAGGTTCTTGACATGTTCTAATGTTTATGCTATAATTTTTCGGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  327.81 4.943041361264005]
 [ 'GCTTTTGGCAATTGGTTCTTGACATGTTGGCAGTTTGTGATATAATGGGGTCGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  297.88 4.518287761604056]
 [ 'CGCTATATAAAATTGTTCTTGACATCTTGGCCAGTATATGATATAATGTCGTAGTTTCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  210.64 4.95415462692795]
 [ 'TTAGAAGGAATTTGGTTCTTACATATTCGCACGCATGTGATATAATGGGGTTATTTCATGGCGGCCGCTSTAGAAGAAGCTTGGGATCCGTCGACCTSGAATTCGGAGGAAACAAAGATG'
  154.69 3.8051767848290177]
 [ 'CATAATGTTAAATTGTTCTTGACATATTGATACCTTTGTGATATAATATTGCAGTATCCATGGCGGCCGCTCTAGAAGAAGCTTGGGATCCGTCGACCTCGAATTCGGAGGAAACAAAGATG'
  147.42 4.911942402078932]]
SpearmanrResult(correlation=0.037395146323431425, pvalue=0.792398672949432)
Out[68]:
[<matplotlib.lines.Line2D at 0x7fe1ff416400>]

In [3]:
##################################
### TEST SETS
plt.close('all')
###################################

#data input
dfDatasetTest = pd.read_csv("data/hammer_lib.csv")


#sequence range
seqRange = [-45,1]

#regions of interest (wrt 35- and 10-box)
ROI =  [[-7,14],[-8,11]]

#model selection and hyperparameters
regType = 'ridge'
kernel = 'poly'
poly= 3

#parameters to be evaluated
parLabel = ['alpha']
parRange = [50] 

treeCount = 100
gamma = 0.1


# Evaluate for best hyperparameters <-> Manual input
manInput = False
##

parInput = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.01, "treeCount":treeCount, "alpha": 0.01, "coef0":0.1 } 

evaluateMutalik(dfDatasetTest, ROI, seqRange, parInput)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-7b60bf0e2b4b> in <module>()
     33 parInput = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.01, "treeCount":treeCount, "alpha": 0.01, "coef0":0.1 }
     34 
---> 35 evaluateMutalik(dfDatasetTest, ROI, seqRange, parInput)
     36 
     37 

/home/jim/Doctoraat/github/prompred/prompred.py in evaluateMutalik(dfDatasetTest, ROI, seqRange, parModelTweak)
    239     ytest = labelsTest
    240 
--> 241     if manInput == False:
    242 
    243         parModel = {"regType":regType, "poly":poly, "kernel":kernel, "treeCount":treeCount}

NameError: name 'manInput' is not defined

In [32]:
plt.close('all')

#----PARAMETERS -----
Xdf = positionBox
X = XnoOut
y = YnoOut


regType = 'ridge'
poly= 3
gamma = 1
coef0 = 1

kernel = 'poly'
parLabel = ['alpha']
parRange = [15] 

testSize = 0.20 
k = 5 
kInner = 5
pdf = None
#---------------------------

parModel = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":gamma, "coef0":coef0}


if len(parLabel) is 1: 
    scoresParCV, optimalParCV, OutI, OutV = KfoldCV(X,y,k,parModel,parLabel[0],parRange[0]) 
    scoresParNCV, optimalParNCV, scoresNCV = nestedKfoldCV(X,y,k,kInner,parModel,parLabel[0],parRange[0])    
    meanScores = np.mean(np.ndarray.max(scoresParCV,axis=1))
    print("K FOLD CV \n---------- \n\n Maximum Score: ",np.max(scoresParCV), "\n Mean optimal score: ", meanScores ,"\n sd optimal scores: ", math.sqrt(np.sum(np.power((np.ndarray.max(scoresParCV,axis=1)-meanScores),2)))   , "\n Optimal parEval:\n", optimalParCV, "\n parEval Scores:\n", scoresParCV,"\n\n\n")
    print("NESTED K FOLD CV \n----------------- \n\n Maximum Score: ",np.max(scoresParNCV), "\n Mean optimal score: ", np.mean(scoresParNCV) ,"\n sd optimal scores: ", math.sqrt(np.sum(np.power((np.ndarray.max(scoresParCV,axis=1)-np.mean(scoresParNCV)),2))) , "\n Optimal parEval:\n", optimalParNCV, "\n parEval Scores:\n", scoresParNCV,"\n\n\n")
if len(parLabel) is 2:
    evaluateMultiPar(X, y, parModel, parLabel, parRange)
    
if (regType in ["ridge","lasso","OLS"] and poly is None): 
    reg = selectRegression(regType, alpha=np.median(optimalParCV), poly=poly, kernel=kernel) 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize) 
    reg.fit(X_train,y_train) 
    plotPositionCoefficients(reg, positionBox, posRange)


K FOLD CV 
---------- 

 Maximum Score:  0.8636674365 
 Mean optimal score:  0.82419440891 
 sd optimal scores:  0.056555113388595396 
 Optimal parEval:
 [  1000.  10000.   1000.   1000.  10000.] 
 parEval Scores:
 [[ 0.70339157  0.70339157  0.70339157  0.70339157  0.70339163  0.70339216  0.70339752  0.70345111  0.70398418
   0.70905269  0.74225203  0.78447419  0.71069898  0.42680514 -0.86323757]
 [ 0.77386117  0.77386117  0.77386117  0.77386117  0.7738612   0.77386151  0.77386461  0.77389557  0.77420363
   0.77713337  0.7966842   0.83691832  0.86366744  0.6795331  -0.44599599]
 [ 0.81285496  0.81285496  0.81285496  0.81285496  0.81285499  0.81285519  0.81285725  0.81287783  0.8130817   0.81494989
   0.8248565   0.83073847  0.79828143  0.54095783 -0.91545982]
 [ 0.81226191  0.81226191  0.81226191  0.81226192  0.81226193  0.81226204  0.81226322  0.81227492  0.81239128
   0.81348609  0.82016728  0.82095822  0.75534359  0.52605366 -0.71299687]
 [ 0.7616594   0.7616594   0.7616594   0.7616594   0.76165942  0.76165966  0.76166196  0.76168503  0.76191448
   0.76409362  0.77899515  0.81544258  0.82113371  0.54076512 -1.09292703]] 



NESTED K FOLD CV 
----------------- 

 Maximum Score:  0.8636674365 
 Mean optimal score:  0.82419440891 
 sd optimal scores:  0.056555113388595396 
 Optimal parEval:
 [[  1000.   1000.   1000.   1000.   1000.]
 [ 10000.  10000.  10000.  10000.  10000.]
 [  1000.   1000.   1000.   1000.   1000.]
 [  1000.   1000.   1000.   1000.   1000.]
 [ 10000.  10000.  10000.  10000.  10000.]] 
 parEval Scores:
 [ 0.78447419  0.86366744  0.83073847  0.82095822  0.82113371] 




In [ ]:


In [11]:
dfDatasetTest = querySQL('prom_test_data', 'SELECT  sequence, mean_score, 35boxstart, 10boxstart, ID  FROM hammer_prom_lib')

dfDatasetTest['sequence'] = dfDatasetTest['sequence'].str.upper()
datasetUnsorted = dfDatasetTest.values
dataset = datasetUnsorted[datasetUnsorted[:,1].argsort()][::-1]


sequences= dataset[:,0]
yData = dataset[:,1]
start35Box = dataset[:,2]
start10Box = dataset[:,3]

ID = dataset[:,4]

posRange = [-35,1]
XtestBox, spacerTest = regionSelect(dataset, [[0,10],[-6,12]], test=positionBox)
Xtest = XtestBox.values

regType = 'ridge'
poly= 3
gamma = 1
coef0 = 1
alpha = 1000
kernel = 'poly'

testSize = 0.20 
k = 5 
kInner = 5
pdf = None
#---------------------------

parModel = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":gamma, "coef0":coef0, "alpha":alpha}

reg = selectRegression(**parModel)
reg.fit(X,y)
rankPredict = reg.predict(Xtest)

print(np.transpose(np.vstack((sequences,yData,rankPredict))))

print(stats.spearmanr(yData,rankPredict))
print(stats.spearmanr(rankPredict,yData))


[['CTTTGGCAGTTTATTCTTGACATGTAGTGAGGGGGCTGGTATAATCACATAGTACTGTT' 528.0 5.708700659453884]
 ['CATGTGGGAGTTTATTCTTGACACAGATATTTCCGGATGATATAATAACTGAGTACTGTT' 280.0 5.708700659453884]
 ['TATGCGGTAGTTTATTCTTGACATGACGAGACAGGTGTGGTATAATGGGTCTAGATTAGG' 134.0 5.708700659453884]
 ['CATATACAAGTTTATTCTTGACACTAGTCGGCCAAAATGATATAATACCTGAGTACTGTT' 101.0 5.708700659453884]
 ['CATAGAGAAGTTTATTCTTGACAGCTAACTTGGCCTTTGATATAATACATGAGTACTGTT' 92.0 5.708700659453884]
 ['CATACGGGAGTTTATTCTTGACATATTGCCGGTGTGTTGGTATAATAACTTAGTACTGTT' 60.0 5.708700659453884]
 ['CATAGTCTAGTTTATTCTTGACACGCGGTCCATTGGCTGGTATAATAATTTAGTACTGTT' 56.0 5.708700659453884]
 ['CATGACAGAGTTTATTCTTGACAGTATTGGGTTACTTTGGTATAATAGTTGAGTACTGTT' 42.0 5.708700659453884]
 ['CATCGGGTAGTTTATTCTTGACAATTAAGTAGAGCCTGATATAATAGTTCAGTACTGTT' 34.0 5.399529980199178]
 ['CATGATGTAGTTTATTCTTGACACTGAGAGGGCCTCTTGATATAATAGTTGAGTACTGTT' 33.0 5.708700659453886]
 ['CATGGGTGAGTTTATTCTTGACAGTGCGGCCGGGGGCTGATATCATAGCAGAGTACTATT' 22.0 4.632584202374565]
 ['CATAGAACAGTTTATTCTTGACATTGAATAAGAAGGCTGATATAATAGCCAGTACTGTT' 19.0 5.708700659453884]
 ['CATCCGCAAGTTTATTCTTGACAGCTGAATGTAGACGTGGTATAATAGTTAAGTACTGTT' 18.0 5.708700659453884]
 ['CATCATTAAGTTTATTCTTCACATTGGCCGGAATTGTTGTATAATACCTTAGTACTGTT' 16.0 3.2941392288802342]
 ['CATTCTACAGTTTATTCTTGACATTGCACTGTCCCCCTGGTATAATAACTATACATGCAT' 10.0 5.708700659453884]
 ['CATTCTGGAGTTTATTCTTGACCGCTCAGTATGCAGTGGTATAATAGTACAGTACTGTT' 10.0 4.539338366463948]
 ['CATCGCGAAGTTTATTCTTCACACACCGCAGAACTTGTGGTATAATACAACAGTACTGTT' 8.4 3.7030504315575854]
 ['CATTCGTAAGTTTATTCTTGACACCTGAGATGAGGCGTGATATAATAAATAAGTACTGTT' 7.2 5.708700659453884]
 ['CATGTTGGAGTTTATTCTTGACATACAATTACTGCAGTGATATAATAGGTGAGTACTGTT' 7.0 5.708700659453885]
 ['CATCGCTTAGTTTTTCTTGACAGGAGGGATCCGGGTTGATATAATAGTTAGTACTGTT' 3.3 5.708700659453884]
 ['CATACCGGAGTTTATTCTTGACAGTTCCACCTCGGGTTGATATAATATCTCAGTACTGTT' 3.1 5.708700659453884]
 ['CATGGGTAAGTTTATTCTTCACACTATCTGGGCCCGATGGTATAATAAGTGACTACTGTT' 3.1 3.7030504315575854]
 ['CATTTGCTAGTTTATTCTTGACATGAAGCGTGCCTAATGGTATATTACTTGAGTACTGTT' 2.8 4.841378506053643]
 ['CATGTAGGAGTTTATTCTTGACAGATTAGTTAGGGGGTGGTATAATATCTCAGTACTGTT' 2.6 5.708700659453884]
 ['CATGGCTTAGTTTATTCTTGACAGGGTAGTATCACTGTGATATAATAGGACAGTACTGTT' 2.5 5.708700659453884]
 ['CATTACGTAGTTTATTCTTGACAGAATTACGATTCGCTGGTATAATATATCAGTACTGTT' 1.5 5.708700659453884]
 ['CATAAGTGAGTTTATTCTTGACCCGGACGCCCCCCTTTGATATAATAAGTAGTACTGTT' 1.4 4.721702362954467]
 ['CATCGGTAAGTTATTCTTGACATCTCAGGGGGGACGTGGTATAATAACTGAGTACTGTT' 1.4 5.708700659453885]
 ['GATGTTTTAGTTTATTCTTGACACCGTATCGTGCGCGTGATATAATCGGGATCCTTAAGA' 1.3 5.708700659453884]
 ['CATTCTTTAGTTTATTCTTGACAAACGTATTGAGGACTGATATAATAGGTGAGTACTGTT' 1.2 5.708700659453884]
 ['CATGCTTTACTTTATTCTTGACAAAACCACCAGCTTTTGGTATAATACGTGAGAACTGTT' 1.0 5.708700659453884]
 ['CATCCTGTAGTTTATTCTTGACACAAGTCGTTAGCTGTGGTATAATAGGAGAGGACTGTT' 0.9 5.708700659453884]
 ['CATGGGGCCGTTTATTCTTGACAACGGCGAGCAGACCTGGTATAATAATATAGTACTGTT' 0.9 5.708700659453884]
 ['CATTGCGAAGTTTATTCTTGACAGTACGTTTTTACCATGATATAATAGTATAGTACTGTT' 0.5 5.708700659453884]
 ['CATGACGGAGTTTATTCTTGACACAGGTATGGACTTATGATATAATAAAACAGTACTGTT' 0.3 5.708700659453884]
 ['CATTGTGTAGTTTATTCTTGACAGCTATGAGTCAATTTGGTATAATAACAGTACTCAG' 0.3 5.708700659453884]
 ['CATTTTGCAGTTTATTCTTGACATTGTGTGCTTCGGGTGTATAATACTAAGTACTGTT' 0.2 5.399529980199181]]
SpearmanrResult(correlation=0.004562755752165362, pvalue=0.97861792261200697)
SpearmanrResult(correlation=0.0045627557521653612, pvalue=0.97861792261200697)

In [121]:
""" LINEAR REGRESSION USING OTHER ELEMENTS

# Query sequences, fluorescence level, reference regions

resultset =  np.squeeze(querySQL('mutalik_prom_lib','SELECT sequence, mean_fluo, 35boxstart, 10boxstart FROM rand_prom_lib').values)

sequences= resultset[:,0]
score = resultset[:,1]
start35Box = resultset[:,2]
start10Box = resultset[:,3]

# Selecting regions

box10 = selectRegions(sequences, [0,6], start10Box)
box35 = selectRegions(sequences, [0,6], start35Box)
spacer = start10Box-start35Box-6

# Creating features


fractionA_10, fractionT_10, fractionC_10, fractionG_10 = extractATCGpercentage(box10)
fractionA_35, fractionT_35, fractionC_35, fractionG_35 = extractATCGpercentage(box35)

fractionGC = [(o+p+q+r)/4 for o,p,q,r in zip(fractionC_10,fractionG_10,fractionC_35,fractionG_35)]
fractionAT = [(o+p+q+r)/4 for o,p,q,r in zip(fractionA_10,fractionT_10,fractionA_35,fractionT_35)]


featureMatrix = np.transpose(np.array([fractionGC,spacer]))


#print(featureMatrix)

print(np.shape(spacer),np.shape(score))
# Fit linear regression

linReg = LinearRegression(fit_intercept=True, normalize=True)
linReg.fit(featureMatrix, score)

#linReg.fit(np.squeeze(spacer), np.squeeze(score))

print(linReg.coef_)
print(linReg.intercept_)


# Plot coefficients
index = np.arange(np.shape(featureMatrix)[1])
coefPlot = plt.bar(index, linReg.coef_, 0.10 )
intPlot = plt.bar(-1, linReg.intercept_, 0.10)
plt.axhline(0, color='black')
plt.axvline(0, color='black')


(138,) (138,)
[-311.19084743   68.32928228]
-1000.49146613
Out[121]:
<matplotlib.lines.Line2D at 0x7f2e4315e940>

In [ ]:
######################### EXPERIMENTAL GROUNDS ###############################################


"""
topScores= np.empty([8,8])
topCombinations = np.empty([8,8])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)
logIt = [math.pow(10,u) for u in range(-3,5)]
for i in logIt:
    index = int(math.log10(i)+3)
    combinations, results = bestSubsetSelection(X_train, X_test, y_train, y_test,2, 'ridge', alpha=i)
    print(results[0:8])
    print(combinations[0:8])"""

In [ ]: