This notebook, presentation, functions and data can be retrieved from GitHub:
git clone https://github.com/Kleurenprinter/prompred.git
The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.
In [2]:
import matplotlib.pyplot as plt
from prompred import *
import numpy as np
import math
import sklearn
import warnings
import pandas as pd
from scipy import stats  # imported explicitly: the evaluation cells call stats.spearmanr
from IPython.core.display import HTML
warnings.filterwarnings('ignore')
%matplotlib inline
sequence length: $[35,49]$
- varying spacer lengths
- up-element
Insulated promoters
In [4]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 90000.
// NOTE(review): presumably this keeps long cell outputs from being collapsed into a scroll box; confirm against the Jupyter front end in use.
IPython.OutputArea.auto_scroll_threshold = 90000;
In [1]:
# Load the training library (path is relative to the notebook's working directory).
dfDatasetOrder = pd.read_csv("../../data/mut_rand_mod_lib.csv")
# Shuffle the rows with a fixed seed so that a "Restart & Run All" reproduces
# the same row order (the original permutation was unseeded).
shuffleRng = np.random.RandomState(0)
dfDataset = dfDatasetOrder.reindex(shuffleRng.permutation(dfDatasetOrder.index))
# Preview: first five shuffled rows, first two columns.
dfDataset.iloc[:5, 0:2]
In [6]:
# Sequence range
seqRange = [-47, 1]
# Regions of interest (wrt 35- and 10-box)
ROI = [[-12, 14], [-8, 12]]
# Extract the selected regions from every sequence in the (shuffled) dataset.
labels, positionBox, spacer = regionSelect(dfDataset, ROI, seqRange)
# dfDataset has been shuffled, so its index labels no longer equal row positions:
# `['sequence'][0]` would fetch the row *labelled* 0, not the first row.
# Use positional access so the printed sequence matches positionBox.values[0].
# NOTE(review): assumes regionSelect keeps the row order of dfDataset; confirm.
print(dfDataset['sequence'].iloc[0], "\n", positionBox.values[0])
In [7]:
# Styled preview: first five rows, columns at positions 1, 2, 4 and 5, rendered at 12px.
(
    dfDataset
    .iloc[:5, [1, 2, 4, 5]]
    .style
    .set_properties(**{"font-size": "12px"})
)
Out[7]:
In [8]:
# Raw target values taken from the 'mean_score' column.
yData = dfDataset['mean_score']
# Fourth root of every score (square root applied twice), kept as a plain list.
yRooted = [math.sqrt(math.sqrt(u)) for u in yData]
plt.figure(num=None, figsize=(10, 4), dpi=80, facecolor='w', edgecolor='k')
# Left panel: distribution of the raw scores.
plt.subplot(121)
# `normed` has been removed from matplotlib; `density=True` is its replacement.
plt.hist(yData, 10, density=True)
plt.title('Histogram $fluorescence$ values')
# Right panel: distribution of the fourth-rooted scores.
plt.subplot(122)
plt.hist(yRooted, 10, density=True)
# Raw string so the LaTeX backslash in \sqrt is not parsed as a string escape.
plt.title(r'Histogram $\sqrt[4]{fluorescence}$ values')
Out[8]:
The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.
OLS
Parameters: coef0
ridge
Parameters: alpha, coef0
lasso
Parameters: alpha, coef0
forestReg, forestClass
Parameters: max_depth, max_features, min_samples
SVR, SVC
Parameters: alpha, gamma, coef0
Kernels: poly, RBF, sigmoid, ...
kernel ridge
Parameters: alpha, gamma, coef0
Kernels: poly, RBF, sigmoid, ...
In [9]:
# --- Model specification -------------------------------------------------
parModel = {
    "regType": 'ridge',
    "poly": 3,
    "kernel": "poly",
    "coef0": 1,
}

# --- Parameter(s) to be evaluated ----------------------------------------
parLabel = ['alpha']
parRange = [15]

# --- K-fold validation settings ------------------------------------------
testSize = 0.2
k = 5
kInner = 5

# --- Training data: selected-region features and fourth-rooted scores ----
X, y = positionBox.values, yRooted

# --- Run the K-fold cross-validation for the first listed parameter ------
scoresParCV, optimalParCV = KfoldCV(
    X, y, k, parModel, parLabel[0], parRange[0]
)
In [10]:
# Row-wise maximum of the score table (presumably one row per fold, one column
# per evaluated parameter value; confirm against KfoldCV).
optimalScoresCV = np.max(scoresParCV, axis=1)
meanScores = np.mean(optimalScoresCV)
# Standard deviation of those optimal scores. The original printed
# sqrt(sum((x - mean)^2)) without dividing by the number of scores,
# which is not a standard deviation.
sdScores = np.std(optimalScoresCV)
print("K FOLD CV \n---------- \n\n Maximum Score: ", np.max(scoresParCV),
      "\n Mean optimal score: ", meanScores,
      "\n sd optimal scores: ", sdScores,
      "\n Optimal parEval:\n", optimalParCV,
      "\n parEval Scores:\n", scoresParCV, "\n\n\n")
In [11]:
# Nested K-fold cross-validation: k outer folds, kInner inner folds, same model
# and evaluated parameter as the plain K-fold run.
scoresParNCV, optimalParNCV, scoresNCV = nestedKfoldCV(
    X,
    y,
    k,
    kInner,
    parModel,
    parLabel[0],
    parRange[0],
)
In [12]:
# Summary statistics of the nested-CV scores.
meanScoresNCV = np.mean(scoresParNCV)
# The original computed this spread from `scoresParCV` (the non-nested run) and
# never divided by the number of scores. Use the nested scores themselves, i.e.
# the same sample the reported mean is taken over.
# NOTE(review): assumes every entry of scoresParNCV is an optimal score; confirm against nestedKfoldCV.
sdScoresNCV = np.std(scoresParNCV)
print("NESTED K FOLD CV \n----------------- \n\n Maximum Score: ", np.max(scoresParNCV),
      "\n Mean optimal score: ", meanScoresNCV,
      "\n sd optimal scores: ", sdScoresNCV,
      "\n Optimal parEval:\n", optimalParNCV,
      "\n parEval Scores:\n", scoresParNCV, "\n\n\n")
The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.
In [13]:
# Load the Anderson library and upper-case its sequences in one chained expression.
dfDatasetTest = (
    pd.read_csv("data/anderson_lib.csv")
    .assign(sequence=lambda frame: frame['sequence'].str.upper())
)
# Preview: first five rows, first three columns.
dfDatasetTest.iloc[:5, :3]
Out[13]:
In [14]:
# Build the test features with the same regions of interest as the training set.
labelsTest, positionBoxTest, spacerTest = regionSelect(dfDatasetTest, ROI, seqRange)
Xtest = positionBoxTest.values
# Model settings used for all external test libraries below.
parInput = {"regType":'ridge', "poly":3, "kernel":'poly', "gamma":0.1, "alpha": 10000, "coef0":1}
reg = selectRegression(**parInput)
# Fit on the full training library, then predict the test library.
reg.fit(X,y)
rankPredict = reg.predict(Xtest)
# Rank agreement between measured and predicted scores.
# `stats` is scipy.stats, imported explicitly in the import cell.
print(stats.spearmanr(dfDatasetTest['mean_score'].values,rankPredict))
plt.plot(dfDatasetTest['mean_score'].values,rankPredict, 'ro')
plt.xlabel('True label')
# Trailing semicolon keeps the Text(...) repr out of the cell output.
plt.ylabel('Predicted label');
Out[14]:
In [15]:
# Load the Brewster library.
dfDatasetTest = pd.read_csv("data/brewster_lib.csv")
# Styled preview: first five rows, first three columns, rendered at 12px.
(
    dfDatasetTest
    .iloc[:5, :3]
    .style
    .set_properties(**{"font-size": "12px"})
)
Out[15]:
In [16]:
# Region features of the currently loaded test library.
labelsTest, positionBoxTest, spacerTest = regionSelect(dfDatasetTest, ROI, seqRange)
Xtest = positionBoxTest.values
# Predict with the already fitted model and compare against the measured scores.
rankPredict = reg.predict(Xtest)
measuredScores = dfDatasetTest['mean_score'].values
print(stats.spearmanr(measuredScores, rankPredict))
# Scatter plot: measured score on x, prediction on y.
plt.plot(measuredScores, rankPredict, 'ro')
plt.xlabel('True label')
plt.ylabel('Predicted label')
Out[16]:
In [65]:
# Load the Hammer library.
dfDatasetTest = pd.read_csv("data/hammer_lib.csv")
# Styled preview: first five rows, first three columns, rendered at 10px.
(
    dfDatasetTest
    .iloc[:5, :3]
    .style
    .set_properties(**{"font-size": "10px"})
)
Out[65]:
In [42]:
# Region features of the currently loaded test library.
labelsTest, positionBoxTest, spacerTest = regionSelect(dfDatasetTest, ROI, seqRange)
Xtest = positionBoxTest.values
# Predict with the already fitted model and report the rank correlation.
rankPredict = reg.predict(Xtest)
print(stats.spearmanr(dfDatasetTest['mean_score'].values,rankPredict))
plt.plot(dfDatasetTest['mean_score'].values,rankPredict, 'ro')
# Axis labels were missing here; added to match the other evaluation plots.
plt.xlabel('True label')
# Trailing semicolon keeps the Text(...) repr out of the cell output.
plt.ylabel('Predicted label');
Out[42]:
In [46]:
# Load the inbio library.
dfDatasetTest = pd.read_csv("data/inbio_lib.csv")
# Styled preview: first five rows, first three columns, rendered at 9px.
(
    dfDatasetTest
    .iloc[:5, :3]
    .style
    .set_properties(**{"font-size": "9px"})
)
Out[46]:
In [44]:
# Region features of the currently loaded test library.
labelsTest, positionBoxTest, spacerTest = regionSelect(dfDatasetTest, ROI, seqRange)
Xtest = positionBoxTest.values
# Predict with the already fitted model and compare against the measured scores.
rankPredict = reg.predict(Xtest)
measuredScores = dfDatasetTest['mean_score'].values
print(stats.spearmanr(measuredScores, rankPredict))
# Scatter plot: measured score on x, prediction on y.
plt.plot(measuredScores, rankPredict, 'ro')
plt.xlabel('True label')
plt.ylabel('Predicted label')
Out[44]:
In [49]:
# pandas is already imported in the first code cell; this re-import is redundant but harmless.
import pandas as pd

# Load the paired analysis table and show its first five rows.
dfDatasetTest = pd.read_csv("data/pairedDBanalysis.csv")
dfDatasetTest.head(5)
Out[49]:
In [ ]: