In [7]:
#'ridge,poly,3'.split(',')
import pandas as pd
In [9]:
# Empty Series: give an explicit dtype so pandas does not fall back to a
# default (and warn) when constructing an empty Series.
hu = pd.Series(dtype="float64")
In [13]:
# BUG FIX: Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement. (`hu` comes from an earlier cell.)
pd.concat([hu, pd.Series([1, 2, 3])])
Out[13]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [1]:
import matplotlib
import numpy as np
import math
import itertools
import sklearn
import warnings
import pandas as pd
import sklearn.linear_model
import matplotlib.pyplot as plt
from prompred import *
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
In [7]:
# Import data for model
dfDatasetOrder = pd.read_csv("data/mut_rand_mod_lib.csv")
# Shuffle
# NOTE(review): unseeded permutation -- row order (and any downstream CV split)
# differs on every run; seed np.random for reproducibility. TODO confirm intended.
dfDataset = dfDatasetOrder.reindex(np.random.permutation(dfDatasetOrder.index))
Compatible Models:
Parameters: coef0
Parameters: max_depth, max_features, min_samples
Parameters: alpha, coef0
Parameters: alpha, coef0
Parameters: alpha, gamma, coef0
Kernels: poly, ...
Parameters: alpha, gamma, coef0
Kernels: poly, ...
In [8]:
plt.close('all')
# sequence region (window relative to the reference position)
seqRange = [-42,1]
ROI = [[-5,14],[-8,12]]
#model selection and hyperparameters
regType = 'ridge'
kernel = 'poly'
poly= 3
# To be evaluated parameter(s)
parLabel = ['alpha']
parRange = [20]
## OPTIONAL ##
testSize = 0.2
k = 5        # outer CV folds
kInner = 5   # inner CV folds (nested CV)
pdf = None
treeCount = 10
gamma = 0.1
alpha = 0.1
coef0 = 0.00
penalty = 0.1
epsilon = 1.95
parModel = {"regType":regType, "poly":poly, "kernel":kernel, "treeCount":treeCount, "gamma":gamma, "coef0":coef0}
###############################################
####### DO NOT TOUCH ##########################
labels, positionBox, spacer = regionSelect(dfDataset, ROI , seqRange)
positionMatrix = positionBox.values
Xdf = positionBox
X = positionMatrix
# 4th-root transform compresses the dynamic range of the labels before fitting.
y = [math.sqrt(math.sqrt(u)) for u in labels]
# BUG FIX: the original tested `len(parLabel) is 1` / `is 2`. `is` compares
# object identity, which only works for small ints by CPython accident (and
# raises SyntaxWarning on modern Python). Use `==` for value comparison.
if len(parLabel) == 1:
    scoresParCV, optimalParCV = KfoldCV(X,y,k,parModel,parLabel[0],parRange[0])
    scoresParNCV, optimalParNCV, scoresNCV = nestedKfoldCV(X,y,k,kInner,parModel,parLabel[0],parRange[0])
    meanScores = np.mean(np.ndarray.max(scoresParCV,axis=1))
    print("K FOLD CV \n---------- \n\n Maximum Score: ",np.max(scoresParCV), "\n Mean optimal score: ", meanScores ,"\n sd optimal scores: ", math.sqrt(np.sum(np.power((np.ndarray.max(scoresParCV,axis=1)-meanScores),2))) , "\n Optimal parEval:\n", optimalParCV, "\n parEval Scores:\n", scoresParCV,"\n\n\n")
    print("NESTED K FOLD CV \n----------------- \n\n Maximum Score: ",np.max(scoresParNCV), "\n Mean optimal score: ", np.mean(scoresParNCV) ,"\n sd optimal scores: ", math.sqrt(np.sum(np.power((np.ndarray.max(scoresParCV,axis=1)-np.mean(scoresParNCV)),2))) , "\n Optimal parEval:\n", optimalParNCV, "\n parEval Scores:\n", scoresParNCV,"\n\n\n")
if len(parLabel) == 2:
    evaluateMultiPar(X, y, parModel, parLabel, parRange)
if (regType in ["ridge","lasso","OLS"] and poly is None):
    reg = selectRegression(regType, alpha=np.median(optimalParCV), poly=poly, kernel=kernel)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize)
    reg.fit(X_train,y_train)
    # NOTE(review): `posRange` is not defined in this cell (it only appears in a
    # later cell) -- hidden kernel state; TODO confirm, `seqRange` may be intended.
    plotPositionCoefficients(reg, positionBox, posRange)
In [10]:
##################################
### TEST SETS
plt.close('all')
###################################
#data input
dfDatasetTest = pd.read_csv("data/anderson_lib.csv")
#sequence range
seqRange = [-42,1]
#regions of interest (wrt 35- and 10-box)
ROI = [[0,12],[-12,12]]
#model selection and hyperparameters
regType = 'ridge'
kernel = 'poly'
# BUG FIX: the polynomial degree was the string '3'; every other cell uses the
# int 3, and a numeric degree is what a polynomial kernel expects.
poly= 3
#parameters to be evaluated
parLabel = ['alpha']
parRange = [20]
treeCount = 100
gamma = 0.1
# Evaluate for best hyperparameters (False) <-> Manual input (True)
manInput = True
parInput = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.1, "treeCount":treeCount, "alpha": 10000, "coef0":1}
#####################################################################################################
#####################################################################################################
# Import data for model
dfDatasetTrain = pd.read_csv("data/mut_rand_mod_lib.csv")
dfDatasetTest['sequence'] = dfDatasetTest['sequence'].str.upper()
# BUG FIX: DataFrame.sort(columns=...) was removed from pandas;
# sort_values(by=...) is the replacement.
dfDatasetTest = dfDatasetTest.sort_values(by='mean_score', ascending=False)
labelsTrain, positionBoxTrain, spacerTrain = regionSelect(dfDatasetTrain, ROI, seqRange)
labelsTest, positionBoxTest, spacerTest = regionSelect(dfDatasetTest, ROI, seqRange)
positionMatrixTrain = positionBoxTrain.values
positionMatrixTest = positionBoxTest.values
Xdf = positionBoxTrain
X = positionMatrixTrain
# 4th-root transform compresses the dynamic range of the labels before fitting.
y = [math.sqrt(math.sqrt(u)) for u in labelsTrain]
Xtest = positionMatrixTest
ytest = labelsTest
#############################
if manInput == False:
    # Grid-search the parameters listed in parLabel/parRange.
    parModel = {"regType":regType, "poly":poly, "kernel":kernel, "treeCount":treeCount, "gamma":gamma }
    parEval = getParameters(parLabel,parRange)
    reg = selectRegression(**parModel)
    GS = GridSearchCV(reg, parEval)
    GS.fit(X,y)
    reg = GS.best_estimator_
    print(GS.best_estimator_)
else:
    # Use the manually specified hyperparameters.
    reg = selectRegression(**parInput)
reg.fit(X,y)
rankPredict = reg.predict(Xtest)
print(np.transpose(np.vstack((dfDatasetTest['sequence'].values,dfDatasetTest['mean_score'].values,rankPredict))))
print(stats.spearmanr(dfDatasetTest['mean_score'].values,rankPredict))
plt.plot(dfDatasetTest['mean_score'].values,rankPredict, 'ro')
Out[10]:
In [ ]:
##################################
### TEST SETS
plt.close('all')
###################################
#data input
dfDatasetTest = pd.read_csv("data/brewster_lib.csv")
#sequence range
seqRange = [-42,1]
#regions of interest (wrt 35- and 10-box)
ROI = [[-5,13],[-8,12]]
#model selection and hyperparameters
regType = 'ridge'
kernel = 'poly'
# BUG FIX: the polynomial degree was the string '3'; every other cell uses the
# int 3, and a numeric degree is what a polynomial kernel expects.
poly= 3
#parameters to be evaluated
parLabel = ['alpha']
parRange = [20]
treeCount = 100
gamma = 0.1
# Evaluate for best hyperparameters (False) <-> Manual input (True)
manInput = True
parInput = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.1, "treeCount":treeCount, "alpha": 0.000001, "coef0":1 }
#####################################################################################################
#####################################################################################################
# Import data for model
dfDatasetTrain = pd.read_csv("data/mut_rand_mod_lib.csv")
dfDatasetTest['sequence'] = dfDatasetTest['sequence'].str.upper()
# BUG FIX: DataFrame.sort(columns=...) was removed from pandas;
# sort_values(by=...) is the replacement.
dfDatasetTest = dfDatasetTest.sort_values(by='mean_score', ascending=False)
labelsTrain, positionBoxTrain, spacerTrain = regionSelect(dfDatasetTrain, ROI, seqRange)
labelsTest, positionBoxTest, spacerTest = regionSelect(dfDatasetTest, ROI, seqRange)
positionMatrixTrain = positionBoxTrain.values
positionMatrixTest = positionBoxTest.values
Xdf = positionBoxTrain
X = positionMatrixTrain
# 4th-root transform compresses the dynamic range of the labels before fitting.
y = [math.sqrt(math.sqrt(u)) for u in labelsTrain]
Xtest = positionMatrixTest
ytest = labelsTest
#############################
if manInput == False:
    # Grid-search the parameters listed in parLabel/parRange.
    parModel = {"regType":regType, "poly":poly, "kernel":kernel, "treeCount":treeCount, "gamma":gamma }
    parEval = getParameters(parLabel,parRange)
    reg = selectRegression(**parModel)
    GS = GridSearchCV(reg, parEval)
    GS.fit(X,y)
    reg = GS.best_estimator_
    print(GS.best_estimator_)
else:
    # Use the manually specified hyperparameters.
    reg = selectRegression(**parInput)
reg.fit(X,y)
rankPredict = reg.predict(Xtest)
print(np.transpose(np.vstack((dfDatasetTest['sequence'].values,dfDatasetTest['mean_score'].values,rankPredict))))
print(stats.spearmanr(dfDatasetTest['mean_score'].values,rankPredict))
plt.plot(dfDatasetTest['mean_score'].values,rankPredict, 'ro')
In [1]:
# Rich HTML display of the test DataFrame via the pandas Styler.
# NOTE(review): relies on dfDatasetTest from an earlier cell (hidden kernel state).
dfDatasetTest.style
In [68]:
##################################
### TEST SETS
plt.close('all')
###################################
#data input
dfDatasetTest = pd.read_csv("data/inbio_lib.csv")
#sequence range
seqRange = [-45,1]
#regions of interest (wrt 35- and 10-box)
ROI = [[-7,12],[-8,11]]
#model selection and hyperparameters
regType = 'forestReg'
kernel = None
poly= None
#parameters to be evaluated
parLabel = ['max_depth']
parRange = [50]
treeCount = 100
gamma = 0.1
parModelTweak = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.0001, "treeCount":treeCount, "alpha": 1.0, "coef0":0.1 }
# BUG FIX: this cell built `parModelTweak` but then called evaluateMutalik with
# `parInput`, a stale dict left in the kernel by an earlier cell (hidden state).
# Pass the dict constructed in THIS cell so the forest settings actually apply.
evaluateMutalik(dfDatasetTest, ROI, seqRange, parModelTweak)
Out[68]:
In [3]:
##################################
### TEST SETS
plt.close('all')
###################################
#data input
dfDatasetTest = pd.read_csv("data/hammer_lib.csv")
#sequence range
seqRange = [-45,1]
#regions of interest (wrt 35- and 10-box)
ROI = [[-7,14],[-8,11]]
#model selection and hyperparameters
regType = 'ridge'
kernel = 'poly'
poly= 3
#parameters to be evaluated
parLabel = ['alpha']
parRange = [50]
treeCount = 100
gamma = 0.1
# Evaluate for best hyperparameters <-> Manual input
# NOTE(review): manInput, parLabel and parRange are never read in this cell --
# evaluateMutalik is called directly with parInput. TODO confirm whether a
# grid-search path (as in the other test-set cells) was intended here.
manInput = False
##
parInput = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":0.01, "treeCount":treeCount, "alpha": 0.01, "coef0":0.1 }
evaluateMutalik(dfDatasetTest, ROI, seqRange, parInput)
In [32]:
plt.close('all')
#----PARAMETERS -----
# NOTE(review): XnoOut / YnoOut (presumably outlier-filtered data) are not
# defined in any visible cell -- hidden kernel state; TODO confirm their origin.
Xdf = positionBox
X = XnoOut
y = YnoOut
regType = 'ridge'
poly= 3
gamma = 1
coef0 = 1
kernel = 'poly'
parLabel = ['alpha']
parRange = [15]
testSize = 0.20
k = 5
kInner = 5
pdf = None
#---------------------------
parModel = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":gamma, "coef0":coef0}
# BUG FIX: the original tested `len(parLabel) is 1` / `is 2`; `is` compares
# object identity, not value. Use `==`.
# NOTE(review): KfoldCV is unpacked into 4 values here but 2 in the earlier CV
# cell -- TODO confirm its signature.
if len(parLabel) == 1:
    scoresParCV, optimalParCV, OutI, OutV = KfoldCV(X,y,k,parModel,parLabel[0],parRange[0])
    scoresParNCV, optimalParNCV, scoresNCV = nestedKfoldCV(X,y,k,kInner,parModel,parLabel[0],parRange[0])
    meanScores = np.mean(np.ndarray.max(scoresParCV,axis=1))
    print("K FOLD CV \n---------- \n\n Maximum Score: ",np.max(scoresParCV), "\n Mean optimal score: ", meanScores ,"\n sd optimal scores: ", math.sqrt(np.sum(np.power((np.ndarray.max(scoresParCV,axis=1)-meanScores),2))) , "\n Optimal parEval:\n", optimalParCV, "\n parEval Scores:\n", scoresParCV,"\n\n\n")
    print("NESTED K FOLD CV \n----------------- \n\n Maximum Score: ",np.max(scoresParNCV), "\n Mean optimal score: ", np.mean(scoresParNCV) ,"\n sd optimal scores: ", math.sqrt(np.sum(np.power((np.ndarray.max(scoresParCV,axis=1)-np.mean(scoresParNCV)),2))) , "\n Optimal parEval:\n", optimalParNCV, "\n parEval Scores:\n", scoresParNCV,"\n\n\n")
if len(parLabel) == 2:
    evaluateMultiPar(X, y, parModel, parLabel, parRange)
if (regType in ["ridge","lasso","OLS"] and poly is None):
    reg = selectRegression(regType, alpha=np.median(optimalParCV), poly=poly, kernel=kernel)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize)
    reg.fit(X_train,y_train)
    # NOTE(review): posRange is defined only in a later cell -- hidden state.
    plotPositionCoefficients(reg, positionBox, posRange)
In [ ]:
In [11]:
# Query test promoters (sequence, score, box reference positions) from SQL.
# NOTE(review): column names starting with a digit (35boxstart, 10boxstart)
# usually need quoting in SQL -- TODO confirm querySQL handles this.
dfDatasetTest = querySQL('prom_test_data', 'SELECT sequence, mean_score, 35boxstart, 10boxstart, ID FROM hammer_prom_lib')
dfDatasetTest['sequence'] = dfDatasetTest['sequence'].str.upper()
datasetUnsorted = dfDatasetTest.values
# Sort rows by mean_score (column 1), descending.
dataset = datasetUnsorted[datasetUnsorted[:,1].argsort()][::-1]
sequences= dataset[:,0]
yData = dataset[:,1]
start35Box = dataset[:,2]
start10Box = dataset[:,3]
ID = dataset[:,4]
posRange = [-35,1]
# NOTE(review): regionSelect is unpacked into 2 values here but 3 elsewhere in
# this notebook, and `positionBox` comes from an earlier cell (hidden kernel
# state) -- TODO confirm the `test=` signature.
XtestBox, spacerTest = regionSelect(dataset, [[0,10],[-6,12]], test=positionBox)
Xtest = XtestBox.values
regType = 'ridge'
poly= 3
gamma = 1
coef0 = 1
alpha = 1000
kernel = 'poly'
testSize = 0.20
k = 5
kInner = 5
pdf = None
#---------------------------
parModel = {"regType":regType, "poly":poly, "kernel":kernel, "gamma":gamma, "coef0":coef0, "alpha":alpha}
reg = selectRegression(**parModel)
# NOTE(review): X and y come from an earlier cell (hidden kernel state).
reg.fit(X,y)
rankPredict = reg.predict(Xtest)
print(np.transpose(np.vstack((sequences,yData,rankPredict))))
# FIX: Spearman correlation is symmetric, so the original second call with the
# arguments swapped printed an identical result; the duplicate is removed.
print(stats.spearmanr(yData,rankPredict))
In [121]:
""" LINEAR REGRESSION USING OTHER ELEMENTS
# Query sequences, fluorescence level, reference regions
resultset = np.squeeze(querySQL('mutalik_prom_lib','SELECT sequence, mean_fluo, 35boxstart, 10boxstart FROM rand_prom_lib').values)
sequences= resultset[:,0]
score = resultset[:,1]
start35Box = resultset[:,2]
start10Box = resultset[:,3]
# Selecting regions
box10 = selectRegions(sequences, [0,6], start10Box)
box35 = selectRegions(sequences, [0,6], start35Box)
spacer = start10Box-start35Box-6
# Creating features
fractionA_10, fractionT_10, fractionC_10, fractionG_10 = extractATCGpercentage(box10)
fractionA_35, fractionT_35, fractionC_35, fractionG_35 = extractATCGpercentage(box35)
fractionGC = [(o+p+q+r)/4 for o,p,q,r in zip(fractionC_10,fractionG_10,fractionC_35,fractionG_35)]
fractionAT = [(o+p+q+r)/4 for o,p,q,r in zip(fractionA_10,fractionT_10,fractionA_35,fractionT_35)]
featureMatrix = np.transpose(np.array([fractionGC,spacer]))
#print(featureMatrix)
print(np.shape(spacer),np.shape(score))
# Fit linear regression
linReg = LinearRegression(fit_intercept=True, normalize=True)
linReg.fit(featureMatrix, score)
#linReg.fit(np.squeeze(spacer), np.squeeze(score))
print(linReg.coef_)
print(linReg.intercept_)
# Plot coefficients
index = np.arange(np.shape(featureMatrix)[1])
coefPlot = plt.bar(index, linReg.coef_, 0.10 )
intPlot = plt.bar(-1, linReg.intercept_, 0.10)
plt.axhline(0, color='black')
plt.axvline(0, color='black')
Out[121]:
In [ ]:
######################### EXPERIMENTAL GROUNDS ###############################################
"""
topScores= np.empty([8,8])
topCombinations = np.empty([8,8])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)
logIt = [math.pow(10,u) for u in range(-3,5)]
for i in logIt:
index = int(math.log10(i)+3)
combinations, results = bestSubsetSelection(X_train, X_test, y_train, y_test,2, 'ridge', alpha=i)
print(results[0:8])
print(combinations[0:8])"""
In [ ]: