In [13]:
a = np.array([1, -1, 1, -1])
b = np.array([1, 1, -1, 1])
np.sum(a!=b)
Out[13]:
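The two label vectors disagree at three of the four positions, so np.sum(a != b) evaluates to 3 here. This disagreement count is the building block of the pairwise Kendall tau score defined in the next cell.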
In [40]:
import itertools
def KendallTau(y_pred, y_true):
    a = np.array(y_pred)
    b = np.array(y_true)
    n = len(y_pred)
    score = (np.sum(a == b) - np.sum(a != b))/n
    return score
In [41]:
a = np.array([1, 10, 100, 1000, 10000, 10000000])
b = np.array([1, 10, 100, 1000, 10000, 10000000])
def CreateRankedLabels(a):
    pw = list(itertools.combinations(a, 2))
    labels = [1 if item[0] > item[1] else -1 for item in pw]
    return labels
a_labels = CreateRankedLabels(a)
b_labels = CreateRankedLabels(b)
KendallTau(a_labels,b_labels)
Out[41]:
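For the two identical rankings above every pairwise label agrees, so the score is 1.0. As a quick sanity check (a minimal sketch, assuming the cells above have been run), the pairwise score can also be compared with SciPy's Kendall tau on the raw values; without ties the two coincide:

from scipy import stats

x = np.array([1, 3, 2, 5, 4])
y = np.array([1, 2, 3, 4, 5])

# Pairwise labels: +1 if the first element of a pair is larger, -1 otherwise.
x_pw = CreateRankedLabels(x)
y_pw = CreateRankedLabels(y)

print(KendallTau(x_pw, y_pw))     # (agreements - disagreements) / n_pairs = 0.6
print(stats.kendalltau(x, y)[0])  # 0.6 as well, since there are no ties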
In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import math
import itertools
from scipy import stats
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import PolynomialFeatures
In [6]:
%%writefile ../../src/models/model_utils.py
# %load ../../src/models/model_utils.py
# %%writefile ../../src/models/model_utils.py
"""
Author: Jim Clauwaert
Created in the scope of my PhD
"""
import pandas as pd
import numpy as np
import sklearn as sk
import math
import itertools
from scipy import stats
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

def CreateRankedLabels(a):
    pw = list(itertools.combinations(a, 2))
    labels = [1 if item[0] > item[1] else -1 for item in pw]
    return labels

def GetParameterSet(parLabel, parRange):
    """Retrieve a set of parameter values used for training of a model in sklearn.

    Parameters
    -----------
    parLabel : 1-dimensional numpy array (str)
        numpy array holding a set of parameter labels. Valid labels include:
        [alpha, gamma, C, coef0, epsilon, max_depth, min_samples_split, max_features]
    parRange : 1-dimensional numpy array (int)
        numpy array with the amount of parameter values returned for every parameter label.
        parLabel and parRange must be of the same dimension.

    Returns
    --------
    parSet : Dictionary
        Dictionary containing a set of parameter values for every label
    """
    intLabels = ['max_depth', 'min_samples_split', 'max_features']
    # Integer-valued parameters get an integer array, the remaining ones a float array.
    parameters = [np.zeros(parRange[u], dtype=int) if parLabel[u] in intLabels
                  else np.zeros(parRange[u]) for u in range(len(parRange))]
    for i in range(len(parLabel)):
        if parLabel[i] in ("alpha", "gamma", "C", "coef0"):
            # Logarithmic grid centred around 1, e.g. 1e-5 ... 1e4 for 10 values.
            parameters[i][:] = [math.pow(10, (u - np.around(parRange[i]/2))) for u in range(parRange[i])]
        elif parLabel[i] == "epsilon":
            # Linear grid on [0, 2).
            parameters[i][:] = [0 + 2/parRange[i]*u for u in range(parRange[i])]
        elif parLabel[i] == "max_depth":
            parameters[i][:] = [int(u+1) for u in range(parRange[i])]
        elif parLabel[i] in ('min_samples_split', 'max_features'):
            parameters[i][:] = [int(u+2) for u in range(parRange[i])]
        else:
            print("Not a valid parameter")
            return None
    parSet = {parLabel[u]: parameters[u] for u in range(len(parLabel))}
    return parSet

def EvaluateParameterSet(X_train, X_test, y_train, y_test, parModel, parSet):
    """Evaluate the scores of a set of parameter values for a given model.

    Parameters
    -----------
    X_train:
        Training dataset features
    X_test:
        Test dataset features
    y_train:
        Training dataset labels
    y_test:
        Test dataset labels
    parModel: Dictionary
        Parameters describing the model (see SelectModel)
    parSet : Dictionary
        Dictionary holding a single parameter label and the values over which the model has to be
        evaluated. This can be created through the function GetParameterSet.
        Accepted keys are:
        [alpha, gamma, C, coef0, epsilon, max_depth, min_samples_split, max_features]

    Returns
    --------
    scores: 1-dimensional numpy array (float)
        Fitted scores of the model for each of the parameter values
    optimalPar: float
        Optimal parameter value for the given parameter label
    """
    # parSet is assumed to hold exactly one parameter label.
    parLabel = list(parSet.keys())[0]
    scores = np.array([])
    for i in range(len(parSet[parLabel])):
        parSetIt = {parLabel: parSet[parLabel][i]}
        model = SelectModel(**parModel, **parSetIt)
        model.fit(X_train, y_train)
        scores = np.append(scores, model.score(X_test, y_test))
    optimalPar = parSet[parLabel][np.argmax(scores)]
    return scores, optimalPar

def EvaluateScore(X_train, X_test, y_train, y_test, parModel, scoring='default', pw=False):
    """Evaluates the score of a model for given training and test data.

    Parameters
    -----------
    X_train, X_test: DataFrame
        Training and test data of the features
    y_train, y_test: 1-dimensional numpy array
        Training and test data of the labels
    parModel: dictionary
        Parameters indicating the model and some of its features

    Returns
    --------
    score: float
        Score of the test data on the model
    y_pred: 1-dimensional array
        An array giving the predicted labels for the given test set
    """
    model = SelectModel(**parModel)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    if scoring == 'default':
        score = model.score(X_test, y_test)
    elif scoring == 'kt':
        if pw is True:
            score = KendallTau(y_pred, y_test)
        else:
            y_pred_pw = CreateRankedLabels(y_pred)
            y_test_pw = CreateRankedLabels(y_test)
            score = KendallTau(y_pred_pw, y_test_pw)
    elif scoring == 'spearman':
        score = stats.spearmanr(y_test, y_pred)[0]
    else:
        raise ValueError("Scoring type not defined. Possible options are: 'default', 'kt', and 'spearman'")
    return score, y_pred

def KendallTau(y_pred, y_true):
    """Kendall tau-like score for two vectors of pairwise (+1/-1) labels."""
    a = np.array(y_pred)
    b = np.array(y_true)
    n = len(y_pred)
    score = (np.sum(a == b) - np.sum(a != b))/n
    return score

def LearningCurveInSample(dfDataset, featureBox, y, parModel, scoring='default', k=5, pw=False, step=1):
    """Calculates the in-sample learning curve of a dataset for a given model.

    Parameters
    -----------
    dfDataset: DataFrame
        DataFrame holding the sequences and their IDs
    featureBox: DataFrame
        Dataset features
    y: 1-dimensional numpy array
        Dataset labels
    parModel: Dictionary
        Parameters indicating the model and some of its features
    scoring: str
        Scoring method passed to EvaluateScore
    k: int
        Number of cross-validation folds
    pw: Boolean
        Whether the dataset consists of pairwise samples (ID_1/ID_2 columns)
    step: int
        Number of IDs added to the training set at every iteration

    Returns
    --------
    scores: 2-dimensional numpy array (k x number of steps)
        Score at every step for each of the k folds
    """
    X = featureBox.values
    if pw is True:
        temp = np.unique(dfDataset[['ID_1', 'ID_2']].values)
        dfId = pd.Series(temp[:-(len(temp) % k)])
    else:
        dfId = dfDataset['ID'][:-(len(dfDataset) % k)]
    lenId = len(dfId)
    Id = dfId.values
    indexId = np.array(range(lenId))
    scores = np.array([])
    it = 0
    for i in range(k):
        boolTest = np.logical_and(indexId >= i*lenId/k, indexId < (i+1)*lenId/k)
        test = Id[boolTest]
        train = Id[np.invert(boolTest)]
        if pw is True:
            indexTest = (dfDataset['ID_1'].isin(test) | dfDataset['ID_2'].isin(test)).values
        else:
            indexTest = dfDataset['ID'].isin(test).values
        dfDatasetTrain = dfDataset[np.invert(indexTest)]
        X_train, y_train = featureBox[np.invert(indexTest)], y[np.invert(indexTest)]
        X_test, y_test = featureBox[indexTest], y[indexTest]
        for j in range((len(train)-5)//step):
            print("\rProgress {:2.1%}".format(it/k + (j/len(train)/k*step)), end='')
            trainInner = train[:(j*step)+5]
            if pw is True:
                indexTrainInner = (dfDatasetTrain['ID_1'].isin(trainInner) & dfDatasetTrain['ID_2'].isin(trainInner)).values
            else:
                indexTrainInner = (dfDatasetTrain['ID'].isin(trainInner)).values
            X_trainInner, y_trainInner = X_train[indexTrainInner], y_train[indexTrainInner]
            score, y_pred = EvaluateScore(X_trainInner, X_test, y_trainInner, y_test, {**parModel}, scoring, pw)
            scores = np.append(scores, score)
        it += 1
    scores = scores.reshape((k, -1))
    return scores

def LearningCurveInSampleEnriched(dfDataset, featureBox, enrichBox, y, y_enrich, parModel,
                                  scoring='default', k=5, pw=True, step=1):
    """Calculates the in-sample learning curve of an enriched dataset for a given model.

    Parameters
    -----------
    dfDataset: DataFrame
        DataFrame holding the sequences and their IDs
    featureBox: DataFrame
        Dataset features
    enrichBox: numpy array
        Features of the enrichment dataset, always included in the training set
    y: 1-dimensional numpy array
        Dataset labels
    y_enrich: 1-dimensional numpy array
        Labels of the enrichment dataset
    parModel: Dictionary
        Parameters indicating the model and some of its features
    scoring: str
        Scoring method passed to EvaluateScore
    k: int
        Number of cross-validation folds
    pw: Boolean
        Whether the dataset consists of pairwise samples (ID_1/ID_2 columns)
    step: int
        Number of IDs added to the training set at every iteration

    Returns
    --------
    scores: 2-dimensional numpy array (k x number of steps)
        Score at every step for each of the k folds
    """
    if pw is True:
        temp = np.unique(dfDataset[['ID_1', 'ID_2']].values)
        dfId = pd.Series(temp[:-(len(temp) % k)])
    else:
        dfId = dfDataset['ID'][:-(len(dfDataset) % k)]
    lenId = len(dfId)
    Id = dfId.values
    indexId = np.array(range(lenId))
    scores = np.array([])
    it = 0
    for i in range(k):
        boolTest = np.logical_and(indexId >= i*lenId/k, indexId < (i+1)*lenId/k)
        test = Id[boolTest]
        train = Id[np.invert(boolTest)]
        if pw is True:
            indexTest = (dfDataset['ID_1'].isin(test) | dfDataset['ID_2'].isin(test)).values
        else:
            indexTest = dfDataset['ID'].isin(test).values
        dfDatasetTrain = dfDataset[np.invert(indexTest)]
        X_train = featureBox[np.invert(indexTest)]
        y_train = y[np.invert(indexTest)]
        X_test, y_test = featureBox[indexTest], y[indexTest]
        for j in range((len(train))//step):
            print("\rProgress {:2.1%}".format(it/k + (j/len(train)/k*step)), end='')
            trainInner = train[:(j*step)]
            if pw is True:
                indexTrainInner = (dfDatasetTrain['ID_1'].isin(trainInner) & dfDatasetTrain['ID_2'].isin(trainInner)).values
            else:
                indexTrainInner = (dfDatasetTrain['ID'].isin(trainInner)).values
            # The enrichment data is stacked on top of the growing in-sample training set.
            X_trainInner = np.vstack((enrichBox, X_train[indexTrainInner]))
            y_trainInner = np.append(y_enrich, y_train[indexTrainInner])
            score, y_pred = EvaluateScore(X_trainInner, X_test, y_trainInner, y_test, {**parModel}, scoring, pw)
            scores = np.append(scores, score)
        it += 1
    scores = scores.reshape((k, -1))
    return scores

def LearningCurveOutOfSample(dfDataset, featureBox, y, dataList, parModel, scoring='default', pw=False, step=1):
    """Calculates the out-of-sample learning curve of a dataset for a given model.

    Parameters
    -----------
    dfDataset: DataFrame
        DataFrame holding the sequences and their IDs
    featureBox: DataFrame
        Training dataset features
    y: 1-dimensional numpy array
        Training dataset labels
    dataList: list
        List of (DataFrame, featureBox, labels) tuples for the out-of-sample datasets
    parModel: Dictionary
        Parameters indicating the model and some of its features
    scoring: str
        Scoring method passed to EvaluateScore
    pw: Boolean
        Whether the dataset consists of pairwise samples (ID_1/ID_2 columns)
    step: int
        Number of IDs added to the training set at every iteration

    Returns
    --------
    scores: 2-dimensional numpy array (number of datasets x number of steps)
        Score at every step for each of the out-of-sample datasets
    """
    if pw is True:
        temp = np.unique(dfDataset[['ID_1', 'ID_2']].values)
        dfId = pd.Series(temp)
    else:
        dfId = dfDataset['ID']
    lenId = len(dfId)
    Id = dfId.values
    indexId = np.array(range(lenId))
    scores = np.zeros(shape=(len(dataList), (lenId-5)//step))
    for i in range((lenId-5)//step):
        print("\rProgress {:2.1%}".format(i/lenId*step), end='')
        train = Id[:((i*step)+5)]
        if pw is True:
            indexTrain = (dfDataset['ID_1'].isin(train) & dfDataset['ID_2'].isin(train)).values
        else:
            indexTrain = dfDataset['ID'].isin(train).values
        X_train, y_train = featureBox[indexTrain], y[indexTrain]
        for j in range(len(dataList)):
            score, y_pred = EvaluateScore(X_train, dataList[j][1].values, y_train, dataList[j][2],
                                          {**parModel}, scoring, pw)
            scores[j, i] = score
    return scores

def LearningCurveOutOfSampleEnriched(dfDataset, featureBox, enrichBox, y, y_enrich, dataOutList,
                                     parModel, scoring='default', pw=True, step=1):
    """Calculates the out-of-sample learning curve of an enriched dataset for a given model."""
    if pw is True:
        temp = np.unique(dfDataset[['ID_1', 'ID_2']].values)
        dfId = pd.Series(temp)
    else:
        dfId = dfDataset['ID']
    lenId = len(dfId)
    Id = dfId.values
    indexId = np.array(range(lenId))
    scores = np.zeros(shape=(len(dataOutList), (lenId)//step))
    for i in range((lenId)//step):
        print("\rProgress {:2.1%}".format(i/lenId*step), end='')
        train = Id[:(i*step)]
        if pw is True:
            indexTrain = (dfDataset['ID_1'].isin(train) & dfDataset['ID_2'].isin(train)).values
        else:
            indexTrain = dfDataset['ID'].isin(train).values
        # The enrichment data is stacked on top of the growing training set.
        X_train = np.vstack((enrichBox, featureBox[indexTrain]))
        y_train = np.append(y_enrich, y[indexTrain])
        for j in range(len(dataOutList)):
            score, y_pred = EvaluateScore(X_train, dataOutList[j][1].values, y_train, dataOutList[j][2],
                                          {**parModel}, scoring, pw)
            # Scores are only recorded for pairwise data.
            if pw is True:
                scores[j, i] = score
    return scores

def SelectModel(modelType, poly=None, kernel=None, alpha=0.1, gamma=0.1, epsilon=0.1, coef0=1,
                fitInt=True, normalize=True, max_depth=None, max_features=None, min_samples_split=2,
                n_estimators=50, C=1, n_jobs=12):
    """Initializes the correct model for a given set of parameters.

    Parameters
    -----------
    modelType: str
        Type of model. Possible values are:
        ['ridge', 'SVC', 'SVR', 'OLS', 'lasso', 'huber', 'treeReg', 'treeClass', 'forestReg', 'forestClass']

    Other parameters include (further information can be found in the sklearn documentation):
        poly: int
        kernel: str
        alpha: float
        gamma: float
        epsilon: float
        coef0: float
        fitInt: Bool
        normalize: Bool
        max_depth: int
        max_features: int
        min_samples_split: int
        n_estimators: int
        C: float
        n_jobs: int

    Returns
    -------
    model: Class
        sklearn-type model
    """
    if kernel:
        if modelType == "ridge":
            model = KernelRidge(alpha=alpha, gamma=gamma, kernel=kernel, coef0=coef0)
        if modelType == "SVC":
            model = SVC(C=C, kernel=kernel, gamma=gamma, coef0=coef0, degree=poly)
        if modelType == "SVR":
            model = SVR(C=C, kernel=kernel, gamma=gamma, coef0=coef0, epsilon=epsilon, degree=poly)
    elif poly:
        if modelType == "OLS":
            model = make_pipeline(PolynomialFeatures(poly), LinearRegression(fit_intercept=fitInt, normalize=normalize))
        if modelType == "ridge":
            model = make_pipeline(PolynomialFeatures(poly), Ridge(alpha=alpha, normalize=normalize))
        if modelType == "lasso":
            model = make_pipeline(PolynomialFeatures(poly), Lasso(alpha=alpha, normalize=normalize))
        if modelType == "huber":
            model = make_pipeline(PolynomialFeatures(poly), HuberRegressor(fit_intercept=fitInt, epsilon=epsilon, alpha=alpha))
    else:
        if modelType == "OLS":
            model = LinearRegression(fit_intercept=fitInt, normalize=normalize)
        if modelType == "ridge":
            model = Ridge(alpha=alpha, normalize=normalize)
        if modelType == "lasso":
            model = Lasso(alpha=alpha, normalize=normalize)
        if modelType == "huber":
            model = HuberRegressor(fit_intercept=fitInt, alpha=alpha, epsilon=epsilon)
        if modelType == "treeReg":
            model = DecisionTreeRegressor(max_depth=max_depth, max_features=max_features, min_samples_split=min_samples_split)
        if modelType == "treeClass":
            model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, min_samples_split=min_samples_split)
        if modelType == "forestReg":
            model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features, min_samples_split=min_samples_split, n_jobs=n_jobs)
        if modelType == "forestClass":
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, max_features=max_features, min_samples_split=min_samples_split, n_jobs=n_jobs)
    return model

def SetupModel(modelInit, parOptional={}):
    # model selection and hyperparameters
    modelType = modelInit[0]
    kernel = modelInit[1]
    poly = modelInit[2]
    parModel = {"modelType": modelType, "poly": poly, "kernel": kernel, **parOptional}
    return parModel
In [43]:
GetParameterSet(['alpha', 'gamma'],[10, 10])
Out[43]:
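With parRange equal to 10 the offset np.around(parRange[i]/2) is 5, so both 'alpha' and 'gamma' come back as the logarithmic grid 1e-5, 1e-4, ..., 1e4. A minimal equivalent sketch with numpy:

np.logspace(-5, 4, 10)   # same 10 values as GetParameterSet(['alpha'], [10])['alpha']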
In [27]:
import sys
sys.path.append("../../src/")
import features.feature_utils as fu
import models.model_utils as mu
import plots.plot_utils as pu
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import GridSearchCV
%matplotlib inline
In [31]:
model = ['forestClass',None,None]
modelOpt = {'n_estimators':10}
pw = True
step = 2
seqRegions = [[0,12],[-6,11]]
data = '../../data/interim/pw_hammer_prom_lib.csv'
dataEnrich = ['../../data/interim/pw_mutalik_prom_lib.csv']
dataOut = ['../../data/interim/pw_anderson_prom_lib.csv','../../data/interim/pw_brewster_prom_lib.csv'
,'../../data/interim/pw_inbio_prom_lib.csv']
dataOutLabels = ['anderson','brewster','inbio']
In [22]:
def LearningCurveOutOfSampleEnriched(dfDataset, featureBox, enrichBox, y, y_enrich, dataOutList, parModel, pw=False, step=1):
    if pw is True:
        temp = np.unique(dfDataset[['ID_1', 'ID_2']].values)
        dfId = pd.Series(temp)
    else:
        dfId = dfDataset['ID']
    lenId = len(dfId)
    Id = dfId.values
    indexId = np.array(range(lenId))
    scores = np.zeros(shape=(len(dataOutList), lenId//step))
    for i in range(lenId//step):
        print("\rProgress {:2.1%}".format(i/len(Id)), end='')
        train = Id[:(i*step)]
        if pw is True:
            indexTrain = (dfDataset['ID_1'].isin(train) & dfDataset['ID_2'].isin(train)).values
        else:
            indexTrain = dfDataset['ID'].isin(train).values
        X_train = np.vstack((enrichBox, featureBox[indexTrain]))
        y_train = np.append(y_enrich, y[indexTrain])
        for j in range(len(dataOutList)):
            score, y_pred = EvaluateScore(X_train, dataOutList[j][1].values, y_train, dataOutList[j][2], {**parModel})
            if pw is True:
                scores[j, i] = score
            else:
                scores[j, i] = abs(stats.spearmanr(dataOutList[j][0]['mean_score'], y_pred)[0])
    return scores
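Note that this notebook-local version redefines the module function in the notebook namespace: when pw is False it scores each out-of-sample library by the absolute Spearman correlation between its 'mean_score' column and the predictions, instead of only recording the pairwise score.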
In [32]:
parModel = SetupModel(model, modelOpt)
dfDataset, featureBox = fu.CreateFeaturesFromData(data, seqRegions, pw, shuffle=True)

enrichBoxList = []
y_enrich = []
for e in dataEnrich:
    dfEnrich, enrichBox = fu.CreateFeaturesFromData(e, seqRegions, pw)
    y = dfEnrich['rank']
    enrichBoxList.append(enrichBox)
    y_enrich.append(y)
    #dataEnrichList.append((dfEnrich, enrichBox, y))
enrichBox = np.vstack((enrichBoxList[:]))

dataOutList = []
for d in dataOut:
    dfOut, outBox = fu.CreateFeaturesFromData(d, seqRegions, pw)
    y = dfOut['rank']
    dataOutList.append((dfOut, outBox, y))

X = featureBox.values
y = dfDataset['rank']

scores = LearningCurveOutOfSampleEnriched(dfDataset, featureBox, enrichBox, y, y_enrich, dataOutList, parModel, pw, step)

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.set_title("Learning curve out of sample score")
colors = ['bo', 'ro', 'yo', 'go', 'wo', 'mo', 'co', 'ko', 'bo', 'co']
for j in range(len(dataOut)):
    ax.plot(range(scores.shape[1]), scores[j, :], colors[j], label=dataOutLabels[j])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.set_xlabel("Step")
ax.set_ylabel("Score")
In [23]:
model = ['forestClass',None,None]
modelOpt = {'n_estimators':10}
k= 5
pw = True
step = 1
seqRegions = [[-7,12],[-6,11]]
data = '../../data/interim/pw_hammer_prom_lib.csv'
dataEnrich= '../../data/interim/pw_mutalik_prom_lib.csv'
dfDataset , featureBox = fu.CreateFeaturesFromData(data, seqRegions, pw, shuffle=True)
dfEnrich, enrichBox = fu.CreateFeaturesFromData(dataEnrich, seqRegions, pw, shuffle=True)
X_enrich = enrichBox.values
y_enrich = dfEnrich['rank']
X = featureBox.values
y = dfDataset['rank']
parModel = mu.SetupModel(model, modelOpt)
In [20]:
scores
Out[20]:
In [22]:
if pw is True:
    temp = np.unique(dfDataset[['ID_1', 'ID_2']].values)
    dfId = pd.Series(temp[:-(len(temp) % k)])
else:
    dfId = dfDataset['ID'][:-(len(dfDataset) % k)]
lenId = len(dfId)
Id = dfId.values
indexId = np.array(range(lenId))
scores = np.array([])
it = 0
for i in range(k):
    boolTest = np.logical_and(indexId >= i*lenId/k, indexId < (i+1)*lenId/k)
    test = Id[boolTest]
    train = Id[np.invert(boolTest)]
    if pw is True:
        indexTest = (dfDataset['ID_1'].isin(test) | dfDataset['ID_2'].isin(test)).values
    else:
        indexTest = dfDataset['ID'].isin(test).values
    dfDatasetTrain = dfDataset[np.invert(indexTest)]
    X_train = featureBox[np.invert(indexTest)]
    y_train = y[np.invert(indexTest)]
    X_test, y_test = featureBox[indexTest], y[indexTest]
    for j in range((len(train))//step):
        print("\rProgress {:2.1%}".format(it/k + (j/len(train)/k)), end='')
        trainInner = train[:(j*step)]
        if pw is True:
            indexTrainInner = (dfDatasetTrain['ID_1'].isin(trainInner) & dfDatasetTrain['ID_2'].isin(trainInner)).values
        else:
            indexTrainInner = (dfDatasetTrain['ID'].isin(trainInner)).values
        X_trainInner = np.vstack((enrichBox, X_train[indexTrainInner]))
        y_trainInner = np.append(y_enrich, y_train[indexTrainInner])
        score, y_pred = mu.EvaluateScore(X_trainInner, X_test, y_trainInner, y_test, {**parModel})
        scores = np.append(scores, score)
    it += 1
scores = scores.reshape((k, -1))

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
ax1.set_title("Learning curve in sample score of enriched dataset")
colors = ['bo', 'ro', 'yo', 'go', 'wo', 'mo', 'co', 'ko', 'bo', 'co']
for i in range(k):
    ax1.plot(range(len(scores[i, :])), scores[i, :], colors[i])
ax1.set_xlabel("Step")
ax1.set_ylabel("Score")
meanScores = np.mean(scores, axis=0)
stdScores = np.std(scores, axis=0)
ax2.errorbar(range(len(meanScores)), meanScores[:], stdScores[:])
ax2.set_xlabel("Step")
ax2.set_ylabel("Score")
Out[22]:
In [24]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
ax1.set_title("Learning curve in sample score, stepwise increase of promoters in train library")
colors = ['bo', 'ro', 'yo', 'go', 'wo', 'mo', 'co', 'ko', 'bo', 'co']
for i in range(k):
    ax1.plot(range(len(scores[i, :])), scores[i, :], colors[i])
ax1.set_xlabel("Step")
ax1.set_ylabel("Score")
meanScores = np.mean(scores, axis=0)
stdScores = np.std(scores, axis=0)
ax2.errorbar(range(len(meanScores)), meanScores[:], stdScores[:])
ax2.set_xlabel("Step")
ax2.set_ylabel("Score")
Out[24]:
In [50]:
dfDataset = pd.read_csv('../../data/interim/pw_mutalik_prom_lib.csv')
dfDatasetAligned = fu.AlignSequences(dfDataset, pw=True)
dfDatasetShuffled , featureBox = fu.PositionalFeaturesPW(dfDatasetAligned, [[-7,12],[-6,11]], shuffle=True)
X = featureBox.values
y = dfDatasetShuffled['rank']
"""parSet = GetParameterSet(parLabel, parRange)
model = SelectModel(**parModel)
GS = GridSearchCV(model, parSet, cv=k, n_jobs=n_jobs)
GS.fit(X,y)"""
Out[50]:
In [10]:
GS.best_estimator_
Out[10]:
In [6]:
np.unique(dfDatasetShuffled[['ID_1']].values).size
Out[6]:
In [38]:
scores
Out[38]:
In [29]:
len(train)*5
Out[29]: