In [1]:
import numpy as np
import pandas as pd
import h5py
from utils.input_pipeline import *
import time
import glob
import multiprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils.input_pipeline import load_data, load_protein
from scipy.stats import randint as sp_randint
#%matplotlib inline
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer, Normalizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.feature_selection import SelectFromModel
In [2]:
# Model definition: impute missing values, normalize each sample, then fit a
# random forest. Hyper-parameters are tuned with a randomized search scored on F1.
# A fixed random_state on both the forest and the search makes the sampled
# hyper-parameters and the fitted trees repeatable across kernel restarts.
imputer = Imputer()
normalizer = Normalizer()
forest_clf = RandomForestClassifier(n_jobs=4, random_state=42)
forest_pipe = Pipeline([
    ('imputer', imputer),
    ('normalizer', normalizer),
    ('forest_clf', forest_clf),
])

# Search space. Keys use the "<step name>__<parameter>" convention so they are
# routed to the 'forest_clf' step of the pipeline.
# TODO: the max_depth entry is currently disabled; enabling it would restrict
# the search to shallow trees to speed up fitting.
forest_params = {
    "forest_clf__n_estimators": sp_randint(15, 50),
    "forest_clf__oob_score": [True],
    # "forest_clf__max_depth": sp_randint(2, 10),
}

# NOTE(review): the forest uses n_jobs=4 and the search uses n_jobs=5, so the
# two levels of parallelism multiply -- confirm this suits the target machine.
best_forest_clf = RandomizedSearchCV(
    forest_pipe,
    forest_params,
    cv=5,
    n_jobs=5,
    scoring='f1',
    random_state=42,
)
In [4]:
# Build the list of feature names from the CSV: every line of the file becomes
# one entry, with its trailing newline removed. The list is later handed to
# load_data via its features_list argument.
with open("preprocessed_features_pca.csv", "r") as input_file:
    feature_list = [raw_line.strip('\n') for raw_line in input_file]

# Report how many entries were read.
print(len(feature_list))
In [5]:
# Load X and y from the HDF5 file, restricted to the names in feature_list,
# and report the wall-clock time the load took.
print("loading data...")
t0 = time.time()
X, y = load_data(
    "data/full_26_kinase_data.h5",
    features_list=feature_list,
)
t1 = time.time()
elapsed_seconds = t1 - t0
print("data loaded in", elapsed_seconds, "seconds.")
In [6]:
# Sanity check: show the dimensions of the loaded features and labels.
features_shape, labels_shape = X.shape, y.shape
print(features_shape, labels_shape)
In [7]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out 20% of the samples for final evaluation, keeping the class
# proportions the same in both splits (stratify).
# The labels are flattened once and reused for both the target and the
# stratification key (presumably y is a 2-D column array -- confirm against
# load_data).
labels = y.flatten()

# A fixed random_state makes the split reproducible, so the held-out metrics
# reported further down do not change on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
In [8]:
# Run the randomized hyper-parameter search on the training split only.
# The call is left as the last expression so the cell displays its result.
best_forest_clf.fit(X=X_train, y=y_train)
Out[8]:
In [9]:
# Cross-validated score (scoring='f1') of the best parameter setting found.
best_cv_score = best_forest_clf.best_score_
print(best_cv_score)
In [10]:
# Evaluate the best pipeline found by the search on the held-out test split.
best_forest = best_forest_clf.best_estimator_
preds = best_forest.predict(X_test)

# scikit-learn metrics take their arguments as (y_true, y_pred).
accuracy = accuracy_score(y_test, preds)

# Do not rebind the name `f1_score`: assigning the result to it replaces the
# imported metric function with a float, so re-running this cell (or calling
# f1_score anywhere afterwards) fails with "object is not callable".
test_f1 = f1_score(y_test, preds)

print("accuracy:", accuracy, "\tf1-score:", test_f1)
In [ ]: