In [1]:
import numpy as np
import pandas as pd
import h5py
from utils.input_pipeline import *

import time
import glob
import multiprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils.input_pipeline import load_data, load_protein
from scipy.stats import randint as sp_randint
#%matplotlib inline

from sklearn.model_selection import cross_val_score, RandomizedSearchCV

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer, Normalizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, make_scorer

from sklearn.feature_selection import SelectFromModel

In [2]:
imputer = Imputer()
normalizer = Normalizer()
forest_clf = RandomForestClassifier(n_jobs=4)

forest_pipe = Pipeline([('imputer',imputer),('normalizer',normalizer),('forest_clf',forest_clf)])

# add a parameter for maximum tree depth, restrict to shallow decision trees to speed things up from auto
forest_params = {"forest_clf__n_estimators": sp_randint(15,50),
                 "forest_clf__oob_score": [True]
                 #,"forest_clf__max_depth": sp_randint(2,10)
                }
best_forest_clf = RandomizedSearchCV(forest_pipe,forest_params,cv=5, n_jobs=5,scoring='f1')

In [4]:
with open("preprocessed_features_pca.csv", "r") as input_file:
    feature_list = []
    for line in input_file:
        line = line.strip('\n')
        feature_list.append(line)
        
print(len(feature_list))


460

In [5]:
print("loading data...")
t0 = time.time()
X,y = load_data("data/full_26_kinase_data.h5",features_list=feature_list)
t1 = time.time()
print("data loaded in",(t1-t0),"seconds.")


  0%|          | 0/26 [00:00<?, ?it/s]
loading data...
loading 26 proteins.
100%|██████████| 26/26 [00:21<00:00,  1.35s/it]
data loaded in 21.8553946018219 seconds.


In [6]:
print(X.shape,y.shape)


(361786, 460) (361786, 1)

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score
X_train,X_test,y_train, y_test = train_test_split(X,y.flatten(),test_size=0.2,stratify=y.flatten())

In [8]:
best_forest_clf.fit(X_train,y_train)


/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:439: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/global/homes/w/wdjo224/anaconda3/envs/protein_binding/lib/python3.6/site-packages/sklearn/ensemble/forest.py:444: RuntimeWarning: invalid value encountered in true_divide
  predictions[k].sum(axis=1)[:, np.newaxis])
Out[8]:
RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('normalizer', Normalizer(copy=True, norm='l2')), ('forest_clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nod...imators=10, n_jobs=4, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=5,
          param_distributions={'forest_clf__n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x2ab8175838d0>, 'forest_clf__oob_score': [True]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='f1', verbose=0)

In [9]:
print(best_forest_clf.best_score_)


0.755021513898

In [10]:
best_forest = best_forest_clf.best_estimator_
preds = best_forest.predict(X_test)

accuracy = accuracy_score(preds,y_test)
f1_score = f1_score(preds,y_test)

print("accuracy:", accuracy, "\tf1-score:",f1_score)


accuracy: 0.991403853064 	f1-score: 0.803164556962

In [ ]: