In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv
import json
import functools
import time
import string

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed for the notebook; passed as `random_state` to train_test_split
# further down so the train/held-out partition is repeatable.
random_state_number = 967898

In [2]:
import tensorflow as tf
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of the local devices whose device type is 'GPU'.

    Devices are enumerated through `device_lib.list_local_devices()`; entries
    of any other device type are skipped. The result is a plain list of the
    `name` attribute of each matching device, in enumeration order.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Build a session config with the GPU `allow_growth` option switched on, then
# create the session that uses it. The option must be set before tf.Session()
# is constructed, so the statement order matters.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
# Last expression of the cell: displays the detected GPU device names.
get_available_gpus()


Out[2]:
['/gpu:0', '/gpu:1']

In [3]:
%pylab
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload


Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib
/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [4]:
# Notebook-wide display configuration.
# Silence pandas' chained-assignment warning and allow very wide frames to be
# rendered without column truncation.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", 999)
# Current seaborn colour palette, kept for use in plots.
color = sns.color_palette()

Data


In [5]:
# Load the pre-processed train/test frames from the stage-1 HDF5 store.
# The store is opened read-only (the default mode 'a' would create an empty
# file if the path were wrong) and inside a `with` block so the file handle is
# released as soon as both frames are in memory, instead of staying open for
# the life of the kernel.
with pd.HDFStore('../data_prep/processed/stage1/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']

In [6]:
# Preview the first rows of both frames (rendered one after the other).
display(train_df.head(), test_df.head())


ID Gene Variation Class Sentences
0 0 [fam58a] [truncating, mutations] 1 [[cyclin-dependent, kinases, , cdks, , regulat...
1 1 [cbl] [w802*] 2 [[abstract, background, non-small, cell, lung,...
2 2 [cbl] [q249e] 2 [[abstract, background, non-small, cell, lung,...
3 3 [cbl] [n454d] 3 [[recent, evidence, has, demonstrated, that, a...
4 4 [cbl] [l399v] 4 [[oncogenic, mutations, in, the, monomeric, ca...
ID Gene Variation Sentences
0 0 [acsl4] [r570s] [[2, this, mutation, resulted, in, a, myelopro...
1 1 [naglu] [p521l] [[abstract, the, large, tumor, suppressor, 1, ...
2 2 [pah] [l333f] [[vascular, endothelial, growth, factor, recep...
3 3 [ing1] [a148d] [[inflammatory, myofibroblastic, tumor, , imt,...
4 4 [tmem216] [g77a] [[abstract, retinoblastoma, is, a, pediatric, ...

In [7]:
# Load the pickled (vocab_words, vocab_wordidx) pair produced by the data-prep
# stage. NOTE: unpickling executes code embedded in the file, so this must only
# ever be pointed at files generated by this project.
vocab_path = '../data_prep/processed/stage1/vocab_words_wordidx.pkl'
with open(vocab_path, 'rb') as f:
    vocab_words, vocab_wordidx = pickle.load(f)

vocab_size = len(vocab_words)
# Cell output: both containers are expected to have the same number of entries.
(vocab_size, len(vocab_wordidx))


Out[7]:
(352220, 352220)

Train-Test Split and Data Prep


In [8]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import make_scorer, f1_score, precision_score, accuracy_score, log_loss
# Scorer objects for use with cross_val_score / GridSearchCV / RandomizedSearchCV.
f1_scorer = make_scorer(f1_score, average="macro")
precision_scorer = make_scorer(precision_score, average="macro")
# accuracy_score has no `average` parameter; forwarding one makes the scorer
# raise TypeError the first time it is called, so it is built without kwargs.
accuracy_scorer = make_scorer(accuracy_score)
# log_loss is a loss computed on class probabilities: it needs predict_proba
# output rather than hard labels, and lower values are better, so the scorer
# must negate it for model selection to rank candidates correctly.
# (`needs_proba` is the spelling for the scikit-learn generation this notebook
# runs on; recent releases use `response_method="predict_proba"` instead.)
log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [9]:
# Build one token list per training row: gene tokens + variation tokens + all
# sentence tokens.
#
# `Sentences` holds a list of sentences, each a list of tokens. The flattened
# version is kept in its own variable instead of being written back into
# `train_df`: overwriting the column made this cell non-idempotent, because a
# second run would flatten the already-flat token lists into single characters.
flat_sentences = train_df.Sentences.apply(
    lambda sentences: list(itertools.chain.from_iterable(sentences))
)

# Construct the frame in one step; `columns` pins the column order explicitly.
all_text_train_df = pd.DataFrame(
    {
        "Text": train_df.Gene + train_df.Variation + flat_sentences,
        "Class": train_df.Class,
    },
    columns=["Text", "Class"],
)
display(all_text_train_df.head())


Text Class
0 [fam58a, truncating, mutations, cyclin-depende... 1
1 [cbl, w802*, abstract, background, non-small, ... 2
2 [cbl, q249e, abstract, background, non-small, ... 2
3 [cbl, n454d, recent, evidence, has, demonstrat... 3
4 [cbl, l399v, oncogenic, mutations, in, the, mo... 4

In [10]:
# Hold out 10% of the labelled rows, stratified by class so that both
# partitions keep the class proportions; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    all_text_train_df["Text"],
    all_text_train_df["Class"],
    test_size=0.10,
    random_state=random_state_number,
    stratify=all_text_train_df["Class"],
)

# Report the sizes of the two partitions.
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)


(2988,) (2988,)
(333,) (333,)

In [11]:
# Drop the names of the frames that are no longer needed now that the split
# has been made; test_df is deliberately kept.
del all_text_train_df, train_df
#del test_df

In [12]:
# Bag-of-words counter restricted to the precomputed vocabulary mapping, so the
# feature columns are fixed up front rather than learned from the data.
# NOTE(review): the vectorizer still applies its own default tokenization to
# the joined text; vocabulary entries such as hyphenated tokens or "w802*" may
# therefore never be matched. Confirm against how vocab_wordidx was built
# before changing this -- the saved CatBoost model was trained on these features.
cvec = CountVectorizer(vocabulary=vocab_wordidx)
# Re-weights raw counts into TF-IDF values; fitted on the training counts below.
tfidf = TfidfTransformer()

In [13]:
# Turn each training token list into one space-separated string, count
# vocabulary terms, then fit the TF-IDF weighting on those counts.
# NOTE: this cell rebinds x_train, so it is meant to be run exactly once per
# kernel session -- a second run would join the characters of each string.
x_train = x_train.str.join(" ")

x_train_counts = cvec.fit_transform(x_train, y_train)
print(x_train_counts.shape)

x_train_tf = tfidf.fit_transform(x_train_counts)
print(x_train_tf.shape)


(2988, 352220)
(2988, 352220)

In [14]:
# Turn each held-out token list into one space-separated string and project it
# into the feature space that was fitted on the training split.
#
# Only `transform` is used here. Calling `fit_transform` on the held-out data
# re-estimated the IDF weights from the held-out documents, so train and
# held-out features were weighted differently and the shared `tfidf` object
# was left in a state fitted on the wrong split.
# NOTE: this cell rebinds x_test, so it is meant to be run once per kernel session.
x_test = x_test.str.join(" ")
x_test_counts = cvec.transform(x_test)
print(x_test_counts.shape)
x_test_tf = tfidf.transform(x_test_counts)
print(x_test_tf.shape)


(333, 352220)
(333, 352220)

In [15]:
# Sanity check: number of held-out documents.
len(x_test)


Out[15]:
333

In [16]:
# Force a garbage-collection pass before the memory-heavy model training below.
gc.collect()


Out[16]:
197

XGBoost

Not able to run XGBoost: the system dies midway through training on each of 3 tries.


In [ ]:
from xgboost import XGBClassifier
from xgboost.core import XGBoostError
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import scipy.stats as st


/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

training


In [ ]:
# Multi-class gradient-boosted trees on the TF-IDF features.
xgb_clfr = XGBClassifier(objective="multi:softprob", n_estimators=500,
                         learning_rate=0.01, nthread=10)
# Pass the sparse matrix directly instead of `.toarray()`: the dense form of a
# 2988 x 352220 float64 matrix needs roughly 8.4 GB before training even
# starts, which is the most likely reason the machine died on earlier runs.
# NOTE(review): with sparse input XGBoost treats absent entries as "missing"
# rather than as explicit zeros -- confirm this is acceptable for these features.
xgb_clfr.fit(x_train_tf, y_train)

CatBoost


In [19]:
import catboost
from catboost import CatBoostClassifier, CatboostError
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import scipy.stats as st

training


In [20]:
# Multi-class CatBoost model. Training artefacts are written under
# "temp_files". The notebook-wide seed is passed explicitly so that repeated
# training runs are reproducible; the model was previously left unseeded even
# though the train/held-out split is seeded.
cat_clfr = CatBoostClassifier(iterations=500, loss_function='MultiClass',
                              thread_count=8, train_dir="temp_files",
                              random_seed=random_state_number)

In [18]:
# Train on dense copies of the TF-IDF matrices (`.toarray()` materialises the
# full matrix, so this step is very memory-hungry).
# NOTE(review): the held-out split is supplied as eval_set together with
# use_best_model=True, so it presumably influences which iteration is kept and
# is no longer an untouched test set -- confirm before reporting scores on it.
# NOTE(review): this cell's execution count precedes that of the cell defining
# cat_clfr; re-run top to bottom on a fresh kernel to confirm the order works.
cat_clfr.fit(x_train_tf.toarray(), y_train, 
               use_best_model=True,
               eval_set=(x_test_tf.toarray(), y_test))


Out[18]:
<catboost.core.CatBoostClassifier at 0x7fa2037f0cf8>

In [26]:
#cat_clfr.save_model("stage1/catboost_classifier")

In [21]:
# Load a previously saved model from "stage1/catboost_classifier" (path is
# relative to the notebook's working directory) into cat_clfr, so the
# expensive fit above does not have to be repeated.
cat_clfr.load_model("stage1/catboost_classifier")


Out[21]:
<catboost.core.CatBoostClassifier at 0x7f2564f8a0f0>

In [22]:
# Macro-averaged F1 of the loaded model on the *training* split. This measures
# fit to data the model has already seen, not generalisation.
y_pred = cat_clfr.predict(x_train_tf.toarray())
train_macro_f1 = f1_score(y_train, y_pred, average="macro")
print(train_macro_f1)


0.587453613072
/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)