In [1]:
import sys
import os
import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../../lib')
import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv
import json
import functools
import time
import string
import datetime as dt
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Fixed seed handed to train_test_split (random_state=...) so the hold-out split is repeatable.
random_state_number = 967898
In [2]:
import tensorflow as tf
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of all local devices whose ``device_type`` is ``'GPU'``.

    The devices are enumerated with ``device_lib.list_local_devices()``; the
    result is an empty list when none of them is a GPU.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names
# Build a session configuration with GPU memory growth enabled, then open the
# session that is kept in `sess`.
# NOTE(review): tf.ConfigProto / tf.Session are TensorFlow 1.x-style entry points —
# confirm the installed TensorFlow version still exposes them at top level.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
# Last expression of the cell: shows the list of GPU device names as the cell output.
get_available_gpus()
Out[2]:
In [3]:
%pylab
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
In [4]:
# Notebook-wide display/config settings.
# Setting chained_assignment to None turns off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None
# Show up to 999 columns when a DataFrame is rendered.
pd.options.display.max_columns = 999
# Current seaborn colour palette, kept in `color`.
color = sns.color_palette()
In [5]:
# Read both prepared frames from the HDF5 store.
# The store is opened read-only (the default mode is append, which can create or
# modify the file) and inside a `with` block so the file handle is closed as soon
# as the frames are in memory instead of staying open for the whole session.
with pd.HDFStore('../data_prep/processed/stage1/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']
In [6]:
# Preview the first rows of the training frame, then of the test frame.
for frame in (train_df, test_df):
    display(frame.head())
In [7]:
# Load the pickled (vocab_words, vocab_wordidx) pair.
# NOTE(review): pickle.load executes arbitrary code from the file — only load files
# produced by a trusted step of this project.
with open('../data_prep/processed/stage1/vocab_words_wordidx.pkl', 'rb') as vocab_file:
    vocab_words, vocab_wordidx = pickle.load(vocab_file)
vocab_size = len(vocab_words)
# Cell output: both sizes side by side as a quick consistency check.
(vocab_size, len(vocab_wordidx))
Out[7]:
In [8]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import make_scorer, f1_score, precision_score, accuracy_score, log_loss
# Scorer objects usable as `scoring=` in cross_val_score / GridSearchCV / RandomizedSearchCV.
f1_scorer = make_scorer(f1_score, average="macro")
precision_scorer = make_scorer(precision_score, average="macro")
# accuracy_score has no `average` parameter; forwarding one makes the scorer raise
# TypeError the first time it is called, so it is built without extra kwargs.
accuracy_scorer = make_scorer(accuracy_score)
# log_loss is a loss computed from class probabilities: it must be fed
# predict_proba output (needs_proba=True) and be minimised (greater_is_better=False).
# On scikit-learn >= 1.4 the equivalent spelling is response_method="predict_proba".
log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
In [9]:
# Flatten each row's nested Sentences value by one level into a single list.
# The result is kept in a local series instead of being written back to
# train_df.Sentences, so re-running this cell cannot flatten the data a second time.
# (assumes each Sentences entry is a list of token lists — TODO confirm)
flat_sentences = train_df.Sentences.apply(lambda ll: list(itertools.chain.from_iterable(ll)))
all_text_train_df = pd.DataFrame()
# Element-wise concatenation of the three columns into one "Text" value per row.
# (presumably Gene and Variation are token lists as well — verify against data prep)
all_text_train_df["Text"] = train_df.Gene + train_df.Variation + flat_sentences
all_text_train_df["Class"] = train_df.Class
display(all_text_train_df.head())
In [10]:
# Hold out 10% of the rows, stratified on Class, with a fixed seed.
features = all_text_train_df.Text
labels = all_text_train_df.Class
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.10,
    stratify=labels,
    random_state=random_state_number,
)
# Print (features shape, labels shape) for the train part, then for the held-out part.
for part_x, part_y in ((x_train, y_train), (x_test, y_test)):
    print(part_x.shape, part_y.shape)
In [11]:
# Drop the names of the full training frames now that the split exists.
del all_text_train_df, train_df
# test_df is intentionally not deleted here.
In [12]:
# TF-IDF re-weighting, applied later on top of the raw term counts.
tfidf = TfidfTransformer()
# Term counter restricted to the pre-built vocabulary loaded from the pickle.
# (presumably vocab_wordidx maps each word to its column index — verify)
cvec = CountVectorizer(vocabulary=vocab_wordidx)
In [13]:
# Join every row of x_train into one space-separated string for the vectorizer.
# NOTE: this rebinds x_train, so the cell is meant to be run once per kernel session.
x_train = x_train.str.join(" ")
# Term counts, then TF-IDF weights; the IDF statistics are fitted on these training rows.
x_train_counts = cvec.fit_transform(x_train, y_train)
x_train_tf = tfidf.fit_transform(x_train_counts)
# Report the shape of the count matrix first, then of the TF-IDF matrix.
for matrix in (x_train_counts, x_train_tf):
    print(matrix.shape)
In [14]:
# Join every row of x_test into one space-separated string for the vectorizer.
x_test = x_test.str.join(" ")
# Only *transform* the held-out rows with the objects fitted on the training rows.
# Calling fit_transform here would re-estimate the IDF weights from the held-out
# data, so train and held-out features would be weighted differently and the
# fitted `tfidf` object would no longer match x_train_tf.
x_test_counts = cvec.transform(x_test)
print(x_test_counts.shape)
x_test_tf = tfidf.transform(x_test_counts)
print(x_test_tf.shape)
In [15]:
# Cell output: number of entries in the held-out set.
len(x_test)
Out[15]:
In [16]:
# Force a full garbage-collection pass after the frames were deleted; the value
# shown as the cell output is the number of unreachable objects it found.
gc.collect()
Out[16]:
Not able to run XGBoost: the system died partway through training on each of 3 attempts.
In [ ]:
from xgboost import XGBClassifier
from xgboost.core import XGBoostError
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import scipy.stats as st
In [ ]:
xgb_clfr = XGBClassifier(objective="multi:softprob", n_estimators=500, learning_rate=0.01, nthread=10)
# Fit on the sparse TF-IDF matrix directly. Calling .toarray() first allocates a
# dense (n_documents x vocabulary_size) array, which is the likely cause of the
# crashes noted for this step (unconfirmed).
# NOTE: with sparse input the tree booster treats entries that are not stored as
# "missing" rather than as 0.0, so splits can differ slightly from a dense fit.
xgb_clfr.fit(x_train_tf, y_train)
In [19]:
import catboost
from catboost import CatBoostClassifier, CatboostError
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import ParameterSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import scipy.stats as st
In [20]:
# Multi-class CatBoost model: 500 boosting iterations on 8 threads, with its
# training artefacts written under "temp_files".
catboost_params = dict(
    iterations=500,
    loss_function='MultiClass',
    thread_count=8,
    train_dir="temp_files",
)
cat_clfr = CatBoostClassifier(**catboost_params)
In [18]:
# Held-out rows serve as the evaluation set; with use_best_model=True the fitted
# model is chosen according to its score on that set.
# Both matrices are converted to dense arrays before being handed to CatBoost.
held_out_eval = (x_test_tf.toarray(), y_test)
cat_clfr.fit(
    x_train_tf.toarray(),
    y_train,
    eval_set=held_out_eval,
    use_best_model=True,
)
Out[18]:
In [26]:
# Saving is disabled. Uncomment to write the fitted model to the path that the
# load_model cell reads; on a fresh checkout that file must already exist.
#cat_clfr.save_model("stage1/catboost_classifier")
In [21]:
# Replace the in-memory model state with the model stored on disk.
saved_model_path = "stage1/catboost_classifier"
cat_clfr.load_model(saved_model_path)
Out[21]:
In [22]:
# Macro-averaged F1 on the *training* rows (not a held-out estimate).
y_pred = cat_clfr.predict(x_train_tf.toarray())
train_macro_f1 = f1_score(y_train, y_pred, average="macro")
print(train_macro_f1)