In [2]:
import sys
import os
import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../lib')
import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv
import tensorflow as tf
import gensim
import string
import datetime as dt
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Presumably the seed handed to random_state / seeding calls for reproducibility;
# it is not referenced in the cells visible here -- TODO confirm where it is consumed.
random_state_number = 967898
In [3]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of all local devices whose device_type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names
# Session config with allow_growth switched on in the GPU options.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
# Last expression of the cell: echo the GPU device names as the cell output.
get_available_gpus()
Out[3]:
In [4]:
%pylab
%matplotlib inline
%load_ext autoreload
In [5]:
# Disable pandas' chained-assignment warning and allow up to 999 columns when
# a frame is displayed.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 999)
# Palette returned by seaborn when color_palette() is called without arguments.
color = sns.color_palette()
In [1]:
# Read the train and test frames from the stage-1 HDF5 store.
# mode='r' opens the file read-only, so a wrong path raises immediately instead
# of silently creating an empty store (the default mode is append). The context
# manager closes the file even if one of the key lookups raises.
with pd.HDFStore('processed/stage1/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']
In [6]:
# Rich-display the first rows of the train frame, then of the test frame.
display(train_df.head(), test_df.head())
In [7]:
# Row counts of the train and test frames, one per line.
print(len(train_df), len(test_df), sep='\n')
In [8]:
# Placeholders so both names exist even before the load below runs.
vocab_words = vocab_wordidx = None
# The pickle holds a 2-item sequence that is unpacked into the two names.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# files produced by a trusted pipeline stage.
with open('processed/stage2/vocab_words_wordidx.pkl', 'rb') as vocab_file:
    vocab_words, vocab_wordidx = pickle.load(vocab_file)
(len(vocab_words), len(vocab_wordidx))
Out[8]:
In [10]:
# Distinct tokens of the training set: each Sentences entry is iterated as a
# document of sentences, and each sentence as a sequence of tokens.
train_words = {
    word
    for document in train_df.Sentences
    for sentence in document
    for word in sentence
}
len(train_words)
Out[10]:
In [11]:
# Distinct tokens of the test set: each Sentences entry is iterated as a
# document of sentences, and each sentence as a sequence of tokens.
test_words = {
    word
    for document in test_df.Sentences
    for sentence in document
    for word in sentence
}
len(test_words)
Out[11]:
In [13]:
# Distinct items obtained by flattening every Variation entry of the train frame.
# Assumes each entry is a sequence of tokens; plain strings would be split into
# single characters -- TODO confirm.
train_variations = {
    item
    for variation in train_df.Variation
    for item in variation
}
len(train_variations)
Out[13]:
In [14]:
# Distinct items obtained by flattening every Variation entry of the test frame.
# Assumes each entry is a sequence of tokens; plain strings would be split into
# single characters -- TODO confirm.
test_variations = {
    item
    for variation in test_df.Variation
    for item in variation
}
len(test_variations)
Out[14]:
In [15]:
# Distinct items obtained by flattening every Gene entry of the train frame.
# Assumes each entry is a sequence of tokens; plain strings would be split into
# single characters -- TODO confirm.
train_genes = {
    item
    for gene in train_df.Gene
    for item in gene
}
len(train_genes)
Out[15]:
In [16]:
# Distinct items obtained by flattening every Gene entry of the test frame.
# Assumes each entry is a sequence of tokens; plain strings would be split into
# single characters -- TODO confirm.
test_genes = {
    item
    for gene in test_df.Gene
    for item in gene
}
len(test_genes)
Out[16]:
vocab_words and vocab_wordidx
In [17]:
# Number of gene items that occur in both the train and the test set.
len(train_genes.intersection(test_genes))
Out[17]:
In [19]:
# Number of variation items that occur in both the train and the test set.
len(train_variations.intersection(test_variations))
Out[19]:
In [20]:
# Number of tokens that occur in both the train and the test vocabulary.
len(train_words.intersection(test_words))
Out[20]:
In [22]:
# One row per vocabulary (train, then test): size of the combined variation set,
# then how many train / test variation items also appear in that vocabulary.
all_variations_count = len(train_variations.union(test_variations))
for split_words in (train_words, test_words):
    print(
        all_variations_count,
        len(split_words.intersection(train_variations)),
        len(split_words.intersection(test_variations)),
    )
In [23]:
# One row per vocabulary (train, then test): size of the combined gene set,
# then how many train / test gene items also appear in that vocabulary.
all_genes_count = len(train_genes.union(test_genes))
for split_words in (train_words, test_words):
    print(
        all_genes_count,
        len(split_words.intersection(train_genes)),
        len(split_words.intersection(test_genes)),
    )
In [24]:
# First sentence of the Sentences entry at index 0 of the train frame.
train_df['Sentences'][0][0]
Out[24]:
In [25]:
# Echo the string of ASCII punctuation characters provided by the stdlib `string` module.
string.punctuation
Out[25]:
In [28]:
# Drop tokens that consist only of punctuation characters from the first
# sentence of the Sentences entry at index 0.
# `token in string.punctuation` would be a substring test against one long
# string, so multi-character tokens such as '...' or '--' slipped through while
# '()' matched only because those characters happen to be adjacent. Testing each
# character against a set removes every all-punctuation token (an empty token is
# still dropped, as before).
punctuation_chars = set(string.punctuation)
no_punctuations = [
    w for w in train_df.Sentences[0][0]
    if not all(ch in punctuation_chars for ch in w)
]
no_punctuations
Out[28]:
In [29]:
# Number of sentences in each training document, followed by summary statistics.
train_sentence_counts = train_df['Sentences'].apply(len)
train_sentence_counts.describe()
Out[29]:
In [31]:
def _mean_sentence_length(document):
    """Return the mean number of tokens per sentence within one document."""
    sentence_lengths = [len(sentence) for sentence in document]
    return np.mean(sentence_lengths)


# Per-document average sentence length (in tokens), then summary statistics.
train_words_in_sentences = train_df['Sentences'].apply(_mean_sentence_length)
train_words_in_sentences.describe()
Out[31]:
In [32]:
# Sentence count per training document (the same statistic an earlier cell
# stores as train_sentence_counts), followed by summary statistics.
train_sentences = train_df['Sentences'].apply(len)
train_sentences.describe()
Out[32]:
In [33]:
# Sentence count per test document, followed by summary statistics.
test_sentences = test_df['Sentences'].apply(len)
test_sentences.describe()
Out[33]:
In [34]:
def _mean_chars_per_sentence(document):
    """Return the mean, over a document's sentences, of the summed token lengths."""
    chars_per_sentence = [
        np.sum([len(word) for word in sentence])
        for sentence in document
    ]
    return np.mean(chars_per_sentence)


# Per-document average number of characters per sentence, then summary statistics.
train_chars_in_sentences = train_df['Sentences'].apply(_mean_chars_per_sentence)
train_chars_in_sentences.describe()
Out[34]:
In [ ]:
# Show a small, deterministic preview rather than echoing the entire vocabulary
# set into the cell output (which bloats the notebook and buries the narrative).
sorted(train_words)[:20]
In [15]:
# Number of training rows per Class value.
train_df['Class'].value_counts()
Out[15]:
In [17]:
# Sentences of the training rows whose Class equals 7
# (what label 7 stands for is not shown in this notebook).
train_df.loc[train_df['Class'] == 7, 'Sentences']
Out[17]:
In [ ]: