Data Exploration

Setup


In [2]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv

import tensorflow as tf
import gensim
import string

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed value for reproducibility. NOTE(review): none of the cells visible
# in this notebook pass it to an RNG — confirm where it is meant to be used.
random_state_number = 967898

In [3]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Build the session config with `allow_growth` enabled on the GPU options,
# create the session from it, then list the GPUs visible to TensorFlow.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
get_available_gpus()


Out[3]:
['/gpu:0', '/gpu:1']

In [4]:
%pylab
%matplotlib inline
%load_ext autoreload


Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib
/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [5]:
# Disable the chained-assignment warning and allow up to 999 columns when
# rendering data frames; keep the default seaborn palette for plots.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 999)
color = sns.color_palette()

Data

load data frames


In [1]:
# Read the pre-processed train/test frames. The store is opened read-only so a
# wrong path fails immediately instead of creating an empty file, and the
# `with` block closes it even when a key lookup raises.
with pd.HDFStore('processed/stage1/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-720aca14c334> in <module>()
----> 1 store = pd.HDFStore('processed/stage1/data_frames.h5')
      2 train_df = store['train_df']
      3 test_df = store['test_df']
      4 store.close()

NameError: name 'pd' is not defined

In [6]:
# Preview the first rows of the training frame, then of the test frame.
display(train_df.head(), test_df.head())


index Class Gene Variation Sentences
0 0 3 [acsl4] [r570s] [[2, this, mutation, resulted, in, a, myelopro...
1 1 9 [naglu] [p521l] [[abstract, the, large, tumor, suppressor, 1, ...
2 2 7 [pah] [l333f] [[vascular, endothelial, growth, factor, recep...
3 3 2 [ing1] [a148d] [[inflammatory, myofibroblastic, tumor, imt, i...
4 4 9 [tmem216] [g77a] [[abstract, retinoblastoma, is, a, pediatric, ...
Gene Variation Sentences
0 [chek2] [h371y] [[the, incidence, of, breast, cancer, is, incr...
1 [axin2] [truncating, mutations] [[an, unselected, series, of, 310, colorectal,...
2 [wnt4] [e216g] [[mycosis, fungoides, and, szary, syndrome, ar...
3 [sucla2] [g118r] [[regulated, progression, through, the, cell, ...
4 [braf] [t599instt] [[pilocytic, astrocytoma, pa, is, emerging, as...

In [7]:
# Row counts of the training and test frames, one per line.
print(len(train_df), len(test_df), sep='\n')


8989
986

In [8]:
# Load the (vocab_words, vocab_wordidx) pair from the stage-2 pickle and show
# the size of each.
# NOTE(review): pickle.load can execute arbitrary code — only load this file if
# it was produced by a trusted pipeline.
vocab_words = vocab_wordidx = None
with open('processed/stage2/vocab_words_wordidx.pkl', 'rb') as vocab_file:
    vocab_words, vocab_wordidx = pickle.load(vocab_file)
len(vocab_words), len(vocab_wordidx)


Out[8]:
(364610, 364610)

Exploration

words


In [10]:
# Distinct words across every sentence of every training document.
train_words = {
    word
    for document in train_df.Sentences
    for sentence in document
    for word in sentence
}
len(train_words)


Out[10]:
350604

In [11]:
# Distinct words across every sentence of every test document.
test_words = {
    word
    for document in test_df.Sentences
    for sentence in document
    for word in sentence
}
len(test_words)


Out[11]:
173470

In [13]:
# Distinct variation tokens over all training rows (each row holds a list).
train_variations = {token for row in train_df['Variation'] for token in row}
len(train_variations)


Out[13]:
8632

In [14]:
# Distinct variation tokens over all test rows (each row holds a list).
test_variations = {token for row in test_df['Variation'] for token in row}
len(test_variations)


Out[14]:
951

In [15]:
# Distinct gene tokens over all training rows (each row holds a list).
train_genes = {token for row in train_df['Gene'] for token in row}
len(train_genes)


Out[15]:
1507

In [16]:
# Distinct gene tokens over all test rows (each row holds a list).
test_genes = {token for row in test_df['Gene'] for token in row}
len(test_genes)


Out[16]:
279

vocab_words and vocab_wordidx


In [17]:
# Number of gene tokens that occur in both the training and the test set.
len(train_genes.intersection(test_genes))


Out[17]:
265

In [19]:
# Number of variation tokens that occur in both the training and the test set.
len(train_variations.intersection(test_variations))


Out[19]:
814

In [20]:
# Number of distinct words that occur in both the training and the test text.
len(train_words.intersection(test_words))


Out[20]:
161141

In [22]:
# One line per text corpus (train words, then test words):
#   total distinct variations | variations from train found in the text |
#   variations from test found in the text.
all_variations = train_variations | test_variations
for word_set in (train_words, test_words):
    print(
        len(all_variations),
        len(word_set & train_variations),
        len(word_set & test_variations),
    )


8769 7760 768
8769 2665 758

In [23]:
# One line per text corpus (train words, then test words):
#   total distinct genes | genes from train found in the text |
#   genes from test found in the text.
all_genes = train_genes | test_genes
for word_set in (train_words, test_words):
    print(
        len(all_genes),
        len(word_set & train_genes),
        len(word_set & test_genes),
    )


1521 769 199
1521 616 188

Sentences


In [24]:
# First sentence (a list of words) of the training row labelled 0.
train_df['Sentences'][0][0]


Out[24]:
['2',
 'this',
 'mutation',
 'resulted',
 'in',
 'a',
 'myeloproliferative',
 'phenotype',
 'including',
 'erythrocytosis',
 'in',
 'a',
 'murine',
 'model',
 'of',
 'retroviral',
 'bone',
 'marrow',
 'transplantation']

In [25]:
# Show the punctuation characters defined by the stdlib `string` module.
string.punctuation


Out[25]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [28]:
# Drop tokens made up solely of punctuation characters.
# The previous `w not in string.punctuation` was a substring test against one
# long string: it removed tokens such as '()' or './' only because they happen
# to be adjacent in that string, and kept others such as '..' or '--'.
# Checking character by character removes every punctuation-only token
# (and still removes the empty string, as before).
punctuation_chars = set(string.punctuation)
no_punctuations = [
    w for w in train_df.Sentences[0][0]
    if not all(ch in punctuation_chars for ch in w)
]
no_punctuations


Out[28]:
['2',
 'this',
 'mutation',
 'resulted',
 'in',
 'a',
 'myeloproliferative',
 'phenotype',
 'including',
 'erythrocytosis',
 'in',
 'a',
 'murine',
 'model',
 'of',
 'retroviral',
 'bone',
 'marrow',
 'transplantation']

In [29]:
# Number of sentences in each training document, summarised.
train_sentence_counts = train_df['Sentences'].map(len)
train_sentence_counts.describe()


Out[29]:
count    8989.000000
mean      324.372344
std       217.407948
min         1.000000
25%       204.000000
50%       283.000000
75%       380.000000
max      3119.000000
Name: Sentences, dtype: float64

In [31]:
def _mean_sentence_length(document):
    """Return the average number of words per sentence in one document."""
    sentence_lengths = [len(sentence) for sentence in document]
    return np.mean(sentence_lengths)


# Per-document average sentence length (in words), summarised.
train_words_in_sentences = train_df['Sentences'].apply(_mean_sentence_length)
train_words_in_sentences.describe()


Out[31]:
count    8989.000000
mean       27.435461
std         3.914051
min         1.000000
25%        24.972112
50%        27.125000
75%        29.482022
max        52.466667
Name: Sentences, dtype: float64

In [32]:
# Sentence count per training document (same quantity as
# `train_sentence_counts`), kept under this name to pair with `test_sentences`.
train_sentences = train_df['Sentences'].apply(len)
train_sentences.describe()


Out[32]:
count    8989.000000
mean      324.372344
std       217.407948
min         1.000000
25%       204.000000
50%       283.000000
75%       380.000000
max      3119.000000
Name: Sentences, dtype: float64

In [33]:
# Sentence count per test document, summarised.
test_sentences = test_df['Sentences'].apply(len)
test_sentences.describe()


Out[33]:
count     986.000000
mean      337.166329
std       244.728149
min         5.000000
25%       195.000000
50%       281.000000
75%       401.000000
max      2964.000000
Name: Sentences, dtype: float64

Characters


In [34]:
def _mean_chars_per_sentence(document):
    """Return the average per-sentence character count of one document.

    A sentence's character count is the sum of the lengths of its words.
    """
    chars_per_sentence = [
        np.sum([len(word) for word in sentence]) for sentence in document
    ]
    return np.mean(chars_per_sentence)


# Per-document average characters per sentence, summarised.
train_chars_in_sentences = train_df['Sentences'].apply(_mean_chars_per_sentence)
train_chars_in_sentences.describe()


Out[34]:
count    8989.000000
mean      151.113143
std        22.861147
min         4.000000
25%       136.980435
50%       149.089552
75%       162.356401
max       288.433333
Name: Sentences, dtype: float64

encoding issues


In [ ]:
# Displaying the whole `train_words` set (hundreds of thousands of tokens)
# would flood the notebook. To inspect encoding problems, list only tokens that
# contain a non-ASCII character, sorted for stable output and capped at 50.
non_ascii_words = sorted(
    word for word in train_words if any(ord(ch) > 127 for ch in word)
)
print(len(non_ascii_words))
non_ascii_words[:50]

Text Data and classes exploration


In [15]:
# Number of training rows per class label, most frequent first.
train_df['Class'].value_counts()


Out[15]:
7    1692
4    1397
2    1193
1     990
6     983
5     930
3     739
8     688
9     377
Name: Class, dtype: int64

In [17]:
# Documents belonging to class 7 (the most frequent class in the counts above).
train_df.loc[train_df['Class'] == 7, 'Sentences']


Out[17]:
2       [[vascular, endothelial, growth, factor, recep...
16      [[janus, jak, tyrosine, kinases, contain, a, t...
25      [[the, ten-eleven, translocation, 1, tet1, gen...
27      [[myc, expression, is, deregulated, in, a, wid...
32      [[individuals, with, neurofibromatosis, type, ...
48      [[this, 12-week, clinical, study, evaluated, t...
87      [[quality, control, mechanisms, promote, aggre...
93      [[c-kit, is, a, member, of, the, type, 3, subc...
96      [[the, were, premature, closely, derived, to, ...
101     [[kabuki, syndrome, ks, is, a, multiple, conge...
108     [[eml4alk, fusions, define, a, subset, of, lun...
111     [[e26, transformation-specific, ets, transcrip...
113     [[activating, mutations, in, jak1, and, jak2, ...
114     [[a, p53, hot-spot, mutation, found, frequentl...
117     [[establishedmsh6-null, mice, present, afreque...
133     [[recent, evidence, identified, a, genetic, an...
155     [[ligand-induced, phosphorylation, of, the, re...
157     [[introduction, anaplastic, lymphoma, kinase, ...
165     [[kinase, domain, kd, mutations, of, bcr-abl, ...
174     [[acquired, somatic, mutations, in, _atrx, an,...
186     [[the, maintenance, of, genomic, integrity, du...
187     [[inactivation, of, ras, gtpase, activating, p...
191     [[the, oncogenic, property, of, anaplastic, ly...
194     [[ezh2, enhancer, of, zeste, homolog, 2, is, a...
203     [[acetylation, of, multiple, lysine, residues,...
210     [[the, thyroid, trk-t3, oncogene, results, fro...
212     [[interaction, of, tcf7l2, with, translocated,...
225     [[personalized, therapy, provides, the, best, ...
251     [[endometrial, stromal, sarcomas, ess, are, ge...
254     [[extracellular, signal-regulated, kinase-1, a...
                              ...                        
8898    [[we, identified, novel, gene, fusions, in, pa...
8902    [[the, congenital, fibrosarcoma, t, 12, 15, p1...
8903    [[pediatric, high-grade, glioma, hgg, is, a, d...
8905    [[pediatric, high-grade, glioma, hgg, is, a, d...
8908    [[lung, cancer, is, the, leading, cause, of, c...
8910    [[lung, cancer, is, the, leading, cause, of, c...
8911    [[collagen, is, an, important, extracellular, ...
8912    [[collagen, is, an, important, extracellular, ...
8913    [[lung, cancer, is, the, leading, cause, of, c...
8919    [[head, and, neck, squamous, cell, carcinoma, ...
8925    [[rearrangements, of, the, proto-oncogene, ret...
8926    [[the, ret, protooncogene, mutations, responsi...
8927    [[we, investigatedthe, transformingactivityoft...
8929    [[we, investigatedthe, transformingactivityoft...
8930    [[ret, is, a, single-pass, transmembrane, rece...
8931    [[somatic, ret, mutations, have, been, identie...
8932    [[we, investigatedthe, transformingactivityoft...
8933    [[we, investigatedthe, transformingactivityoft...
8939    [[mutations, of, the, ret, receptor, tyrosine,...
8946    [[ret, is, a, single-pass, transmembrane, rece...
8947    [[activating, germ-line, point, mutations, in,...
8952    [[a, considerable, proportion, of, ladcs, the,...
8954    [[constitutive, activation, of, the, ret, rece...
8956    [[we, investigatedthe, transformingactivityoft...
8959    [[we, investigatedthe, transformingactivityoft...
8960    [[we, investigatedthe, transformingactivityoft...
8962    [[introduction, inherited, germ, line, activat...
8964    [[the, ret, proto-oncogene, encodes, a, recept...
8976    [[familial, platelet, disorder, with, propensi...
8978    [[runx, proteins, belong, to, a, family, of, m...
Name: Sentences, Length: 1692, dtype: object

In [ ]: