Data Exploration

Setup



In [2]:

    
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv

import tensorflow as tf
import gensim
import string

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Seed value kept for reproducible runs.
# NOTE(review): none of the cells visible here pass it to an RNG -- confirm where it is consumed.
random_state_number = 967898



In [3]:

    
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of the local devices whose type is ``'GPU'``.

    Walks the entries returned by ``device_lib.list_local_devices()`` and
    collects the ``name`` of each one whose ``device_type`` equals ``'GPU'``,
    in the order they are reported.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Session configured with `allow_growth` enabled on the GPU options, then list
# the GPUs TensorFlow can see as this cell's output.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
get_available_gpus()









    Out[3]:





['/gpu:0', '/gpu:1']



In [4]:

    
%pylab
%matplotlib inline
%load_ext autoreload









    



Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib






    



/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"



In [5]:

    
# Notebook-wide pandas options and the default seaborn palette.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 999)
color = sns.color_palette()

Data

Load data frames



In [1]:

    
# Read both frames inside a context manager so the HDF5 handle is closed even
# if one of the key lookups raises. Opening read-only keeps a mistyped path
# from silently creating an empty store (the default mode is append).
with pd.HDFStore('processed/stage1/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-720aca14c334> in <module>()
----> 1 store = pd.HDFStore('processed/stage1/data_frames.h5')
      2 train_df = store['train_df']
      3 test_df = store['test_df']
      4 store.close()

NameError: name 'pd' is not defined



In [6]:

    
# Rich preview of the first rows of each split, train first.
display(train_df.head(), test_df.head())









    







  
    
      
      index
      Class
      Gene
      Variation
      Sentences
    
  
  
    
      0
      0
      3
      [acsl4]
      [r570s]
      [[2, this, mutation, resulted, in, a, myelopro...
    
    
      1
      1
      9
      [naglu]
      [p521l]
      [[abstract, the, large, tumor, suppressor, 1, ...
    
    
      2
      2
      7
      [pah]
      [l333f]
      [[vascular, endothelial, growth, factor, recep...
    
    
      3
      3
      2
      [ing1]
      [a148d]
      [[inflammatory, myofibroblastic, tumor, imt, i...
    
    
      4
      4
      9
      [tmem216]
      [g77a]
      [[abstract, retinoblastoma, is, a, pediatric, ...
    
  








    







  
    
      
      Gene
      Variation
      Sentences
    
  
  
    
      0
      [chek2]
      [h371y]
      [[the, incidence, of, breast, cancer, is, incr...
    
    
      1
      [axin2]
      [truncating, mutations]
      [[an, unselected, series, of, 310, colorectal,...
    
    
      2
      [wnt4]
      [e216g]
      [[mycosis, fungoides, and, szary, syndrome, ar...
    
    
      3
      [sucla2]
      [g118r]
      [[regulated, progression, through, the, cell, ...
    
    
      4
      [braf]
      [t599instt]
      [[pilocytic, astrocytoma, pa, is, emerging, as...



In [7]:

    
# Row counts of the train and test frames, one per line.
print(len(train_df), len(test_df), sep='\n')



In [8]:

    
# Load the (vocab_words, vocab_wordidx) pair saved by an earlier stage.
# NOTE: pickle.load executes arbitrary code -- only use on files you produced yourself.
vocab_words = vocab_wordidx = None
with open('processed/stage2/vocab_words_wordidx.pkl', 'rb') as vocab_file:
    vocab_words, vocab_wordidx = pickle.load(vocab_file)
(len(vocab_words), len(vocab_wordidx))









    Out[8]:





(364610, 364610)

Exploration

Words



In [10]:

    
# Distinct tokens of the training split: flatten documents -> sentences -> words
# straight into a set.
train_words = {
    word
    for document in train_df.Sentences
    for sentence in document
    for word in sentence
}
len(train_words)









    Out[10]:





350604



In [11]:

    
# Distinct tokens of the test split: flatten documents -> sentences -> words
# straight into a set.
test_words = {
    word
    for document in test_df.Sentences
    for sentence in document
    for word in sentence
}
len(test_words)









    Out[11]:





173470



In [13]:

    
# Distinct variation tokens across all training rows.
train_variations = {token for tokens in train_df.Variation for token in tokens}
len(train_variations)









    Out[13]:





8632



In [14]:

    
# Distinct variation tokens across all test rows.
test_variations = {token for tokens in test_df.Variation for token in tokens}
len(test_variations)









    Out[14]:





951



In [15]:

    
# Distinct gene tokens across all training rows.
train_genes = {token for tokens in train_df.Gene for token in tokens}
len(train_genes)









    Out[15]:





1507



In [16]:

    
# Distinct gene tokens across all test rows.
test_genes = {token for tokens in test_df.Gene for token in tokens}
len(test_genes)









    Out[16]:





279

vocab_words and vocab_wordidx



In [17]:

    
# Number of gene tokens present in both splits.
len(train_genes.intersection(test_genes))









    Out[17]:





265



In [19]:

    
# Number of variation tokens present in both splits.
len(train_variations.intersection(test_variations))









    Out[19]:





814



In [20]:

    
# Number of words present in both splits.
len(train_words.intersection(test_words))









    Out[20]:





161141



In [22]:

    
# One line per split vocabulary (train, then test): total distinct variations,
# then how many train / test variation tokens occur among that split's words.
n_all_variations = len(train_variations.union(test_variations))
for split_words in (train_words, test_words):
    print(
        n_all_variations,
        len(split_words.intersection(train_variations)),
        len(split_words.intersection(test_variations)),
    )









    



8769 7760 768
8769 2665 758



In [23]:

    
# One line per split vocabulary (train, then test): total distinct genes,
# then how many train / test gene tokens occur among that split's words.
n_all_genes = len(train_genes.union(test_genes))
for split_words in (train_words, test_words):
    print(
        n_all_genes,
        len(split_words.intersection(train_genes)),
        len(split_words.intersection(test_genes)),
    )









    



1521 769 199
1521 616 188

Sentences



In [24]:

    
# First sentence (a list of tokens) of the training row labelled 0.
train_df['Sentences'][0][0]









    Out[24]:





['2',
 'this',
 'mutation',
 'resulted',
 'in',
 'a',
 'myeloproliferative',
 'phenotype',
 'including',
 'erythrocytosis',
 'in',
 'a',
 'murine',
 'model',
 'of',
 'retroviral',
 'bone',
 'marrow',
 'transplantation']



In [25]:

    
# Show the punctuation characters defined by the `string` module.
string.punctuation









    Out[25]:





'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'



In [28]:

    
# Keep only tokens that contain at least one non-punctuation character.
# `w not in string.punctuation` is a substring test against the punctuation
# string, so it discards just the tokens that are contiguous slices of it
# ('()' is dropped, '--' is kept). Stripping punctuation from both ends and
# checking whether anything remains treats every all-punctuation token alike;
# the empty string is still dropped, as before.
no_punctuations = [w for w in train_df.Sentences[0][0] if w.strip(string.punctuation)]
no_punctuations









    Out[28]:





['2',
 'this',
 'mutation',
 'resulted',
 'in',
 'a',
 'myeloproliferative',
 'phenotype',
 'including',
 'erythrocytosis',
 'in',
 'a',
 'murine',
 'model',
 'of',
 'retroviral',
 'bone',
 'marrow',
 'transplantation']



In [29]:

    
# Sentences per training document, summarised.
train_sentence_counts = train_df['Sentences'].apply(len)
train_sentence_counts.describe()









    Out[29]:





count    8989.000000
mean      324.372344
std       217.407948
min         1.000000
25%       204.000000
50%       283.000000
75%       380.000000
max      3119.000000
Name: Sentences, dtype: float64



In [31]:

    
# Mean number of tokens per sentence, computed per training document.
train_words_in_sentences = train_df['Sentences'].apply(
    lambda sentences: np.mean(list(map(len, sentences)))
)
train_words_in_sentences.describe()









    Out[31]:





count    8989.000000
mean       27.435461
std         3.914051
min         1.000000
25%        24.972112
50%        27.125000
75%        29.482022
max        52.466667
Name: Sentences, dtype: float64



In [32]:

    
# Sentences per training document, summarised.
train_sentences = train_df['Sentences'].apply(len)
train_sentences.describe()









    Out[32]:





count    8989.000000
mean      324.372344
std       217.407948
min         1.000000
25%       204.000000
50%       283.000000
75%       380.000000
max      3119.000000
Name: Sentences, dtype: float64



In [33]:

    
# Sentences per test document, summarised.
test_sentences = test_df['Sentences'].apply(len)
test_sentences.describe()









    Out[33]:





count     986.000000
mean      337.166329
std       244.728149
min         5.000000
25%       195.000000
50%       281.000000
75%       401.000000
max      2964.000000
Name: Sentences, dtype: float64

Characters



In [34]:

    
def _mean_chars_per_sentence(document):
    """Mean, over a document's sentences, of the summed lengths of their tokens."""
    chars_per_sentence = [
        np.sum([len(token) for token in sentence]) for sentence in document
    ]
    return np.mean(chars_per_sentence)


train_chars_in_sentences = train_df['Sentences'].apply(_mean_chars_per_sentence)
train_chars_in_sentences.describe()









    Out[34]:





count    8989.000000
mean      151.113143
std        22.861147
min         4.000000
25%       136.980435
50%       149.089552
75%       162.356401
max       288.433333
Name: Sentences, dtype: float64

Encoding issues



In [ ]:

    
# Show a bounded, alphabetically ordered preview instead of echoing the whole
# set, whose repr would write every distinct training token into the cell output.
sorted(train_words)[:100]

Text Data and classes exploration



In [15]:

    
# Number of training rows per class label, most frequent first.
train_df['Class'].value_counts()









    Out[15]:





7    1692
4    1397
2    1193
1     990
6     983
5     930
3     739
8     688
9     377
Name: Class, dtype: int64



In [17]:

    
# Tokenised documents of the training rows whose Class equals 7.
train_df.loc[train_df['Class'] == 7, 'Sentences']









    Out[17]:





2       [[vascular, endothelial, growth, factor, recep...
16      [[janus, jak, tyrosine, kinases, contain, a, t...
25      [[the, ten-eleven, translocation, 1, tet1, gen...
27      [[myc, expression, is, deregulated, in, a, wid...
32      [[individuals, with, neurofibromatosis, type, ...
48      [[this, 12-week, clinical, study, evaluated, t...
87      [[quality, control, mechanisms, promote, aggre...
93      [[c-kit, is, a, member, of, the, type, 3, subc...
96      [[the, were, premature, closely, derived, to, ...
101     [[kabuki, syndrome, ks, is, a, multiple, conge...
108     [[eml4alk, fusions, define, a, subset, of, lun...
111     [[e26, transformation-specific, ets, transcrip...
113     [[activating, mutations, in, jak1, and, jak2, ...
114     [[a, p53, hot-spot, mutation, found, frequentl...
117     [[establishedmsh6-null, mice, present, afreque...
133     [[recent, evidence, identified, a, genetic, an...
155     [[ligand-induced, phosphorylation, of, the, re...
157     [[introduction, anaplastic, lymphoma, kinase, ...
165     [[kinase, domain, kd, mutations, of, bcr-abl, ...
174     [[acquired, somatic, mutations, in, _atrx, an,...
186     [[the, maintenance, of, genomic, integrity, du...
187     [[inactivation, of, ras, gtpase, activating, p...
191     [[the, oncogenic, property, of, anaplastic, ly...
194     [[ezh2, enhancer, of, zeste, homolog, 2, is, a...
203     [[acetylation, of, multiple, lysine, residues,...
210     [[the, thyroid, trk-t3, oncogene, results, fro...
212     [[interaction, of, tcf7l2, with, translocated,...
225     [[personalized, therapy, provides, the, best, ...
251     [[endometrial, stromal, sarcomas, ess, are, ge...
254     [[extracellular, signal-regulated, kinase-1, a...
                              ...                        
8898    [[we, identified, novel, gene, fusions, in, pa...
8902    [[the, congenital, fibrosarcoma, t, 12, 15, p1...
8903    [[pediatric, high-grade, glioma, hgg, is, a, d...
8905    [[pediatric, high-grade, glioma, hgg, is, a, d...
8908    [[lung, cancer, is, the, leading, cause, of, c...
8910    [[lung, cancer, is, the, leading, cause, of, c...
8911    [[collagen, is, an, important, extracellular, ...
8912    [[collagen, is, an, important, extracellular, ...
8913    [[lung, cancer, is, the, leading, cause, of, c...
8919    [[head, and, neck, squamous, cell, carcinoma, ...
8925    [[rearrangements, of, the, proto-oncogene, ret...
8926    [[the, ret, protooncogene, mutations, responsi...
8927    [[we, investigatedthe, transformingactivityoft...
8929    [[we, investigatedthe, transformingactivityoft...
8930    [[ret, is, a, single-pass, transmembrane, rece...
8931    [[somatic, ret, mutations, have, been, identie...
8932    [[we, investigatedthe, transformingactivityoft...
8933    [[we, investigatedthe, transformingactivityoft...
8939    [[mutations, of, the, ret, receptor, tyrosine,...
8946    [[ret, is, a, single-pass, transmembrane, rece...
8947    [[activating, germ-line, point, mutations, in,...
8952    [[a, considerable, proportion, of, ladcs, the,...
8954    [[constitutive, activation, of, the, ret, rece...
8956    [[we, investigatedthe, transformingactivityoft...
8959    [[we, investigatedthe, transformingactivityoft...
8960    [[we, investigatedthe, transformingactivityoft...
8962    [[introduction, inherited, germ, line, activat...
8964    [[the, ret, proto-oncogene, encodes, a, recept...
8976    [[familial, platelet, disorder, with, propensi...
8978    [[runx, proteins, belong, to, a, family, of, m...
Name: Sentences, Length: 1692, dtype: object



In [ ]:

	index	Class	Gene	Variation	Sentences
0	0	3	[acsl4]	[r570s]	[[2, this, mutation, resulted, in, a, myelopro...
1	1	9	[naglu]	[p521l]	[[abstract, the, large, tumor, suppressor, 1, ...
2	2	7	[pah]	[l333f]	[[vascular, endothelial, growth, factor, recep...
3	3	2	[ing1]	[a148d]	[[inflammatory, myofibroblastic, tumor, imt, i...
4	4	9	[tmem216]	[g77a]	[[abstract, retinoblastoma, is, a, pediatric, ...

	Gene	Variation	Sentences
0	[chek2]	[h371y]	[[the, incidence, of, breast, cancer, is, incr...
1	[axin2]	[truncating, mutations]	[[an, unselected, series, of, 310, colorectal,...
2	[wnt4]	[e216g]	[[mycosis, fungoides, and, szary, syndrome, ar...
3	[sucla2]	[g118r]	[[regulated, progression, through, the, cell, ...
4	[braf]	[t599instt]	[[pilocytic, astrocytoma, pa, is, emerging, as...