Building word vectors

Setup


In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../lib')

import gc
import random
import smart_open
import h5py
import csv
import tensorflow as tf
import gensim

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

random_state_number = 967898

In [2]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
get_available_gpus()


Out[2]:
['/gpu:0', '/gpu:1']

In [3]:
%pylab
%matplotlib inline
%load_ext autoreload
%autoreload


Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib
/home/bicepjai/Programs/anaconda3/envs/dsotc-c3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [4]:
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
color = sns.color_palette()

Data

load corpus vocab and wordidx


In [13]:
corpus_vocab_list, corpus_vocab_wordidx = None, None
with open('processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:
    (corpus_vocab_list, corpus_wordidx) = pickle.load(f)
print(len(corpus_vocab_list), len(corpus_wordidx))


352220 352220

load data


In [7]:
store = pd.HDFStore('processed/stage1/data_frames.h5')
train_df = store['train_df']
test_df = store['test_df']

Word Vectors Pre Trained

collecting biolab words


In [9]:
from gensim.models.keyedvectors import KeyedVectors
biolab_keyed_vectors_pubmed_pmc_wiki = KeyedVectors.load_word2vec_format('external/biolab_wvs/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

In [10]:
biolab_words_pubmed_pmc_wiki = biolab_keyed_vectors_pubmed_pmc_wiki.vocab.keys()
biolab_words = set(biolab_words_pubmed_pmc_wiki)
len(biolab_words)


Out[10]:
5443656

In [12]:
vocab_biolab = set(biolab_words) & set(vocab_words)
print (len(vocab_biolab))
vocab_biolab


100489
Out[12]:
{'inhibitor1',
 'expression19',
 'espinosa',
 'plate-shaped',
 '5359',
 'adrenal',
 'auroras',
 'de-emphasized',
 'bootstrap',
 'glyceraldehyde-3',
 '3.359',
 '16/121',
 'attenuated',
 '24.29',
 'dounced',
 'uroplakins',
 'splice-donor',
 'gplots',
 '8/117',
 'num',
 'featuresof',
 'snail',
 'favored',
 'greater-than-additive',
 'forty-seven',
 'augment',
 'grouch',
 'de-activating',
 'multiplies',
 'renaming',
 'bx',
 'vy',
 'eosinophlia',
 'tumors.2',
 'excretory',
 'ofapoptotic',
 'typi',
 'under-estimated',
 'maya',
 'extraosseous',
 'rho/rac',
 'ci',
 'galt',
 'anti-b-actin',
 '1-year',
 '0562',
 'hypodiploid',
 'regions.5',
 'isoform-selective',
 'entero-pancreatic',
 'cpt-11',
 's-protein',
 'ns/s',
 'caldesmon',
 'p50/p50',
 'translatable',
 'cytoprotection',
 'acneform',
 'defeating',
 '10p11.23',
 'pharmacotherapy',
 'haemorrhagic',
 '0.262',
 '0.20-0.78',
 'r=0.352',
 'mice19',
 'instituted',
 '4.5.4',
 '0.04',
 'refractions',
 'whenas',
 '20.47',
 'cells.15',
 'villin-like',
 'samples5',
 'structure-nonspecific',
 'two-stranded',
 'lipids',
 'justin',
 'donner',
 '3,6',
 'data36',
 'brakets',
 'occured',
 'originally',
 'ida2',
 'utra',
 '176807',
 '04/01',
 '0.936',
 'killer',
 '0.1961',
 '5-dihydrotestosterone',
 'anti-collagen',
 '7072',
 'goiters',
 'resting',
 'resection.3',
 'chip-genotyped',
 '1.0078',
 '78-bp',
 '14q11.1-q12',
 '2-sheet',
 'colonectomy',
 '1660',
 '4880',
 'anti-cytokeratin',
 'chlorambucil',
 'p-32',
 'p=0.090',
 'b3a2',
 'techne',
 '1,257',
 'phytosome',
 '2,787',
 'closed-loop',
 'descending',
 'target-of-rapamycin',
 'scheid',
 'sink',
 '851',
 'interaction29',
 'aliquoted',
 'purposes',
 'tridimensional',
 '3,346',
 'desensitizing',
 'aaa',
 '16-wk',
 'irs',
 'smiley',
 'har',
 '16,27',
 'lgrs',
 'alludes',
 '29-kb',
 'i3',
 '2.42',
 'well-polarized',
 'variable.11',
 'reproducibility',
 '0.993',
 'replicates/treatment',
 'calciumphosphate',
 'apoprotinin',
 'heterozygosity',
 'ccl20',
 'observing',
 'adenovirus-transformed',
 'exposition',
 '4/97',
 'tumourigenicity',
 'flavonolignans',
 'nonmalignant',
 'migration-promoting',
 'affinity',
 '35.6',
 'toothbrush',
 'moter',
 'outcome.21',
 'mvs',
 'volvulus',
 'survival36',
 'caspase3',
 'al.,1997',
 'nakai',
 'nedd1',
 'transcriptome/proteome',
 'technique.22',
 'softmax',
 'magnus',
 'growth-controlling',
 '3667',
 '29-89',
 'naked',
 'ideograms',
 'structure/folding',
 'panelb',
 'burns',
 'linger',
 'decatenation',
 'dated',
 'vectors',
 '7.8-fold',
 '4425',
 '5.798',
 '70m',
 'cresyl',
 'pathways.20',
 'subfertile',
 'lymphoid/myeloid',
 'julie',
 'labia',
 'plaque',
 'apoptogenic',
 'hemin',
 'experimentales',
 'ichthyosis-like',
 'rats6',
 'therapy.25',
 'contraception',
 'moderate/strong',
 'self-activating',
 'trastuzumab-refractory',
 '0.941',
 'magnetic-activated',
 'invokes',
 'arnt',
 'picker',
 '127/132',
 'tracing',
 'cells/0.1',
 'k11',
 'deceases',
 'stabilization',
 'disfavoured',
 'voracious',
 'transparency',
 'changed',
 'larvae',
 'spc216',
 'levelis',
 '2ug',
 'andcontrol',
 'unwilling',
 'geldanamycin',
 '2687',
 'p53-deficient',
 '61.6',
 '77.85',
 'tetramer-monomer',
 '10-9',
 '188-192',
 'histological',
 'recapitulation',
 'osi',
 'beater',
 '89.8',
 'dermatome',
 'worrying',
 '72-hr',
 'multichambered',
 '5169',
 'interleukin-10',
 '37',
 'scrubbing',
 'uniprot',
 'history.1',
 '0.00001',
 '524625',
 '31/88',
 'metalloproteinase-1',
 '2:2',
 'antiphosphotyrosine',
 'vitro.32',
 '0.151',
 'plummet',
 'hit',
 'doughy',
 'challenges',
 'flag-epitope',
 'ala9',
 'bmyb',
 'garlic',
 'atpases',
 'function,25',
 '6787',
 'droplet',
 'bones',
 '103.7',
 'lnx',
 '2578',
 'ov',
 'nicely',
 'available',
 'ind',
 '10-fold',
 'system,1',
 'organism',
 'x-y',
 '14,24',
 '3a',
 'non-stem',
 'glyceraldehyde-3-phosphate-dehydrogenase',
 '15/88',
 'blosum',
 'impute',
 'ileitis',
 'serum-activated',
 'iswith',
 'scorable',
 'cyp24a1',
 'sstr1',
 'repair-mediated',
 't311',
 'speckled',
 'sc-398',
 'data22',
 '9079',
 '952',
 'dacomitinib',
 'm-bcr-abl',
 '9077',
 'arginine/serine-rich',
 'plink',
 'skp-1',
 '20q13',
 '9122',
 'theres',
 'lls',
 'trichothiodystrophy',
 'uteroglobin',
 'outcome.10',
 'polymorphism-mediated',
 'skii',
 'familiar',
 'let-7-5p',
 'rs1050171',
 'pgr',
 'mice.84',
 'warm',
 'hexakisphosphate',
 'kif11',
 '15.43',
 'polydactyly',
 'hydrolysis',
 'seven-stranded',
 '154534',
 'aminoalkylamino',
 'pro-proliferative',
 '4-methoxy-6-nitro',
 '0.0232',
 'insuling',
 'distribution',
 '9/188',
 'rs138213197',
 'light-density',
 '0.477',
 'length-adjusted',
 'campomelia',
 'jarvi',
 'puromycin-containing',
 'ptc2',
 'mol/kg',
 '02.00',
 'tumorsuppressive',
 'nonchimeric',
 'ruan',
 'gsr',
 'enact',
 'vimentin-positive',
 'time-courses',
 '4,266',
 '0279',
 '4360',
 '2.5-kb',
 'acc2',
 'antiviral',
 '7.4/1',
 'i0.5',
 'fluorine',
 '36-year',
 'sierra',
 'inapparently',
 'flaring',
 'poikilocytes',
 'reported.27',
 '0.00388',
 '15-year-old',
 'whan',
 '16,35',
 'transcription-modulating',
 '0.496',
 '88.5',
 'albanese',
 'p161',
 '3in',
 'disease.1',
 'hits',
 '7000',
 'dko',
 '30-amino-acid',
 'lissencephaly',
 'aqueous',
 'squared',
 'populations3',
 'insect',
 'dual-label',
 'rac2',
 'cells.21',
 'co-operates',
 'motic',
 '5574',
 'hmga1',
 'numbered',
 'correspondence',
 'eps8',
 '2,4-dinitrophenyl',
 '16/46',
 '0.0083',
 '25,117',
 '26,29',
 'expression.25',
 'regrowth',
 '19-4',
 'avanti',
 'pheomelanins',
 'one-quarter',
 'p=0.82',
 'di-methylation',
 'calibration',
 'strikes',
 'gene.38',
 '12-transmembrane',
 'fuji',
 'pre-normalization',
 'ref.6',
 'surprising',
 '11,19,20',
 'spuriously',
 'nhs',
 '6,22',
 'immunohistologically',
 '30.00',
 'complementarity',
 '3.036',
 '64-117',
 'trio',
 'twostep',
 'expression.47',
 'anastomosed',
 'plasmid-encoding',
 '30-120',
 '2500',
 '2218',
 '4/167',
 'arterio-venous',
 'diphtheria',
 'cilia-based',
 'gadolinium',
 '1839',
 'rad',
 'interactions.24',
 'n=146',
 '1p21',
 'anti-keratin',
 'landi',
 'frost',
 'single-dose',
 '400-bp',
 'calnexin',
 'sequence-specificity',
 'protein.40',
 'undissected',
 'bayesian',
 'gfp-positive',
 'ruby',
 'ivs38-8t',
 '2624',
 'jobs',
 'replication.9',
 'poor-prognosis',
 'recurs',
 'library-selected',
 'model-phased',
 '1658',
 '28006',
 'thoughtfully',
 'degradation.25',
 'interphasic',
 'acid-soluble',
 'authenticity',
 '068',
 '356',
 'expectancy',
 'mir-124a',
 'bipositional',
 'submitters',
 'single-mutant',
 '0.588',
 '3.3.5',
 '3757',
 'lhb',
 'p38-specific',
 '3341',
 'seldomly',
 'arrestin',
 '15.50',
 'lpd',
 'land',
 'micros',
 'lowing',
 'manometry',
 'mandating',
 '11,470',
 'official',
 'onartuzumab',
 '2,579',
 'dishes',
 'ad5',
 '74',
 'albumin-containing',
 '6.877',
 'lowess',
 'donn',
 'integrin-signaling',
 '1x5',
 't529',
 '7d4',
 '1925-2002',
 'enolase',
 '0.1-cm',
 'powerpc',
 '28/64',
 '211-230',
 'tachibana',
 'androgen-binding',
 'exchanging',
 'q14',
 'milliseconds',
 '14/19',
 '6.99',
 'nuclease-mediated',
 'st3',
 'through',
 'train',
 '27.33',
 'small-bowel',
 'processes.1,2',
 'gavaged',
 'breed',
 'chromatography-tandem',
 'exclusively',
 '13,14,17',
 '7-28',
 'reln',
 'subclone',
 'salicylates',
 'time-frame',
 'colorimetric',
 '11/41',
 'n=234',
 'kpl',
 'discriminate',
 'high-leukemia',
 'fernando',
 '4/29',
 'oncocytes',
 'notions',
 'quasi-native',
 'coopted',
 'ministre',
 'miscategorized',
 'lopes',
 'voom',
 'sub-confluence',
 '2hr',
 'geniticin',
 '0.3-0.7',
 '1.2kg',
 'p=0.0124',
 'state.3',
 'extirpation',
 'cell-stress',
 '14,733',
 '3min',
 'unc',
 '64.1',
 'crescentic',
 '5149',
 'caucasoids',
 '20ug',
 '2021',
 'sc-520',
 'centrifuged',
 'ror',
 'extreme',
 'years,19',
 'iii.a',
 '21.05',
 'ischaemic',
 'localization/accumulation',
 '0.3ml',
 'fastest-rising',
 'lat',
 'deglycosylates',
 'doesnt',
 '21-specific',
 'www.oncomine.org',
 'estrogen-receptor',
 'robo4',
 'el',
 'particularities',
 'halo',
 'neuropsychiatric',
 'overrepresent',
 'mediated-apoptosis',
 'v6.5',
 'oxide',
 'exon13',
 '3-amino-9-ethylcarbazole',
 'juxtapositions',
 'finger/toe',
 'pathways.11',
 'nanoseconds',
 '20-nucleotide',
 'xdh',
 'spdbv',
 '3800',
 'prox',
 '2887',
 '3623',
 'barbieri',
 'r2=0.63',
 'cells/100-mm',
 'herpesvirus-2',
 'ear-2',
 '8,41',
 'silvestri',
 'para',
 'probemix',
 '48,000',
 'crenolanib',
 'pcl1',
 'mmm',
 'rationalization',
 'stainer',
 'potent',
 'doughnut',
 'pretest',
 '72-77',
 'intraductal',
 'fibril-like',
 'bcl2l1',
 'phenelzine',
 'two-plasmid',
 'mnp',
 'v0',
 'ultravision',
 '84103',
 'cod',
 'metabolomics',
 'late-apoptotic',
 'outcome14',
 '28s/18s',
 'fat-poor',
 'ey',
 'immunologist',
 '12.36',
 '0.291',
 'reported3',
 'osteomas',
 'digit',
 'laurenti',
 '162.3',
 'ptch2',
 '12260',
 'm6a',
 '7-3-1',
 '3-1-3',
 'proliferation-promoting',
 '1965',
 'delayed',
 'toll',
 'organs.3',
 'non-severe',
 'nal',
 'years2',
 'hai',
 'experimented',
 'epidithiodiketopiperazine',
 '0.66',
 'genes32',
 'tcf8',
 '70.5',
 'responses',
 'cin4',
 '853',
 'v-abl-mediated',
 'cut-off',
 'structure.24',
 '0.00019',
 'computer-assisted',
 'specialised',
 '3620',
 'sphenoid',
 'ctr',
 'moves',
 'elastase',
 'rewarded',
 '5330',
 'daughters',
 'models.6',
 'anatomical',
 'wef',
 'painted',
 '20.49',
 'sharing',
 '0.0944',
 'enucleation',
 'trimethylase',
 'incurable',
 'isoformspecific',
 '6:8',
 'subdomains',
 'potentially',
 'peroxisome-biogenesis',
 'deb',
 'wales',
 'lysine-to-glutamine',
 'b-casein',
 '9360',
 'centralis',
 '0.887',
 'n=159',
 'precedes',
 '54,000',
 'lei',
 'perceivably',
 'monoubiquitinylation',
 'antigen-induced',
 'seven-blade',
 '0.618',
 'unveils',
 'tartrate',
 'accidents',
 '9-25',
 'bureau',
 'nongenetic',
 'phospho-substrates',
 'din',
 'activity30',
 'lum',
 'uence',
 'polymorphism-based',
 'cross-linked',
 'immunodysregulation',
 'rephosphorylation',
 'particulate',
 'inour',
 'beta-strands',
 'ume',
 '1959',
 '30.8',
 'not.2',
 'hh-related',
 'hamon',
 '848',
 '1.60',
 'hydroxylamine',
 'sufficiency',
 '5394',
 'thoracentesis',
 'instillations',
 'keratohyalin',
 'tetrakisphosphate',
 'andg',
 'ramped',
 '2l/min',
 'v03',
 'ureter',
 '2-positive',
 'perception',
 '1,556',
 'aberration',
 'admission',
 'hu',
 '4130',
 'studies16,17',
 'section1',
 'convoluted',
 '0088',
 'concentrator-5',
 '2003a',
 'carb',
 'post-extension',
 'vismione',
 'gapless',
 'transient-transfection',
 '1200',
 'thyroglobulin',
 'alphavbeta3',
 'laue',
 '13,16',
 'dyshomeostasis',
 'unirradiated',
 'redefines',
 'non-smad',
 'cells/progenitors',
 'unliganded',
 'patients.12',
 'cancer.10',
 'acca',
 'spib',
 '1985',
 'prdm1',
 'permissions',
 'study,12',
 'whim',
 'thymuses',
 'ltx',
 'mean+s.d',
 'phospho-site',
 'mid-log',
 '7q11',
 'myelocytic',
 'saleem',
 'chromosome9',
 '40mg/kg',
 'itim',
 '65-7',
 'silences',
 'extra',
 'cardial',
 'a280',
 'plk2',
 'b.f',
 'subsaturating',
 '0.02a',
 'k-b',
 'thr',
 'back-focal',
 'message',
 '4459',
 '95/96',
 'transconjugants',
 'k71',
 'speciesin',
 '0.286',
 'solver',
 '3780',
 'unpermeabilized',
 '105/well',
 'atretic',
 'auto-inhibitory',
 'side-effects',
 'fiberoptic',
 'cancer-cell-autonomous',
 '2.263',
 'tumors4',
 'unequalled',
 'aura',
 'no2',
 '5ms',
 'cells10',
 '0150',
 '44.8',
 'stability',
 'dysontogenetic',
 'pter',
 '0.1548',
 '44,670',
 'loop-binding',
 'malm',
 'collectively',
 '56.7',
 'quinoxaline',
 '62-fold',
 'recurrence',
 'displacing',
 '168-172',
 'expression.48',
 'rpb1',
 'platelet-specific',
 'sequence-tagged',
 'directs',
 '23,9',
 'unmethylation',
 'unlike',
 'operator',
 '14-h',
 'indonesian',
 'diffi',
 'vmd',
 '18/34',
 '150mg/day',
 '0.79',
 'base-pairs',
 '3/229',
 'diagnosis.6,7',
 'blunter',
 'unidentified',
 '1-592',
 '510-513',
 'www.bioconductor.org',
 '0117',
 '736',
 'papular',
 'cry',
 'syntelic',
 '631',
 '100/total',
 'non-sensitised',
 'macrophage-colony-stimulating',
 'non-consensus',
 'sino-nasal',
 'polymerase-1',
 'mack',
 'sitedirected',
 'emblematic',
 'progressive/recurrent',
 'cycle-sequencing',
 '62-69',
 'heterodimerization-dependent',
 'inactivity',
 'uml',
 'celery',
 '0.0007',
 'separable',
 '1760',
 '6897',
 '4,609',
 'apical',
 'transformation/transcription',
 'mini-exon',
 'autoacetylation',
 'responsivity',
 'super-infected',
 'norte',
 'n=507',
 '0.4325',
 'economically',
 'non-staining',
 'translocations',
 'attractions',
 'pompa',
 'polyacrylamide-urea',
 '55-bp',
 '5-terminus',
 'tglu',
 'caspase-8-specific',
 '10595',
 'ile',
 'mammary',
 're-suspended',
 'protein.6',
 '89.94',
 '95-3',
 '58.3',
 '1.3e',
 'pre-tx',
 'gamma-treated',
 '46.9',
 'treatment.13',
 'accumu-lation',
 '1d',
 'al.,2008',
 'messengers',
 'protein-treated',
 'primetime',
 '2/31',
 '2491',
 '13039',
 'mismapping',
 'control.6',
 'sub-chromosomal',
 'tumors7',
 '98.02',
 '1,1-biphenyl',
 'preserved',
 'development1',
 'phosphorylation-mediated',
 '100,101',
 'vala',
 'ctype',
 'rs2230782',
 'argi',
 '3-case',
 'mills',
 'scores14',
 '1111',
 '8079',
 'ubiquitylating',
 'hemming',
 'sptan1',
 '0.47',
 'three-generational',
 'bothersome',
 'p=0.64',
 'pla2',
 'matter',
 'immune-surveillance',
 '0.037',
 '355',
 '9,197',
 'phosphatase-anti-alkaline',
 '3a2',
 'class-switch',
 'n=127',
 'sub-ependymal',
 'chemotherapy-refractory',
 'telecanthus',
 'rps29',
 'n42',
 'immunoassayed',
 'mir-194',
 'methoda',
 'nutator',
 'methylation-free',
 'fluorescence-activated',
 ...}

In [14]:
vocab_not_in_biolab =set(vocab_words) - set(biolab_words)
print(len(vocab_not_in_biolab))
vocab_not_in_biolab


251731
Out[14]:
{'',
 'mscv-nup214-abl1-ires-gfp',
 'limma.18',
 'c3h/1oth/2',
 '0.41.0',
 'stablee',
 'gdc-0879mediated',
 'a62t',
 'tumorsnamely',
 'her2-so/cep17-sg',
 'buffersubcloned',
 'measurementss.d',
 'cbreast',
 'rppametastasis-associated',
 'phenylalanine18',
 'detectableboard',
 'c.17981799gt',
 'ofdoes',
 'olfm4',
 'fkbp12rapamycin',
 'phenotype.lambdoid',
 'fragment17',
 'slc34a2',
 '20gap',
 'suppressorwell',
 'saciisite',
 'reportednrnac.1a',
 'd177y',
 'observeddownloadin',
 'p454s',
 'only.7.four',
 'ganglioglioma.25',
 'glissons',
 'puastderkwt',
 'lamp2-positive',
 'a/gagarose',
 'dahln',
 'obstructiona',
 'etv1suggest',
 'e32g',
 'catga',
 'identity22',
 'mm00455685_m1',
 'cys72asp74',
 'itdalleles',
 'coimmunoprecipitatedwith',
 'andpepstatin',
 'micej',
 '4016m1r219',
 'treatment-relatedareas',
 'smap.23',
 'r70w',
 'pt3n1',
 'genessupplementaryincluding',
 '0.064-2.262',
 '2a763v',
 'observations29',
 'p14arf/dapk/p53',
 'a314vexhibited',
 '321a',
 'severalfragment',
 'cellreaction',
 'referred.haplotype',
 'kemler',
 's114x.11',
 'dpp-8400574',
 '5-gttgaacggtggccacaccggc-3',
 '1501g-a',
 'lymphoid-signaling',
 '2002tf',
 'c1275y',
 'tyr646',
 'ar-v1',
 'p-eif2',
 'p53fl',
 '30345',
 't241m',
 'l288sfish',
 'peripherydownloadin',
 't/ti',
 'l273m',
 'carriers22',
 'materialfig',
 '02097810',
 'andets',
 'visvanathan',
 '29leiomyosarcoma+0.750.31.171.831.44',
 'substrates/oncogenic',
 'pre-b/t-all',
 'p.a148t',
 'i11t',
 'all.rapamycin',
 'tumour-extracted',
 'autogizer',
 'supplemental3h',
 '53/411',
 'azevedo',
 'h50r',
 'mutation.among',
 'insectin',
 'apc-coupled',
 'muci/gram',
 'x87838',
 'homotrimers.options',
 'nsii',
 'teetthvvmktdaefvcertlkyflgiaggkwvvsyfwvtqsikerkmlnehdfevrgdv',
 'min.the',
 'rs743185',
 'fiftyfive',
 'c.625c',
 'g360a',
 'needof',
 'c2h2-type',
 'r369w',
 'asxl1msh6unfortunately',
 '211k24',
 '10f4252pancolonic1.8+apyes',
 'msi+24poly',
 'med12/cyclin',
 'localtest',
 'rate.largedownload',
 'co-defining',
 'johinaz.cgen',
 'l115p',
 '65:5563',
 '4afigure',
 'rv560-561',
 'pit-2',
 'pd03259019',
 'palb2-associated',
 'akt3z',
 'proteinandgrowth',
 '9one',
 'kolodnerterminal',
 'r905',
 'butby',
 'denysdrash',
 'proline-287',
 'solexa-illumina',
 'fgfr3.32,33',
 'irish/german',
 'meanslevels',
 'lxxll-like',
 'pe1e2s1',
 'pone.0064364.e015.jpg1.0',
 "5'-agttccactcttagaggtag-3",
 'fujigaoka',
 'lex1',
 'v261m',
 'y17126',
 'inectmwt-1',
 'bcdk6',
 'gilhuis',
 'd1203',
 'supplementarymutantsassessed',
 'nb1224',
 'element/binding',
 'transduction.5,6',
 'cr-oe33',
 'h1417d',
 'cysts/multinodular',
 '2c/min',
 '5-aatagattctggcattgtggtccccgttttcttatggg-3',
 'h-actin',
 'gimema',
 'd18n',
 'd1s2737',
 '15,000r112c',
 'resper',
 'h701',
 's34f/y',
 'c.6014c',
 'e295k',
 'h2aub1',
 'response32.3',
 'pik3cathat',
 'tg101209.our',
 'v379a',
 'cd4-cell',
 'g4450a',
 'bamhi/aflii',
 'fgf-mapk-pathway',
 'o61267',
 'kumamoto',
 '12.9other',
 'introns43',
 'l2575',
 'y126',
 'c.2221-126c',
 'gtctctcccttgaaatgctgtga',
 'mplw515l-expressing',
 'ezh2h689a,19',
 'animals13',
 'elf-3',
 'v218g',
 '24772/pnf',
 'fhgenomic',
 'tmprss2erg3',
 'xyftqtllpglag',
 'tac272tat',
 'workresistance',
 'nbd-gdp/gtp',
 '9121s',
 'venkataramani',
 'bp21',
 'f451l',
 'epithelial183',
 'nut-positive',
 '212-717-3203',
 'q0q1dt18',
 'q661',
 'gcagctgcccggggccgaca',
 '0.00010.13',
 's1898f',
 'dexseq.29',
 '19.6os',
 'mda231',
 'respectively.sequencing',
 'nonfunctioningmaximizing',
 'routineuse',
 'notch1-responsive',
 'ascom-mll3',
 'therapeutics130',
 'cellcompound-protein',
 'univ-lvon1.fr',
 'processesfor',
 'centrosomes7a',
 'amino-binding',
 'cells125',
 'amx500',
 'sdsr703p',
 'adenocpoor50m40japan',
 'panasonic',
 'i220',
 "5'-gagtgctctaatgactgactgaga-3'/5'-aaaggtgacatggaaagccc-3",
 'foldin',
 '0/134',
 'tissuessupplementaryof',
 'c.1702_1703del',
 'ea64',
 'v37m',
 'normalsupplementalmargaret',
 'lymphocytosissupplementarynormal',
 'eachrepresents',
 'l1546n',
 'f.d.n',
 'a259raf-13b',
 'c141y',
 'eitherslcros',
 'esr1-e380q',
 'www-huber.embl.de/users/anders/htseq',
 'tyr1278',
 'viadetected',
 'ubbe',
 'coitus.56runx1',
 'domain/ras',
 'pdgf-a/c',
 'phospho-erbb2',
 'supplementarysmarca4',
 'ethyleneglycoltetracetic',
 'wascomparable',
 'hsc/clp',
 'tim-craf',
 '24hec',
 'thegene',
 'fdr=0.01',
 'datafig',
 'nowak-wegrzyn',
 '1321g',
 'brc15',
 'mef2btranscript',
 'panitumumabf',
 'aml.15,23,27',
 '0019059',
 'for13',
 '1779insc',
 'r257g',
 'p.leu597val',
 'l766pproteins',
 'hgf-mediated',
 'uhplc-qqtof-ms',
 'msh6.67',
 'frame.29',
 'backx',
 'glu118',
 'pcaf-mediated',
 '596718',
 'www.arup.utah.edu',
 'investigations,36',
 'ands21',
 'inicd1.reverse',
 'paired-primers',
 'h1155',
 'showedremarkable',
 '5-cagtttctgtctgctaggag-3',
 'hal-b2',
 'assaysleft',
 's966q',
 'al-elein',
 'genesunfortunately',
 'nickel-1,2-dioleolyl-sn-glycero-3',
 'mandatory.26',
 'wereaccording',
 'gli1-driven',
 'tyrosinethus',
 'bc-his6lane',
 'garcia-olive',
 'p27t198v',
 'thr75met75',
 'intoprb16',
 'th2-helper',
 'iswi-related',
 'sos1e846k',
 'spoplalso',
 'dicer16',
 'lung29,30',
 'serleu',
 '1-trcn0000074283',
 'proteinsone',
 'psq-containing',
 'anti-erk-2',
 'suppression.it',
 'gst-erk2',
 'nacl',
 'doublet=en',
 'samplewould',
 '503-220-3405',
 'erbb4.23',
 'tyr279cys',
 'studied1',
 'erk1k71r/erk2k54r',
 'irizarry',
 'fiscella',
 'infiltratingresistance',
 't49.8',
 'acidsubstitutions',
 '600.options',
 'hadeggs',
 'whenof',
 '11resulting',
 'cellsmander',
 'methylation4,12',
 'caggtcttgatgtacttccctcgtttgtgcagc',
 'leu-536',
 'ppm1d-mutant',
 't232',
 'v101m',
 '1592delt',
 'atpprotein',
 'examineddiagnosed',
 'resistance3b',
 "5'-gggaccggcttaatccatag-3",
 'biotecnologies',
 'rxrb',
 'assays.72',
 'slidenorthern',
 'sdfl',
 'pixsys',
 'gcn5/pcaf',
 'e6.5-e9.5',
 'fkrkhkkdisqnkravrr',
 'mir-34c-5p',
 'mores3',
 'd024',
 'approximation.20,21',
 'nuclearextracts',
 'cases.7,10',
 'mmmt',
 'poulikakos',
 'tecknica',
 '140481275-140481298',
 'anti-ar3',
 'p70s6k-t389',
 'ccagagtgctctaatgactg',
 'c.ten',
 'york-presbyterian/columbia',
 'alone5',
 'dnindividuals',
 '3376554583moderate',
 'gfp-ezh2y641',
 'kmeindl',
 '657185',
 'anti-phospho-c-kit',
 'knies-bamforth',
 'aneuploidy.micrographs',
 'cancers4547',
 'sox11-negative',
 'hsc7e116',
 'inhibitor7figure',
 '2003-00328',
 'gfoldwt',
 '10wtwtwtwtwtwtwtwtwtwt1',
 'translational/targeted',
 '3q4u',
 'betap2loop',
 'geneperhaps',
 'coot.embl.de',
 'homologywith',
 'cyclearrested',
 'skin.58',
 'delinsl',
 'q1537rq1537r',
 'peprotec',
 '302131',
 'thusthere',
 'micecarrying',
 'speciwc',
 '430k10',
 'f57l',
 'b-rafq257rsupplementarys2',
 'pdgfrp',
 'prmd1',
 'targets13',
 'reflections14,83521,181',
 'myc-reconstituted',
 'polymorphism.in',
 'ibgc4.7,13,14',
 'bar=100m',
 'sspi',
 'clegg',
 'inknockdown',
 'inactivation,16',
 'set1a/b',
 '4division',
 'v1m',
 'datayes',
 'tctgcagcagcaggcaga',
 'l276praf',
 'mutras',
 'lats2-expressing',
 'studyf',
 'dccd',
 'flj126847.7col8a1acollagen',
 'pvhl-defective',
 'lys569',
 'm243-f1695',
 'that2',
 'homology2',
 '78260810',
 'sh_1',
 'zc3h12b',
 'songet',
 'g1157s',
 'medium10',
 'k433r',
 '28+ndnd',
 '4087197',
 'missense427.5nonsense11820.9frameshift',
 '1207/1605',
 'cellsac',
 '5cctcctaccttggcattaca3',
 'dosagef',
 'debrauwere',
 'betweendifferent',
 '20092013',
 '9p272s111310',
 'wild-typeqi',
 'pfdn6',
 '121anormalgacaspgacatcnegneg',
 'receptorlung',
 'ic50sdeletions',
 '986995',
 'appbp1-uba3',
 'r482c',
 'g272d',
 'g779s',
 'ezrint567dwith',
 'familyin',
 'heterodimer.a',
 'log-calculation',
 '18.2g',
 'c27a',
 'pastorfide',
 'vaco5',
 'shnf1e',
 'checkpointlines',
 'nhr14',
 '4515mbtnamissenseex8c.856c',
 'sda-containing',
 'melanoma38.4',
 'hla-c*04:09n',
 'reverse5-tcagtccataagccaagctctca-3',
 'c.1501g4a',
 'tnfsf11/rankl',
 'a226tfigure',
 'e640',
 '89991',
 '13871784',
 'gelsi',
 'rasbraf',
 'a218v',
 'g1202r.15alk',
 'ac-ii',
 'patientp22',
 'sirnas2b',
 'hafner',
 'l858rand',
 'c.s.hill',
 'r196l',
 'leukemias.40',
 'leukemogeneic',
 'separateof',
 'mutations24',
 'whalin',
 'trkai/ii',
 'e478k',
 'rb-/-/p107',
 'slidewe',
 'samplescommon',
 'phospho-kit',
 'ckitwild-type',
 '3.1-pdgfr',
 'cytoplasmic.the',
 'asn117ser',
 'nonpolyposispackage',
 'shecases',
 'lincscloud',
 'daystop',
 'p1087r',
 'tcccctgttgattccctaga',
 '11f41',
 'pittsburgh.sequence',
 'gfr/pkc',
 'c797s',
 'torc1',
 'ct60',
 'genesdna',
 't155i',
 'informationapart',
 'ttaggatgagcctctcctagactt',
 'previously.4,17,22',
 'indmem',
 '92.5101.5',
 'she78-7',
 'tia1',
 'make.cells',
 '23995711',
 'ds-55004',
 'l430p',
 '426-521',
 'method74',
 'angio100',
 'non-bat-rii',
 '368-5698',
 'armigate',
 'hpvprominently',
 '3.50+0.20',
 'atcontent',
 'fc2s',
 'agentscolony',
 'supp.s1',
 'genomicer',
 'cmv-vp16-tfap2a',
 '241-269',
 'lys105',
 'caagtattggtctctcgtctttcagctggataaggtctggtttaatgc',
 '0.27.6',
 'physicallyeither',
 'flag-traf6',
 'nct00312377',
 'ahcyl1',
 'fgfr1,17',
 'www.broadinstitute.org/cancer/software/genepattern',
 'populations180',
 'srp033306',
 'n1380',
 'wererequired',
 'bindingcrystal',
 'p087',
 '256kb',
 'accagcca-ccactttctgatagg',
 'catcccatggtggc*gggatggttgcagaag',
 'issubset',
 'certam',
 'd446v',
 'syndromecausative',
 'y339',
 'pone.0064364.e006.jpg',
 'functioning.the',
 'ligandlane',
 'tyr791phe',
 'c896',
 'fgf8=1.79769e+308',
 'ikk3',
 'f99s',
 'tasimilar',
 '2e758ga1l798f/ha4a864q',
 'nucleotide.asxl1',
 'y537s3045950.0110.0001',
 'nes=2.05',
 '17q22-2514daint-213q32-33',
 'fluor-555',
 'flankswere',
 'tgs-6',
 'p73mutant',
 'yasuji',
 'p.asn127del',
 '2122-nt',
 'cimp16',
 'tamra-ins',
 'fibroblasts5',
 'serum5figure',
 'studiesrnai',
 'chromosome17q11.2',
 'proliferationa',
 'pci-neo-baf250',
 't-e-y',
 'd609g',
 'lagerstedt',
 'aptag-1',
 'gcmn',
 'tttggaagctctcagggtac',
 'syndromes15,21',
 'sophie',
 'ciovacco',
 'immunoblottingand',
 'injeclion',
 'c.2149g',
 'pa2g4p4',
 'dach1',
 'sud-luxembourg',
 'd85n',
 'e600w',
 'krasthe',
 '1mq4c',
 'h193qp53',
 'peg3',
 'a40v',
 'theerror-containing',
 'g112e',
 'messiaen',
 '586del',
 'repeats5,6',
 'mekwith',
 'r748g',
 'mpl.34-37',
 "c'l",
 'nopho',
 'bosmuller',
 'devol',
 'rs66944506',
 '4494503',
 'arer207w',
 'cellsdataset',
 'fhl1-induced',
 'wolf-hirschhorn',
 'diderot',
 'msh2-vd862msh6p',
 'p3xflag-cmv-wild-type-chk2',
 '3.24+0.18',
 'small.1',
 'c.856g',
 'masp2',
 'shinmura',
 'improm-ii',
 'hs00368175_m1',
 '2supplementary1',
 'ewsr1-2',
 'd86n',
 'errfi14a',
 'egflane',
 'f224lcontain',
 'y859',
 'smai/bglii',
 "5'-atcatgtttgagaccttcaa-3",
 'e1a-binding',
 'this25',
 '106the',
 't/p.p214l',
 '0.20.3',
 'supplementary11a',
 'randerath',
 'antigal4',
 'chek1',
 'differenceq276p',
 'schwaller',
 'g20a',
 'htlv-iinfected',
 'together.13,14,36',
 'kms-9',
 'pegfp-flag-pdz',
 'antibody15,34,39',
 't0.31',
 'amc.uva.nl',
 'c304',
 'proteinsupplementaryno',
 '5001,600',
 'arid1b-associated',
 'receptor-smads',
 '5-ctggaagcaaagacggacaa-3',
 'pznctj2-q205l',
 'h1881',
 '7235g',
 'siles',
 'balkwill',
 'mycmaxmxd',
 'p2lv-h-rasand',
 'theevent',
 'il-1-il-1r',
 'etoh/hanks',
 'f7425',
 'inhibitionsupplement',
 '5nmis',
 '10.1007/s10147-013-0602-1',
 'et163950',
 'future-the',
 'erlotinib.81',
 'non-transactivating',
 'flt3mf5',
 'comparedunfortunately',
 'nucleotidesthe',
 'p.q684x',
 '31544',
 'resultsliposarcomas',
 'adenocarcinomas.28,44,47',
 'mutant-egfr',
 'pgex-ecorv-sac1',
 'specificallyt790m',
 'lmpl',
 'cisa',
 'yeast,3',
 'k708',
 'competitorin',
 '0.500.83',
 'donnem',
 'l858r-substituted',
 'paip1s-luc',
 'c.425',
 'bclanes',
 'immunoblottingfor',
 'm-v5',
 'f37',
 't4m0',
 'smad4-reconstituted',
 'ct-a/cyt-1',
 'pladienolide,64',
 'fgfr2-cit',
 'stag2l360w',
 'polymicrogyria-postaxial',
 'methodsr300h',
 'glfg-2',
 'flag-elf3n233',
 'anti-gata3',
 'pml/rar-expressing',
 'n=48supplementary',
 'catagg',
 'ssc/0.3',
 'variation_29880',
 'ew-sm1-989',
 'restrictionpattern',
 'function5677insatruncated',
 'dmempten',
 'condensedf',
 '2683-2711',
 'pe-fluorescence',
 'pathogenica',
 '3.285614467',
 'w406r',
 'ampmdm2',
 'srf-staining',
 'zfn639',
 'y6a',
 'c-16',
 'c3h/10t/2',
 'y-meso-27',
 'fc=1.46',
 'g482v',
 'transformationrelevant',
 'p332q',
 'lys3326tersdhb',
 '4337123',
 'p53-/-/p210',
 'bjornsti',
 'allele34,35',
 'theiii',
 'igf-i-induced',
 '2.1289126',
 'e40t',
 'bc012846.1',
 'cellssupplementalimportantly',
 'vigers',
 'anassays',
 'brim-311',
 'rp11-295i5',
 'ttce2',
 'idh2,15',
 'detectedlanes',
 'bcr-ablt315itransformed',
 'il-9r/c',
 'esr-y537n',
 'melanomasl',
 'g418-positive',
 'primersthat',
 'wee1hu',
 'hrp-coupled',
 'wild-type-likemutant',
 '4/490',
 'leu298',
 'collagen-5a1',
 'p.q9x',
 'hetzer',
 'gradeprognostic',
 'wm278-gfp',
 'spen-specific',
 'contexts.conclusionin',
 'e02641959224774121743269288',
 'tmm18',
 'g662eschematic',
 'h585d',
 'gagtcatcaattttattctgactgatcc',
 'myc5a',
 '37yrs',
 'pro261-induced',
 'ng/lin',
 'post-enucleated',
 's1py-bound',
 'v281i',
 'disease0',
 'bartletts',
 'cellssupplementaryoverall',
 'modifications.9,10',
 '5-caaga-acagcaacgagtaccg-3',
 'ostium-primum',
 'female3823',
 'pms2-interactive',
 'aacggtaccaaggctgagaa',
 'texasred-conjugated',
 'lskeffect',
 'mfinal',
 'spop-mediated',
 'h-galactosidase',
 '70.169.2',
 'esmedical',
 'budssuch',
 'reorganizedin',
 'r246w',
 '32p-4e-bp1',
 "5'-aatgcccat",
 'anti-phospho-c-met',
 '68/f',
 'trastuzumab.25',
 'snu-886',
 'cplcknditkrslqestrfsqlveellkii',
 'htertnot',
 '1.282.17',
 'satoko',
 '5-aaggtgttgcaatccccagc-3',
 'matrix.41',
 'mutationssupplementarythese',
 'gccagcattttagcattacttc',
 'a864t',
 'indouble',
 't49.2/t49.3',
 '9206s',
 'promoter-ttadriven',
 's1/kh',
 'interaction17',
 'unknown/na14.813.223.8',
 'p110-d964a',
 'jak3-stat3',
 'extendmr',
 'y-733',
 'pbabe-zeo-nrasg12d',
 'mek1c121s-expressing',
 'anti-pdgfc',
 'asp-220',
 'g1567d',
 '211403',
 'polyacryiamide',
 'p.val600_lys601',
 'jim3',
 'statethe',
 'staphylococcusbetween',
 "5'-attacacagtatcctcgaca-3",
 'ha-11',
 'u3-1287',
 'di7s2sorepeating',
 'hadshould',
 'y1003x',
 'sd/agar',
 'differentiation,3',
 'staalesen',
 '70800',
 'activity.23,24',
 'againstwap-cre',
 'g162rnormalnormaldeficientdeleterious0.00pathogenic',
 'gly719x',
 'mds19',
 'd464g',
 'mode.14',
 'y88c',
 'asp770_asn771insmetalathrpro',
 'oxiod',
 '50target',
 'q1496h',
 'c135r',
 'crizotinibc',
 'sequencing.10',
 'rotterdam.26',
 'defectao105yes',
 't198a/g',
 '14.981.7',
 'locatedleading',
 'sex.1984',
 'capacityrestoring',
 '20-mmol/l',
 'butdownloadin',
 'makita',
 'syndromeassociated',
 'newman-keuls',
 '5-gatggtgggggccctcctctt-3',
 'mutagenesismutants',
 'analysisimmunohistochemical',
 'rs3f',
 'that.altogether',
 '030/50',
 'conformationgrowthcodon',
 'gln184stop',
 'arrest.the',
 '3.40+0.12',
 'a328p6ntp91l',
 'status22',
 'sample.sanger',
 'pocket4a',
 's71w',
 'sdcbp2',
 'm114',
 'phospho-ros1',
 'nm_007817',
 'ntgtg',
 'lossoffunction',
 'eachaffected',
 '71f/57stomach453spvery',
 'anti-cd4',
 '144k',
 'scalia',
 'microcystein-lr',
 'repairatpase',
 'me6',
 'erk.of',
 'tfsearch',
 'xeds/eels',
 'her2+/pi3k-mutant',
 's6o',
 'p5-specic',
 'l65f',
 'antibody.of',
 'a3062g',
 'malignancy13',
 'locusb',
 'hkr1',
 'supplemental5a-b',
 '2chighwire_math',
 'sc1747',
 'turnhout',
 'nrap',
 'wasthree',
 '413641513',
 'metnm_000245c2646tp814sgermline1/0',
 'binitial',
 'tgcggacagg',
 'carcinoma987512.512.5040',
 'lsab+system-hrp',
 'ret-ntrk1',
 'nordenstadt',
 'molecularn114s',
 'rotolib2.aa',
 'backgroundcould',
 'n105kthe',
 'y110c',
 '395one',
 'complexes129',
 'sulfateinduced',
 'shrna-smad2',
 'c-dx',
 'trii/alk-x',
 'sa-b',
 'h920',
 'referencingca',
 'ctm-5990506',
 'er+breast',
 '33-kda',
 'alkf1178l',
 '5tgcaaggtggagcgattctg',
 'withclosure',
 'substitutionb',
 't-lymphotropic',
 'pa30824101none0',
 'lm3d-induced',
 'vomiting30',
 't479pvitro',
 '5-acacgtccccatctgaag-3',
 'a279vprotein',
 'br67',
 'decribed.8',
 'complementarygene',
 'hoxd8',
 '5-uauauuuauauauuagacgdgdg-3',
 'skp1or',
 'imatinib.the',
 ...}

dont need word to id dict since this is indexed with words

using biolab words for missing corpus words


In [17]:
undesirable_ascii_characters = list(range(32))
undesirable_ascii_characters.remove(10) #keep new line since this might be used for sentence tokenizer
undesirable_charmap = dict.fromkeys(undesirable_ascii_characters)

In [20]:
from nltk import word_tokenize
from utils import custom_word_tokenizer, apply_custom_regx

custom_tokenized_biolab_pubmed_pmc_wiki_wv = {}
for word in vocab_biolab:
    vector = biolab_keyed_vectors_pubmed_pmc_wiki.word_vec(word)
    custom_tokenized_biolab_pubmed_pmc_wiki_wv[word.lower()] = vector
    word = word.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore')
    word = str(word).translate(undesirable_charmap)
    word = apply_custom_regx(word)
    word = word.replace('\\t', '')
    for part in word_tokenize(word):
        if part in custom_tokenized_biolab_pubmed_pmc_wiki_wv:
            custom_tokenized_biolab_pubmed_pmc_wiki_wv[part] += vector
            custom_tokenized_biolab_pubmed_pmc_wiki_wv[part] /= 2

In [21]:
len(custom_tokenized_biolab_pubmed_pmc_wiki_wv)


Out[21]:
100489

for tensorboard


In [27]:
tb_vocab_size=5000

In [38]:
tb_vocab_biolab = list(vocab_biolab)[:tb_vocab_size]
with open("view_wvs_tb/tb_vocab.tsv", "w") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(tb_vocab_biolab)

tb_word_vectors = np.random.randn(tb_vocab_size, 200)
for i,word in enumerate(tb_vocab_biolab):
    tb_word_vectors[i] = custom_tokenized_biolab_pubmed_pmc_wiki_wv[word]

In [40]:
%autoreload
from utils import visualize_embeddings_in_tensorboard
visualize_this_embedding = tb_word_vectors
print(visualize_this_embedding.shape)
metadata_path = "/home/bicepjai/Projects/dsotc/data_prep/view_wvs_tb/tb_vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, "/home/bicepjai/Projects/dsotc/data_prep/view_wvs_tb")


(5000, 200)

In [35]:
del tb_word_vectors

building word vectors of 200d for model


In [22]:
corpus_word_vectors = np.random.randn(len(vocab_words), 200)
corpus_word_vectors.shape


Out[22]:
(352220, 200)

fill in biolab vectors available


In [23]:
for word in vocab_biolab:
    dataset_corpus_word_index = vocab_wordidx[word]
    corpus_word_vectors[dataset_corpus_word_index] = custom_tokenized_biolab_pubmed_pmc_wiki_wv[word]

total words not updated with training from biolab


In [24]:
words_not_updated = set(vocab_words) - vocab_biolab
len(words_not_updated)


Out[24]:
251731

In [25]:
words_not_updated


Out[25]:
{'',
 'mscv-nup214-abl1-ires-gfp',
 'limma.18',
 'c3h/1oth/2',
 '0.41.0',
 'stablee',
 'gdc-0879mediated',
 'a62t',
 'tumorsnamely',
 'her2-so/cep17-sg',
 'buffersubcloned',
 'measurementss.d',
 'cbreast',
 'rppametastasis-associated',
 'phenylalanine18',
 'detectableboard',
 'c.17981799gt',
 'ofdoes',
 'olfm4',
 'fkbp12rapamycin',
 'phenotype.lambdoid',
 'fragment17',
 'slc34a2',
 '20gap',
 'suppressorwell',
 'saciisite',
 'reportednrnac.1a',
 'd177y',
 'observeddownloadin',
 'p454s',
 'only.7.four',
 'ganglioglioma.25',
 'glissons',
 'puastderkwt',
 'lamp2-positive',
 'a/gagarose',
 'dahln',
 'obstructiona',
 'etv1suggest',
 'e32g',
 'catga',
 'identity22',
 'mm00455685_m1',
 'cys72asp74',
 'itdalleles',
 'coimmunoprecipitatedwith',
 'andpepstatin',
 'micej',
 '4016m1r219',
 'treatment-relatedareas',
 'smap.23',
 'r70w',
 'pt3n1',
 'genessupplementaryincluding',
 '0.064-2.262',
 '2a763v',
 'observations29',
 'p14arf/dapk/p53',
 'a314vexhibited',
 '321a',
 'severalfragment',
 'cellreaction',
 'referred.haplotype',
 'kemler',
 's114x.11',
 'dpp-8400574',
 '5-gttgaacggtggccacaccggc-3',
 '1501g-a',
 'lymphoid-signaling',
 '2002tf',
 'c1275y',
 'tyr646',
 'ar-v1',
 'p-eif2',
 'p53fl',
 '30345',
 't241m',
 'l288sfish',
 'peripherydownloadin',
 't/ti',
 'l273m',
 'carriers22',
 'materialfig',
 '02097810',
 'andets',
 'visvanathan',
 '29leiomyosarcoma+0.750.31.171.831.44',
 'substrates/oncogenic',
 'pre-b/t-all',
 'p.a148t',
 'i11t',
 'all.rapamycin',
 'tumour-extracted',
 'autogizer',
 'supplemental3h',
 '53/411',
 'azevedo',
 'h50r',
 'mutation.among',
 'insectin',
 'apc-coupled',
 'muci/gram',
 'x87838',
 'homotrimers.options',
 'nsii',
 'teetthvvmktdaefvcertlkyflgiaggkwvvsyfwvtqsikerkmlnehdfevrgdv',
 'min.the',
 'rs743185',
 'fiftyfive',
 'c.625c',
 'g360a',
 'needof',
 'c2h2-type',
 'r369w',
 'asxl1msh6unfortunately',
 '211k24',
 '10f4252pancolonic1.8+apyes',
 'msi+24poly',
 'med12/cyclin',
 'localtest',
 'rate.largedownload',
 'co-defining',
 'johinaz.cgen',
 'l115p',
 '65:5563',
 '4afigure',
 'rv560-561',
 'pit-2',
 'pd03259019',
 'palb2-associated',
 'akt3z',
 'proteinandgrowth',
 '9one',
 'kolodnerterminal',
 'r905',
 'butby',
 'denysdrash',
 'proline-287',
 'solexa-illumina',
 'fgfr3.32,33',
 'irish/german',
 'meanslevels',
 'lxxll-like',
 'pe1e2s1',
 'pone.0064364.e015.jpg1.0',
 "5'-agttccactcttagaggtag-3",
 'fujigaoka',
 'lex1',
 'v261m',
 'y17126',
 'inectmwt-1',
 'bcdk6',
 'gilhuis',
 'd1203',
 'supplementarymutantsassessed',
 'nb1224',
 'element/binding',
 'transduction.5,6',
 'cr-oe33',
 'h1417d',
 'cysts/multinodular',
 '2c/min',
 '5-aatagattctggcattgtggtccccgttttcttatggg-3',
 'h-actin',
 'gimema',
 'd18n',
 'd1s2737',
 '15,000r112c',
 'resper',
 'h701',
 's34f/y',
 'c.6014c',
 'e295k',
 'h2aub1',
 'response32.3',
 'pik3cathat',
 'tg101209.our',
 'v379a',
 'cd4-cell',
 'g4450a',
 'bamhi/aflii',
 'fgf-mapk-pathway',
 'o61267',
 'kumamoto',
 '12.9other',
 'introns43',
 'l2575',
 'y126',
 'c.2221-126c',
 'gtctctcccttgaaatgctgtga',
 'mplw515l-expressing',
 'ezh2h689a,19',
 'animals13',
 'elf-3',
 'v218g',
 '24772/pnf',
 'fhgenomic',
 'tmprss2erg3',
 'xyftqtllpglag',
 'tac272tat',
 'workresistance',
 'nbd-gdp/gtp',
 '9121s',
 'venkataramani',
 'bp21',
 'f451l',
 'epithelial183',
 'nut-positive',
 '212-717-3203',
 'q0q1dt18',
 'q661',
 'gcagctgcccggggccgaca',
 '0.00010.13',
 's1898f',
 'dexseq.29',
 '19.6os',
 'mda231',
 'respectively.sequencing',
 'nonfunctioningmaximizing',
 'routineuse',
 'notch1-responsive',
 'ascom-mll3',
 'therapeutics130',
 'cellcompound-protein',
 'univ-lvon1.fr',
 'processesfor',
 'centrosomes7a',
 'amino-binding',
 'cells125',
 'amx500',
 'sdsr703p',
 'adenocpoor50m40japan',
 'panasonic',
 'i220',
 "5'-gagtgctctaatgactgactgaga-3'/5'-aaaggtgacatggaaagccc-3",
 'foldin',
 '0/134',
 'tissuessupplementaryof',
 'c.1702_1703del',
 'ea64',
 'v37m',
 'normalsupplementalmargaret',
 'lymphocytosissupplementarynormal',
 'eachrepresents',
 'l1546n',
 'f.d.n',
 'a259raf-13b',
 'c141y',
 'eitherslcros',
 'esr1-e380q',
 'www-huber.embl.de/users/anders/htseq',
 'tyr1278',
 'viadetected',
 'ubbe',
 'coitus.56runx1',
 'domain/ras',
 'pdgf-a/c',
 'phospho-erbb2',
 'supplementarysmarca4',
 'ethyleneglycoltetracetic',
 'wascomparable',
 'hsc/clp',
 'tim-craf',
 '24hec',
 'thegene',
 'fdr=0.01',
 'datafig',
 'nowak-wegrzyn',
 '1321g',
 'brc15',
 'mef2btranscript',
 'panitumumabf',
 'aml.15,23,27',
 '0019059',
 'for13',
 '1779insc',
 'r257g',
 'p.leu597val',
 'l766pproteins',
 'hgf-mediated',
 'uhplc-qqtof-ms',
 'msh6.67',
 'frame.29',
 'backx',
 'glu118',
 'pcaf-mediated',
 '596718',
 'www.arup.utah.edu',
 'investigations,36',
 'ands21',
 'inicd1.reverse',
 'paired-primers',
 'h1155',
 'showedremarkable',
 '5-cagtttctgtctgctaggag-3',
 'hal-b2',
 'assaysleft',
 's966q',
 'al-elein',
 'genesunfortunately',
 'nickel-1,2-dioleolyl-sn-glycero-3',
 'mandatory.26',
 'wereaccording',
 'gli1-driven',
 'tyrosinethus',
 'bc-his6lane',
 'garcia-olive',
 'p27t198v',
 'thr75met75',
 'intoprb16',
 'th2-helper',
 'iswi-related',
 'sos1e846k',
 'spoplalso',
 'dicer16',
 'lung29,30',
 'serleu',
 '1-trcn0000074283',
 'proteinsone',
 'psq-containing',
 'anti-erk-2',
 'suppression.it',
 'gst-erk2',
 'nacl',
 'doublet=en',
 'samplewould',
 '503-220-3405',
 'erbb4.23',
 'tyr279cys',
 'studied1',
 'erk1k71r/erk2k54r',
 'irizarry',
 'fiscella',
 'infiltratingresistance',
 't49.8',
 'acidsubstitutions',
 '600.options',
 'hadeggs',
 'whenof',
 '11resulting',
 'cellsmander',
 'methylation4,12',
 'caggtcttgatgtacttccctcgtttgtgcagc',
 'leu-536',
 'ppm1d-mutant',
 't232',
 'v101m',
 '1592delt',
 'atpprotein',
 'examineddiagnosed',
 'resistance3b',
 "5'-gggaccggcttaatccatag-3",
 'biotecnologies',
 'rxrb',
 'assays.72',
 'slidenorthern',
 'sdfl',
 'pixsys',
 'gcn5/pcaf',
 'e6.5-e9.5',
 'fkrkhkkdisqnkravrr',
 'mir-34c-5p',
 'mores3',
 'd024',
 'approximation.20,21',
 'nuclearextracts',
 'cases.7,10',
 'mmmt',
 'poulikakos',
 'tecknica',
 '140481275-140481298',
 'anti-ar3',
 'p70s6k-t389',
 'ccagagtgctctaatgactg',
 'c.ten',
 'york-presbyterian/columbia',
 'alone5',
 'dnindividuals',
 '3376554583moderate',
 'gfp-ezh2y641',
 'kmeindl',
 '657185',
 'anti-phospho-c-kit',
 'knies-bamforth',
 'aneuploidy.micrographs',
 'cancers4547',
 'sox11-negative',
 'hsc7e116',
 'inhibitor7figure',
 '2003-00328',
 'gfoldwt',
 '10wtwtwtwtwtwtwtwtwtwt1',
 'translational/targeted',
 '3q4u',
 'betap2loop',
 'geneperhaps',
 'coot.embl.de',
 'homologywith',
 'cyclearrested',
 'skin.58',
 'delinsl',
 'q1537rq1537r',
 'peprotec',
 '302131',
 'thusthere',
 'micecarrying',
 'speciwc',
 '430k10',
 'f57l',
 'b-rafq257rsupplementarys2',
 'pdgfrp',
 'prmd1',
 'targets13',
 'reflections14,83521,181',
 'myc-reconstituted',
 'polymorphism.in',
 'ibgc4.7,13,14',
 'bar=100m',
 'sspi',
 'clegg',
 'inknockdown',
 'inactivation,16',
 'set1a/b',
 '4division',
 'v1m',
 'datayes',
 'tctgcagcagcaggcaga',
 'l276praf',
 'mutras',
 'lats2-expressing',
 'studyf',
 'dccd',
 'flj126847.7col8a1acollagen',
 'pvhl-defective',
 'lys569',
 'm243-f1695',
 'that2',
 'homology2',
 '78260810',
 'sh_1',
 'zc3h12b',
 'songet',
 'g1157s',
 'medium10',
 'k433r',
 '28+ndnd',
 '4087197',
 'missense427.5nonsense11820.9frameshift',
 '1207/1605',
 'cellsac',
 '5cctcctaccttggcattaca3',
 'dosagef',
 'debrauwere',
 'betweendifferent',
 '20092013',
 '9p272s111310',
 'wild-typeqi',
 'pfdn6',
 '121anormalgacaspgacatcnegneg',
 'receptorlung',
 'ic50sdeletions',
 '986995',
 'appbp1-uba3',
 'r482c',
 'g272d',
 'g779s',
 'ezrint567dwith',
 'familyin',
 'heterodimer.a',
 'log-calculation',
 '18.2g',
 'c27a',
 'pastorfide',
 'vaco5',
 'shnf1e',
 'checkpointlines',
 'nhr14',
 '4515mbtnamissenseex8c.856c',
 'sda-containing',
 'melanoma38.4',
 'hla-c*04:09n',
 'reverse5-tcagtccataagccaagctctca-3',
 'c.1501g4a',
 'tnfsf11/rankl',
 'a226tfigure',
 'e640',
 '89991',
 '13871784',
 'gelsi',
 'rasbraf',
 'a218v',
 'g1202r.15alk',
 'ac-ii',
 'patientp22',
 'sirnas2b',
 'hafner',
 'l858rand',
 'c.s.hill',
 'r196l',
 'leukemias.40',
 'leukemogeneic',
 'separateof',
 'mutations24',
 'whalin',
 'trkai/ii',
 'e478k',
 'rb-/-/p107',
 'slidewe',
 'samplescommon',
 'phospho-kit',
 'ckitwild-type',
 '3.1-pdgfr',
 'cytoplasmic.the',
 'asn117ser',
 'nonpolyposispackage',
 'shecases',
 'lincscloud',
 'daystop',
 'p1087r',
 'tcccctgttgattccctaga',
 '11f41',
 'pittsburgh.sequence',
 'gfr/pkc',
 'c797s',
 'torc1',
 'ct60',
 'genesdna',
 't155i',
 'informationapart',
 'ttaggatgagcctctcctagactt',
 'previously.4,17,22',
 'indmem',
 '92.5101.5',
 'she78-7',
 'tia1',
 'make.cells',
 '23995711',
 'ds-55004',
 'l430p',
 '426-521',
 'method74',
 'angio100',
 'non-bat-rii',
 '368-5698',
 'armigate',
 'hpvprominently',
 '3.50+0.20',
 'atcontent',
 'fc2s',
 'agentscolony',
 'supp.s1',
 'genomicer',
 'cmv-vp16-tfap2a',
 '241-269',
 'lys105',
 'caagtattggtctctcgtctttcagctggataaggtctggtttaatgc',
 '0.27.6',
 'physicallyeither',
 'flag-traf6',
 'nct00312377',
 'ahcyl1',
 'fgfr1,17',
 'www.broadinstitute.org/cancer/software/genepattern',
 'populations180',
 'srp033306',
 'n1380',
 'wererequired',
 'bindingcrystal',
 'p087',
 '256kb',
 'accagcca-ccactttctgatagg',
 'catcccatggtggc*gggatggttgcagaag',
 'issubset',
 'certam',
 'd446v',
 'syndromecausative',
 'y339',
 'pone.0064364.e006.jpg',
 'functioning.the',
 'ligandlane',
 'tyr791phe',
 'c896',
 'fgf8=1.79769e+308',
 'ikk3',
 'f99s',
 'tasimilar',
 '2e758ga1l798f/ha4a864q',
 'nucleotide.asxl1',
 'y537s3045950.0110.0001',
 'nes=2.05',
 '17q22-2514daint-213q32-33',
 'fluor-555',
 'flankswere',
 'tgs-6',
 'p73mutant',
 'yasuji',
 'p.asn127del',
 '2122-nt',
 'cimp16',
 'tamra-ins',
 'fibroblasts5',
 'serum5figure',
 'studiesrnai',
 'chromosome17q11.2',
 'proliferationa',
 'pci-neo-baf250',
 't-e-y',
 'd609g',
 'lagerstedt',
 'aptag-1',
 'gcmn',
 'tttggaagctctcagggtac',
 'syndromes15,21',
 'sophie',
 'ciovacco',
 'immunoblottingand',
 'injeclion',
 'c.2149g',
 'pa2g4p4',
 'dach1',
 'sud-luxembourg',
 'd85n',
 'e600w',
 'krasthe',
 '1mq4c',
 'h193qp53',
 'peg3',
 'a40v',
 'theerror-containing',
 'g112e',
 'messiaen',
 '586del',
 'repeats5,6',
 'mekwith',
 'r748g',
 'mpl.34-37',
 "c'l",
 'nopho',
 'bosmuller',
 'devol',
 'rs66944506',
 '4494503',
 'arer207w',
 'cellsdataset',
 'fhl1-induced',
 'wolf-hirschhorn',
 'diderot',
 'msh2-vd862msh6p',
 'p3xflag-cmv-wild-type-chk2',
 '3.24+0.18',
 'small.1',
 'c.856g',
 'masp2',
 'shinmura',
 'improm-ii',
 'hs00368175_m1',
 '2supplementary1',
 'ewsr1-2',
 'd86n',
 'errfi14a',
 'egflane',
 'f224lcontain',
 'y859',
 'smai/bglii',
 "5'-atcatgtttgagaccttcaa-3",
 'e1a-binding',
 'this25',
 '106the',
 't/p.p214l',
 '0.20.3',
 'supplementary11a',
 'randerath',
 'antigal4',
 'chek1',
 'differenceq276p',
 'schwaller',
 'g20a',
 'htlv-iinfected',
 'together.13,14,36',
 'kms-9',
 'pegfp-flag-pdz',
 'antibody15,34,39',
 't0.31',
 'amc.uva.nl',
 'c304',
 'proteinsupplementaryno',
 '5001,600',
 'arid1b-associated',
 'receptor-smads',
 '5-ctggaagcaaagacggacaa-3',
 'pznctj2-q205l',
 'h1881',
 '7235g',
 'siles',
 'balkwill',
 'mycmaxmxd',
 'p2lv-h-rasand',
 'theevent',
 'il-1-il-1r',
 'etoh/hanks',
 'f7425',
 'inhibitionsupplement',
 '5nmis',
 '10.1007/s10147-013-0602-1',
 'et163950',
 'future-the',
 'erlotinib.81',
 'non-transactivating',
 'flt3mf5',
 'comparedunfortunately',
 'nucleotidesthe',
 'p.q684x',
 '31544',
 'resultsliposarcomas',
 'adenocarcinomas.28,44,47',
 'mutant-egfr',
 'pgex-ecorv-sac1',
 'specificallyt790m',
 'lmpl',
 'cisa',
 'yeast,3',
 'k708',
 'competitorin',
 '0.500.83',
 'donnem',
 'l858r-substituted',
 'paip1s-luc',
 'c.425',
 'bclanes',
 'immunoblottingfor',
 'm-v5',
 'f37',
 't4m0',
 'smad4-reconstituted',
 'ct-a/cyt-1',
 'pladienolide,64',
 'fgfr2-cit',
 'stag2l360w',
 'polymicrogyria-postaxial',
 'methodsr300h',
 'glfg-2',
 'flag-elf3n233',
 'anti-gata3',
 'pml/rar-expressing',
 'n=48supplementary',
 'catagg',
 'ssc/0.3',
 'variation_29880',
 'ew-sm1-989',
 'restrictionpattern',
 'function5677insatruncated',
 'dmempten',
 'condensedf',
 '2683-2711',
 'pe-fluorescence',
 'pathogenica',
 '3.285614467',
 'w406r',
 'ampmdm2',
 'srf-staining',
 'zfn639',
 'y6a',
 'c-16',
 'c3h/10t/2',
 'y-meso-27',
 'fc=1.46',
 'g482v',
 'transformationrelevant',
 'p332q',
 'lys3326tersdhb',
 '4337123',
 'p53-/-/p210',
 'bjornsti',
 'allele34,35',
 'theiii',
 'igf-i-induced',
 '2.1289126',
 'e40t',
 'bc012846.1',
 'cellssupplementalimportantly',
 'vigers',
 'anassays',
 'brim-311',
 'rp11-295i5',
 'ttce2',
 'idh2,15',
 'detectedlanes',
 'bcr-ablt315itransformed',
 'il-9r/c',
 'esr-y537n',
 'melanomasl',
 'g418-positive',
 'primersthat',
 'wee1hu',
 'hrp-coupled',
 'wild-type-likemutant',
 '4/490',
 'leu298',
 'collagen-5a1',
 'p.q9x',
 'hetzer',
 'gradeprognostic',
 'wm278-gfp',
 'spen-specific',
 'contexts.conclusionin',
 'e02641959224774121743269288',
 'tmm18',
 'g662eschematic',
 'h585d',
 'gagtcatcaattttattctgactgatcc',
 'myc5a',
 '37yrs',
 'pro261-induced',
 'ng/lin',
 'post-enucleated',
 's1py-bound',
 'v281i',
 'disease0',
 'bartletts',
 'cellssupplementaryoverall',
 'modifications.9,10',
 '5-caaga-acagcaacgagtaccg-3',
 'ostium-primum',
 'female3823',
 'pms2-interactive',
 'aacggtaccaaggctgagaa',
 'texasred-conjugated',
 'lskeffect',
 'mfinal',
 'spop-mediated',
 'h-galactosidase',
 '70.169.2',
 'esmedical',
 'budssuch',
 'reorganizedin',
 'r246w',
 '32p-4e-bp1',
 "5'-aatgcccat",
 'anti-phospho-c-met',
 '68/f',
 'trastuzumab.25',
 'snu-886',
 'cplcknditkrslqestrfsqlveellkii',
 'htertnot',
 '1.282.17',
 'satoko',
 '5-aaggtgttgcaatccccagc-3',
 'matrix.41',
 'mutationssupplementarythese',
 'gccagcattttagcattacttc',
 'a864t',
 'indouble',
 't49.2/t49.3',
 '9206s',
 'promoter-ttadriven',
 's1/kh',
 'interaction17',
 'unknown/na14.813.223.8',
 'p110-d964a',
 'jak3-stat3',
 'extendmr',
 'y-733',
 'pbabe-zeo-nrasg12d',
 'mek1c121s-expressing',
 'anti-pdgfc',
 'asp-220',
 'g1567d',
 '211403',
 'polyacryiamide',
 'p.val600_lys601',
 'jim3',
 'statethe',
 'staphylococcusbetween',
 "5'-attacacagtatcctcgaca-3",
 'ha-11',
 'u3-1287',
 'di7s2sorepeating',
 'hadshould',
 'y1003x',
 'sd/agar',
 'differentiation,3',
 'staalesen',
 '70800',
 'activity.23,24',
 'againstwap-cre',
 'g162rnormalnormaldeficientdeleterious0.00pathogenic',
 'gly719x',
 'mds19',
 'd464g',
 'mode.14',
 'y88c',
 'asp770_asn771insmetalathrpro',
 'oxiod',
 '50target',
 'q1496h',
 'c135r',
 'crizotinibc',
 'sequencing.10',
 'rotterdam.26',
 'defectao105yes',
 't198a/g',
 '14.981.7',
 'locatedleading',
 'sex.1984',
 'capacityrestoring',
 '20-mmol/l',
 'butdownloadin',
 'makita',
 'syndromeassociated',
 'newman-keuls',
 '5-gatggtgggggccctcctctt-3',
 'mutagenesismutants',
 'analysisimmunohistochemical',
 'rs3f',
 'that.altogether',
 '030/50',
 'conformationgrowthcodon',
 'gln184stop',
 'arrest.the',
 '3.40+0.12',
 'a328p6ntp91l',
 'status22',
 'sample.sanger',
 'pocket4a',
 's71w',
 'sdcbp2',
 'm114',
 'phospho-ros1',
 'nm_007817',
 'ntgtg',
 'lossoffunction',
 'eachaffected',
 '71f/57stomach453spvery',
 'anti-cd4',
 '144k',
 'scalia',
 'microcystein-lr',
 'repairatpase',
 'me6',
 'erk.of',
 'tfsearch',
 'xeds/eels',
 'her2+/pi3k-mutant',
 's6o',
 'p5-specic',
 'l65f',
 'antibody.of',
 'a3062g',
 'malignancy13',
 'locusb',
 'hkr1',
 'supplemental5a-b',
 '2chighwire_math',
 'sc1747',
 'turnhout',
 'nrap',
 'wasthree',
 '413641513',
 'metnm_000245c2646tp814sgermline1/0',
 'binitial',
 'tgcggacagg',
 'carcinoma987512.512.5040',
 'lsab+system-hrp',
 'ret-ntrk1',
 'nordenstadt',
 'molecularn114s',
 'rotolib2.aa',
 'backgroundcould',
 'n105kthe',
 'y110c',
 '395one',
 'complexes129',
 'sulfateinduced',
 'shrna-smad2',
 'c-dx',
 'trii/alk-x',
 'sa-b',
 'h920',
 'referencingca',
 'ctm-5990506',
 'er+breast',
 '33-kda',
 'alkf1178l',
 '5tgcaaggtggagcgattctg',
 'withclosure',
 'substitutionb',
 't-lymphotropic',
 'pa30824101none0',
 'lm3d-induced',
 'vomiting30',
 't479pvitro',
 '5-acacgtccccatctgaag-3',
 'a279vprotein',
 'br67',
 'decribed.8',
 'complementarygene',
 'hoxd8',
 '5-uauauuuauauauuagacgdgdg-3',
 'skp1or',
 'imatinib.the',
 ...}

In [26]:
np.save("processed/stage1/biolab_updated_wvs.npy", corpus_word_vectors)

gcloud tensorboard serving


In [14]:
dataset_corpus_words_list = np.load("dataset_corpus_words_list.npy")
corpus_word_vectors = np.load("corpus_word_vectors.npy")

In [15]:
tb_vocab_size = 10000

In [ ]:
local_tb_dir = "/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/gcloud/"

In [34]:
with open(local_tb_dir+"/vocab.tsv", "wb") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(dataset_corpus_words_list[:tb_vocab_size])

for http://projector.tensorflow.org/ vectors need to be in tsv form


In [13]:
# np.savetxt("model_wv_visualize/word_vectors.tsv",corpus_word_vectors[:tb_vocab_size], delimiter='\t')

write to checkpoint file


In [30]:
!rm $local_tb_dir/checkpoint
!ls $local_tb_dir


rm: cannot remove '/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/checkpoint': No such file or directory

In [32]:
from word2vec import visualize_embeddings_in_tensorboard
visualize_this_embedding = corpus_word_vectors[:tb_vocab_size]
print visualize_this_embedding.shape
# path for gcloud tensorboard
metadata_path = "/home/bicepjai/projects/tb_visual/vocab.tsv"
# metadata_path = "/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, local_tb_dir)


(10000, 200)

In [33]:
checkpoint_txt = "model_checkpoint_path: \"/home/bicepjai/projects/tb_visual/visual_embed.ckpt-1\"\n\
all_model_checkpoint_paths: \"/home/bicepjai/projects/tb_visual/visual_embed.ckpt-1\""
with open(local_tb_dir+"/checkpoint","w") as f:
    f.seek(0)
    f.truncate()
    f.write(checkpoint_txt)

In [ ]:

FastText Vectors

fasttext commands used

fasttext skipgram -minCount 1 -dim 200 -epoch 10 -input corpus_text_for_fast_text.txt -output ft_wvs_200d_10e

fasttext cbow -minCount 1 -dim 200 -epoch 10 -input corpus_text_for_fast_text.txt -output ft_wvs_200d_10e

reading ft vectors


In [32]:
fasttext_vec_file = "processed/stage2/pretrained_word_vectors/ft_sg_200d_10e.vec"

In [33]:
ft_lines = None
with open(fasttext_vec_file,"r") as f:
    ft_lines = f.readlines()

In [34]:
print(ft_lines[0])
print(type(ft_lines), len(ft_lines))
ft_shape = tuple([int(i.strip()) for i in ft_lines[0].split()])
ft_shape


362933 200

<class 'list'> 362934
Out[34]:
(362933, 200)

In [35]:
print(len(ft_lines[1].split()))
ft_lines[1]


201
Out[35]:
'the 0.027251 -0.018114 0.0096083 0.076723 -0.29626 0.05729 0.17298 0.097187 0.10251 0.16822 -0.40156 0.12471 0.11843 0.069956 0.031858 -0.20362 0.18791 -0.20113 -0.20219 0.002323 -0.30366 0.16106 -0.091842 0.028771 -0.082447 0.18842 0.02471 -0.10553 -0.28138 0.044856 -0.041988 -0.031351 0.25131 -0.18547 0.23941 -0.18438 0.12292 -0.039016 0.075311 0.028379 0.024822 -0.069827 0.054794 0.19297 0.19053 -0.15749 0.21978 -0.003489 -0.15063 -0.018887 0.05638 0.1385 0.10112 0.023256 -0.22436 -0.27619 -0.047866 -0.053595 0.010177 0.059109 0.078079 0.080721 -0.017329 0.29334 0.19386 0.1279 0.04759 0.11951 -0.37341 -0.028312 0.0086509 0.021498 0.049069 0.094658 -0.076768 0.00541 -0.0013258 -0.062564 -0.092488 0.15718 0.21148 0.11005 0.088614 0.17268 0.057106 -0.0044174 -0.0072504 0.01389 -0.067416 -0.18715 -0.009639 0.12991 0.11389 -0.0017624 0.020464 -0.19809 -0.038933 -0.016631 -0.24906 0.012139 0.21376 0.14972 -0.16496 0.3738 -0.095022 0.10864 -0.058577 -0.034298 0.0021112 -0.010114 -0.024814 0.027078 0.036302 0.10004 -0.35396 -0.064597 0.0010858 -0.0049044 -0.094081 0.096904 -0.0046191 0.074286 0.09301 -0.28307 -0.15225 0.064754 0.094255 0.20833 -0.088393 0.1362 0.11452 -0.076745 0.26119 0.068646 0.067695 -0.069496 -0.047141 0.11597 -0.18205 -0.074642 -0.0431 -0.15549 0.27262 -0.012248 0.067552 0.12357 -0.027967 -0.24034 0.21146 -0.030294 -0.16886 -0.36566 -0.027902 -0.04372 0.079934 -0.10144 -0.029423 -0.06038 -0.22478 -0.19269 -0.068223 -0.016667 0.3038 0.012443 -0.42416 0.077392 -0.19895 0.016593 0.051294 -0.0079492 0.11613 -0.13423 0.19772 0.056557 -0.023173 -0.15394 0.078205 -0.17027 -0.26604 0.098637 -0.036921 0.18138 0.20576 -0.17695 0.15974 -0.059677 0.017603 -0.21435 -0.036402 -0.085582 0.06715 0.080574 -0.21038 0.024121 -0.18857 -0.0853 -0.27693 -0.00081868 0.02937 0.040705 \n'

In [36]:
ft_vocab_size=ft_shape[0]
ft_vocab_size


Out[36]:
362933

In [37]:
ft_word_vectors = np.random.randn(ft_vocab_size, ft_shape[1])
ft_words = []

In [38]:
for i, line in enumerate(ft_lines[1:]):
    str_list =line.split()
    ft_words.append(str_list[0].strip())
    vec = np.array([np.float(f) for f in str_list[1:]])
    ft_word_vectors[i] = vec

In [39]:
ft_word_vectors.shape


Out[39]:
(362933, 200)

In [40]:
a = list(ft_words)
a.sort(key=len, reverse=True)
print(a[:10])
del a


['k2950n1.6.2501.857.151010r2108h2.722.55.521.745.481010s1733f8.88.7608.121.33108g1529r4.123.9708.091.23108i2285v5.492.4107.97.86107l1019v3.81.543.186.462.86106a75p3.52.312.516.342.20106t3349a.711.233.45.352.22105r1190w1.991.172.125.271.88105p1819s4.47.771.535.231.70105t630i4.64.5805.231.68105g1771d4.97.1505.121.33105k1690n4.49.5305.021.05105s1172l3.36.56.894.816.48104q2384k1.73.35.264.796.11104c554w1.31.33.74.715.15104d2312v.062.731.914.573.76104ivs26-20ct1.33.71.534.483.04104g602r.89.173.594.32.02104e462g1.39.641.823.846,960n56t.97.112.963.826,666h2074n3.5.3103.816,513r2973c1.39.262.633.755,685c3198r1.651.22.873.745,541i1929v.371.931.33.593,914n1228d.351.182.043.573,726h1918y2.31.2503.553,552v1306i1.891.21.373.472,979y3098h3.45.71.73.462,892v2969m2.06.82.573.452,806v894i.91.011.483.392,440i1349t.642.7203.372,320q1396r.021.741.63.362,280n2113s1.03.991.233.241,749r2842h.851.231.123.21,574ivs25+9ac.481.011.73.191,553v3079i.82.3803.191,545ivs11-20ta.83.232.523.111,288r2888c1.111.35.643.11,246t2250a1.3.291.9831,005f1524v1.991.18.172.99982l1904v.331.241.422.99976t582p.322.6402.96910d3170g1.54.071.422.89772d1280v2.571.01.722.85714p168t1.721.0902.81646n2048i.751.28.782.81639k2729n2.21.81.382.79614d2665g.61.891.12.61406r3052q1.241.3102.55354g1194d1.41.151.242.5316p375s1.261.2402.5315v2908g1.171.2402.41256d806h.571.18.662.41255t1354m1.12.271.562.4254n1102y2.53.1402.39246y3092c2.34.11.012.22166ivs20-16cg1.71.27.782.22165c1365y.571.23.382.18150n2436i.391.32.412.12132k2411t.841.2402.08119l2396f.671.3802.04111k513r.741.2902.03108table', 'r866c7.71.762.6312.11.251012p142h8.922.16011.081.201011t1720a6.23.495.221.959.001010n810y2.427.281.241.948.771010s186y3.573.542.39.422.61109p1614s.517.121.028.644.40108v1534m4.633.9908.624.18108e597k6.551.9208.472.95108n723d1.138.9407.816.44107e1214k2.412.132.937.472.96107s1101n7.06.5506.513.23106ivs16-20ag2.573.6906.251.78106ivs8-17gt4.421.6206.041.10106p1238l5.251.31.546.021.05106y105c.834.57.125.291.94105v191i1.862.231.025.111.30105p334l.954.85.974.887.52104i1275v5.28.4904.796.16104ivs18-6ca2.142.4504.593.90104p1859r2.742.26.524.472.98104r504h3.19.571.664.281.92104s1266t1.872.3904.261.80104d67y3.23.281.084.031.06104i1044v1.542.403.948,723v1247i1.58.172.53.918,123ivs2-11delt1.492.3903.887,598ivs18-13ag.392.421.023.836,799i925l1.412.3903.86,251n473s1.292.4903.786,049ivs6+7ga1.432.16.173.755,601r1203q1.71.152.183.755,589g890v3.86.1603.75,006e842g1.222.4603.684,741m1652t1.74.091.923.573,728d369n3.77.2203.563,593d369del', '1.312.2103.523,328v1736a1.52.1.253.352,219k862e2.93.19.573.312,059r1028h1.232.0803.312,045e143k2.3.121.093.271,871d642h.75.072.493.161,455ivs2-13cg1.161.9103.071,166d1546y.562.4703.041,088a622v3.43.4502.98963v1804d2.61.57.882.92828ivs12+10gc.472.4302.9803i1405v.422.4602.88752q804h1.25.331.952.86729k1109n.512.3202.82664ivs11-11tc.372.4502.81648q155e.272.4502.72524p1637l1.83.4.462.69493t1349m2.71.0402.66462i1858l.142.502.65444n1468h.042.4902.53337m297i02.502.51322e1682k02.502.5317ivs21-8ct1.89.13.742.5313a280g.062.4902.43266ivs17-9at.092.4302.34218e1419q2.37.0602.31203f1662s.162.4502.28192h1402y.072.45.292.22165r1751q2.66.19.32.17150i124v2.21.0402.17147ivs15-7ct.24.121.972.1125n132k2.32.2302.09123d420y2.16.0702.09123l668f2.85.73.12.02105m1361l2.09.0702.02104brca2', 'typeazd6244pd98059pd0325901u0126gsk1120212bsp600125bi78d3as601245sb203580ecc1endometrial3.1/na5.5/na7.1/na0.2/na1.4/nana/nana/nana/nana/nahec108endometrial9.8/nana/na5.5/na0.6/na0.6/nana/na4.1/na2.3/nana/nahec265endometrialna/nana/nana/na1.4/nana/nana/nana/na1.1/nana/nasngiiendometrial0.1/na0.4/na2.1/na0.6/7.81.3/nana/na1.1/na0.49/9.8na/naovk18ovarian0.03/1.20.03/2.70.04/1.80.009/0.70.04/9.50.09/7.10.06/8.20.08/6.4na/naic25', '1.13.071.232.43268ivs6-3cg2.28.07.002.35225ivs15+1ga2.11.19.002.30199ivs17+1ga2.21.09.002.29196t1685i2.10.04.002.15140i1766s1.11.16.872.14139g1738r1.94.12.002.06114t1685a2.00.03.002.03107ivs6-1gc1.84.01.001.8572m1689r1.49.17.001.6746s1715r1.56.09.001.6545ivs12+2del21insa1.46.10.001.5636m18t1.41.09.001.4931a1623g', 'cellmelanomal.vmphomamalthiv-positivelunggliomasgliohlaslomasaslrocytomasgliohlaslomasastrocytomasgliohlaslomasaslrocytomasprostalelocidinucleotidedis126d2s393d3s1067d5s644tp53dinucleolided2si23d2s136o3si067diis922tp53ap', 'ins20.9pd3052mansmokeradenocarcinomaivl858r/s768i11.3pd8945womanneveradenocarcinomaivl858r18.0pr11462womanneveradenocarcinomaivl858r14.7sd11639womanneveradenocarcinomaiiibl858r14.1sd14260womanneveradenocarcinomaiv19', 'wd37-1pdmet47-1w54h-1wdmet58-1wd59-1anapmet60-1wdmet61-1wd64-1anap65-1wd36-2wd37-2anap47-2wd54h-2wd58-2pdmet60-2wd61-2pd64-2anap65-2anap36-3wd37-3anapmet47-3wdmet54h-3wdmet58-3wdmet61-3pd64-3anap65-3anapmetin', 'a.a.g.c.g.c.a.a.g.a.t.c.c.a.g.g.t.t.c.tg.c.a.g.c.a.g.c.a.g.g.c.a.g.a.t.g.a.t.g.c.a.g.a.g.g.a.g.c.g.a.g.c.t.g.a.g.c.g.c.ct.c.c.a.g.c.a.a.ga.a.g.t.t.g.a.g.g.g.a.g.a.a.a.g.g.c.g.g.g.c.c.c.g.g.a.a.ac.a.g', '5.28.81.006.101,249,808r1699w4.34.26.004.6039,978g1788v2.37.421.063.857,054ivs5+3ag3.04.12.003.151,417g1706e2.35.13.292.77589ivs19-12ga4.52.102.062.56363l1764p1.56.09.892.54350v1688del']

In [41]:
ft_wordidx = {w:i for i,w in enumerate(ft_words)}
ft_vocab_size, len(ft_wordidx)


Out[41]:
(362933, 362933)

In [42]:
len(set(vocab_words) - set(ft_words))


Out[42]:
1677

In [43]:
set(vocab_words) - set(ft_words)


Out[43]:
{'lrp4',
 'dfnb59',
 'a75p',
 'g1803a',
 'e1682v',
 'atp7b',
 'fxn',
 'r561c',
 't417_d419delinsi',
 'mocs1',
 'r1726g',
 'scn4a',
 'pdha1',
 'c420g',
 'g2420c',
 'prkn',
 'dpys',
 'e1051k',
 'v384d',
 'prpf31',
 'plod2',
 'h1805p',
 's860l',
 'g776delinslc',
 's241y',
 'washc5',
 'r112g',
 'siae',
 'r342w',
 'q2416*',
 'ctns',
 'rtn4r',
 'e685v',
 'q1811r',
 'a41s',
 'a500t',
 'm1663l',
 'gdi1',
 'gnmt',
 'd603g',
 'adgrg1',
 'mtmr14',
 'slc12a3',
 'r421*',
 'l37p',
 'e1356g',
 'h773dup',
 'g598a',
 's216f',
 'p2417a',
 'cfl2',
 'y599_d600inspapqimststlisenmnia',
 'x582_splice',
 'cngb3',
 'bbs4',
 'vangl2',
 'r505l',
 'bckdhb',
 'd737v',
 'i2675v',
 'arsb',
 'r331p',
 'i1718t',
 'a2351g',
 'l1844r',
 'r2336p',
 't244_i245inscpt',
 'r280g',
 'a72d',
 'grxcr1',
 'cldn14',
 'aptx',
 'phex',
 'slc19a3',
 'r100*',
 'hps3',
 'gjb1',
 's453fs*',
 'krt86',
 'atg16l1',
 'pafah1b1',
 'a60v',
 't1685a',
 'm1663k',
 'ftcd',
 'hip1-pdgfrb',
 'l747_a750del',
 'n480del',
 'a57v',
 'dock8',
 'e554_k558del',
 'pitx3',
 'm1v',
 'q546l',
 'p838l',
 'r1627',
 'a312p',
 'dysf',
 'k376n',
 'dmgdh',
 'f537_k539delinsl',
 's23r',
 'gja8',
 'sh3pxd2b',
 'sil1',
 'p551_m552del',
 't216s',
 'stxbp1',
 'hspb3',
 'alg3',
 'gdap1',
 't1151r',
 'sucla2',
 'e606g',
 'sptlc1',
 'k700r',
 'e69g',
 'v750e',
 'slc25a3',
 'g87r',
 'p2ry12',
 'cbx2',
 'y113*',
 'g196v',
 'g106_r108del',
 'stox1',
 'r1190w',
 'crb1',
 'd1352y',
 'r514c',
 'y382h',
 'cpt1a',
 'b3glct',
 'opa1',
 'k550_w557del',
 'slc26a2',
 'lpar6',
 'mkrn1-braf',
 'cyba',
 'colq',
 'r369w',
 'upk3a',
 'l283_d294del',
 'y647c',
 'k550_v555delinsi',
 'e545v',
 'kcnj10',
 'k806a',
 'v118d',
 'krt83',
 'slc6a5',
 'ubiad1',
 'alg12',
 'slc37a4',
 'hpgd',
 'w531c',
 'q276*',
 'e120q',
 'x475_splice',
 's1088f',
 'chst14',
 'zfyve27',
 'r194w',
 'krt12',
 '533_534del',
 'hmcn1',
 'p286s',
 'nipa1',
 'slc22a4',
 'cyp4v2',
 'r621c',
 'v3079i',
 'l611_e612inscssdneyfyvdfreyeydlkwefprenl',
 'delta-ntrk1',
 'c450_k451insmiewmi',
 'n639k',
 'r450h',
 'p490_q494del',
 'agps',
 'y24c',
 'robo3',
 'adgrv1',
 'gm2a',
 'pomgnt1',
 'stra6',
 'y3092c',
 'q144p',
 'r659l',
 'g6pc',
 'd1010n',
 'd568n',
 'tspan12',
 'i111a',
 'r2842c',
 'a1701p',
 'trpm7',
 'i254t',
 'r544s',
 'd3170g',
 'd402y',
 'trex1',
 'mbtps2',
 'v762m',
 's241a',
 'aldh5a1',
 'kcne3',
 'cc2d2a',
 'galns',
 'wdr35',
 'fut1',
 'a767_v769del',
 'y35c',
 'hlcs',
 's326r',
 'v665a',
 'c1qtnf5',
 'f877l',
 'nsdhl',
 'matn3',
 'e218*',
 'a2770t',
 'd587h',
 'a603fs',
 'pla2g6',
 'v1306i',
 'anos1',
 'fam126a',
 'a767_v769dup',
 'mmaa',
 'a41t',
 'q429*',
 's562l',
 'gckr',
 'tlr5',
 'nhlrc1',
 'gdf5',
 'v559_v560del',
 'nectin4',
 'e265k',
 'r380a',
 'p29t',
 'k483e',
 'acsl4',
 'a120s',
 'l455m',
 'g119t',
 'slc35d1',
 'c1365y',
 'htra2',
 'k459_s460delinsn',
 'm1328i',
 'd1010y',
 'r515a',
 'sf3b2',
 'kcnv2',
 'scn1b',
 'cx3cr1',
 'pou3f4',
 'creld1',
 'cpt2',
 'i408v',
 'p449_l455del',
 'tsen54',
 'r957q',
 'k745_a750del',
 'gcnt2',
 'l97r',
 'lcat',
 'e60l',
 'slc6a8',
 'q72l',
 'r1446g',
 't2250a',
 'slc6a19',
 't599_v600insett',
 'r479h',
 'g173c',
 'n2829r',
 'slc52a3',
 'l755m',
 'tcn2',
 'tbc1d24',
 'ift122',
 's752_i759del',
 'cyp2r1',
 'i251s',
 'd887n',
 'w559_r560del',
 'a77p',
 'f590g',
 'asah1',
 'avp',
 'antxr1',
 'd609e',
 'hsf4',
 'y1045*',
 'e1836k',
 'arl6',
 'r798*',
 'd769n',
 'r148i',
 'q395*',
 'rps26',
 'e709_t710delinsd',
 'mapk8ip1',
 'k467t',
 'd646y',
 'cstb',
 'lpin2',
 'x1009_splice',
 'hesx1',
 'r258m',
 't3211k',
 'kcne2',
 'd74y',
 'r922*',
 'n771_h773dup',
 'rfxank',
 's45c',
 'alg6',
 'fgf14',
 'v2969m',
 'a598t',
 'l301s',
 'acat1',
 'n826y',
 's206c',
 'tbx22',
 'v769_d770insgvv',
 'g665a',
 'q2384k',
 'e622q',
 'coq8a',
 'n1333gfs*',
 'cul1-braf',
 'a97g',
 'lgi1',
 'eif2b1',
 'd1642h',
 's267_d273dup',
 'opcml',
 'ccm2',
 'd29h',
 'i33del',
 'dars2',
 'y236s',
 's2483n',
 'r2842h',
 'kcnmb1',
 'scnn1b',
 'e1978*',
 'bsnd',
 'g596c',
 'sumf1',
 'y1853*',
 'r389*',
 'g17e',
 'n387p',
 'g1194d',
 't82a',
 'e612_f613insgyvdfreyeydlkwefrprenlef',
 'ndufs2',
 'aldh4a1',
 'w24s',
 's116f',
 'cox4i2',
 'v104l',
 'i1680n',
 'l461v',
 'd289del',
 'i1807s',
 'ndufaf4',
 'p81t',
 'e267g',
 'a1200v',
 'd29y',
 'v852i',
 'f158c',
 'foxred1',
 'fcgr2b',
 'aldh3a2',
 'r2502h',
 'a77s',
 'plod3',
 'l234fs',
 'e598_y599insdvdfreye',
 'f136l',
 'y1003*',
 'atp6v0a4',
 'y375_k455del',
 'l485_q494del',
 'gnptg',
 'p1859r',
 'plec',
 'e144k',
 'l1678p',
 'trmu',
 'adamts13',
 'd842_i843delinsim',
 'l1273f',
 'r838q',
 'tfr2',
 'mplkip',
 'mfn2',
 'bbs9',
 '1_2009trunc',
 'gnptab',
 's142i',
 'y553_k558del',
 'd2312v',
 'd806h',
 'mcfd2',
 'g1079d',
 'pccb',
 'nipal4',
 'a1752v',
 'e326l',
 'alg1',
 'r248g',
 'f216a',
 'pon2',
 'efhc1',
 'hadha',
 'n987i',
 'dnm1l',
 'snta1',
 'olr1',
 'atf7ip-pdgfrb',
 'r2108h',
 'idua',
 't2681r',
 'gbe1',
 'r108g',
 'r133*',
 'slc11a2',
 'k558delinsnp',
 'slc22a18',
 'w603_e604insdreyeydlkw',
 'n1819s',
 'arsa',
 'r213l',
 'slc4a11',
 'l915m',
 'pnpo',
 'bag3',
 'q58_q59insl',
 'v35m',
 'd32v',
 'fig4',
 'hsd3b7',
 'atp13a2',
 'e746_s752delinsi',
 'rxfp2',
 'umps',
 'h284y',
 'k45t',
 'neu1',
 'v1188l',
 'c248t',
 'p4309a',
 'y234n',
 't488_p492del',
 'e632_l633del',
 'p1776s',
 'e571a',
 's279y',
 'l49h',
 '2010_2471trunc',
 'cox15',
 'apol1',
 's584l',
 't4511i',
 'ecm1',
 'e160*',
 'hmgcl',
 'arfgef2',
 '385_418del',
 'vps33b',
 'r812a',
 'v564i',
 'c383y',
 'r462i',
 'y599_d600insglyvdfreyey',
 'clcnka',
 't286i',
 'n553s',
 'd1778g',
 'xylt2',
 'tufm',
 'cpn1',
 'abcg8',
 'r1209w',
 'm391r',
 'i99m',
 'papss2',
 'p375s',
 'v677i',
 'eif2b3',
 'q984k',
 'rnaset2',
 'acads',
 'dpagt1',
 '596_619splice',
 'litaf',
 't319del',
 'brwd3',
 'kcnj5',
 's1670a',
 'e746_t751insip',
 'rbm20',
 's102f',
 'q579_l581del',
 'y220d',
 'l2721h',
 'r258c',
 'kcnj18',
 'gamt',
 'a146p',
 'r80p',
 'd390y',
 'cpox',
 'y3098h',
 'atf7ip-jak2',
 'w802*',
 'e719k',
 'yars',
 'n238s',
 'v294m',
 'pcca',
 'p551_w557delinsl',
 'pepd',
 'f568fs',
 'n659r',
 'e35*',
 'gucy2d',
 'l576del',
 'phka1',
 'r183p',
 'sar1b',
 'e746_s752delinsv',
 'h355m',
 'w1502a',
 'ppargc1b',
 'e311_k312del',
 'gch1',
 'l145r',
 'm552_k558del',
 's1841r',
 'h1918y',
 'q2405rfs*17',
 't80r',
 'd1851e',
 'n549s',
 'secisbp2',
 'rspo4',
 'd355e',
 'mpdu1',
 'cdan1',
 'dtna',
 'n1125i',
 'a113_splice',
 'slc25a12',
 'e1322*',
 'd842_h845del',
 'crx',
 'r5q',
 'sc5d',
 's302g',
 't599_v600inseat',
 'r592h',
 'mlph',
 'mefv',
 'f53i',
 'q123r',
 'h2428q',
 'sgce',
 'ndufaf5',
 'y598c',
 'm1i',
 'd816a',
 'g81d',
 'a128d',
 'l122r',
 'a197t',
 'n564_y578del',
 'myo5b',
 'znf592',
 'prkag3',
 'm1_e165del',
 'bbs1',
 'bcs1l',
 'q1064r',
 'k1299e',
 't574_r588delinsl',
 'l1301r',
 'h773inslgnp',
 'p1502l',
 'abcd1',
 's109p',
 'f1662s',
 'slurp1',
 'r487*',
 'r1608s',
 'serpina7',
 's1424c',
 'spink5',
 'igh-fgfr3',
 's1722f',
 'htra1',
 'sptbn1-pdgfrb',
 'etfdh',
 'p38l',
 'g914r',
 'ndufaf3',
 'd842_m844del',
 'w714*',
 'slc6a20',
 'prkra',
 'y568_l576delinsvn',
 'a2425t',
 'rp2',
 'h662r',
 'q201h',
 'p28s',
 'r1189*',
 'fgfr1op1-fgfr1',
 'pitx1',
 'l1574p',
 'v1676d',
 'rdh12',
 'r2888c',
 's36y',
 'tubb1',
 'slc7a7',
 'y646s',
 'slc17a5',
 'tmie',
 'n581t',
 'foxi1',
 'k28m',
 'e14*',
 'arms2',
 'ece1',
 'gdf1',
 'akap10',
 '-',
 'w742l',
 'tirap',
 'g292r',
 'r335*',
 'acadl',
 'v747l',
 'r79p',
 'pla2g4a',
 's280f',
 'klf11',
 'lhx3',
 'd301n',
 'rgs9',
 's257w',
 'r24p',
 'k830r',
 'r1563s',
 's24f',
 'v197l',
 'khk',
 'a97v',
 'pank2',
 'i32del',
 'n2436i',
 'flnb',
 'slc25a22',
 'i49s',
 'r2502c',
 'a1685s',
 'r226*',
 'd1692h',
 'lipa',
 'ndufs8',
 'chst3',
 'd493a',
 'r640g',
 'r680*',
 'g253c',
 'g101s',
 'atp8b1',
 'g1286r',
 'fbn2',
 'kiss1r',
 'slc11a1',
 'slc25a19',
 'kiaa1509-pdgfrb',
 'x1008_splice',
 'tulp1',
 's123t',
 'e106g',
 'y406h',
 'r174*',
 'smpd1',
 'a1830t',
 'zdhhc9',
 'plekhg5',
 'kirrel3',
 'manba',
 'zfp57',
 'prokr2',
 'xylt1',
 'f594_r595inssdneyfyvdf',
 's1303n',
 'c1385',
 '422_605trunc',
 'g478c',
 'r120m',
 'r583a',
 's786f',
 'r174c',
 't1852s',
 'ahi1',
 'znf81',
 'rapsn',
 's768_d770dup',
 'g386s',
 'p648s',
 'ndufs7',
 'y646n',
 'pvt1-myc',
 'r2318q',
 'i491m',
 'v561_i562inser',
 'f468c',
 'aloxe3',
 'n551t',
 'n1228d',
 'r2327w',
 'r1751p',
 'ifnar2',
 'l1854p',
 'q25h',
 'g67s',
 'lrat',
 'q110r',
 'pdss1',
 't73p',
 'ube3a',
 'bscl2',
 'd837n',
 'p286h',
 'r162*',
 'slc25a20',
 'gpc3',
 'w345*',
 'sh3tc2',
 'tmprss2-etv4',
 'n588d',
 'k765r',
 'g81s',
 'scn4b',
 'l146r',
 'e746_t751delinsl',
 'bckdha',
 'six5',
 'n2113s',
 't1365m',
 'gjb3',
 'y1003c',
 'l221r',
 'pygl',
 'p531s',
 'cnnm4',
 'i843del',
 'd2512g',
 'lrp8',
 'l2396f',
 'r11k',
 'slc29a3',
 'l861f',
 'fktn',
 'd739y',
 'p660t',
 'm160v',
 'q1756fs',
 'flvcr2',
 'l585i',
 'hsd17b3',
 'htr2c',
 'mccc2',
 'ndufv2',
 'gyg1',
 'h115q',
 'c41y',
 'v544_l545insavlvllviviisli',
 'dlat',
 'iyd',
 'p1771r',
 'd2512y',
 's214t',
 'fam20c',
 'sumo4',
 'm552_w557del',
 'f154l',
 'ubr1',
 'rpgr',
 'abcg5',
 'f1088lfs*5',
 'atg7-raf1',
 'k2472t',
 'q816*',
 'man2b1',
 'r370c',
 'upb1',
 'g81r',
 'r361p',
 'epm2a',
 'e746q',
 'ep300-moz',
 'tbx1',
 'p2415del',
 'scnn1a',
 'n463s',
 'i1018w',
 'igl-myc',
 'st14',
 'n486_p490del',
 't844m',
 'e746_s752delinsa',
 'efemp1',
 'mlycd',
 't1354m',
 'g23d',
 'i1929v',
 'r1758g',
 'slc1a3',
 'l559r',
 'nt5c3a',
 'pomt1',
 'slc27a4',
 'rgr',
 'p1856t',
 's387n',
 'a36p',
 'clcf1',
 'v487_p492delinsa',
 'trappc2',
 'e554_i571del',
 'd1344h',
 'rai1',
 'l1584r',
 'r2304c',
 'rpgrip1',
 'n71k',
 'clcn5',
 'acadsb',
 'p1812s',
 'l747_p753delinss',
 'efnb1',
 'mesp2',
 'd1778h',
 'trpv4',
 'q689r',
 'w290_i291delinsc',
 'n71i',
 'dpyd',
 'a888v',
 'e275k',
 'slc22a12',
 't28i',
 'h133q',
 'nubpl',
 '256_286trunc',
 'd423n',
 'fgd1',
 'l485_p490del',
 'dnm2',
 'dsg4',
 'v1576e',
 'a60r',
 'cyp4f22',
 'uroc1',
 'p95s',
 'abca12',
 'fah',
 'dpm3',
 'h845_n848delinsp',
 'p531l',
 'faah',
 'opa3',
 'h1094l',
 'l485_p490delinsf',
 'q1396r',
 'g2430a',
 'slc39a4',
 'sema3e',
 'd384n',
 'znf365',
 'pex7',
 'r2430m',
 's501_a502dup',
 'acadvl',
 'e749q',
 'guca1a',
 'egfr-purb',
 'c39s',
 'btd',
 'v220f',
 'pklr',
 'v60m',
 'chmp4b',
 'r282p',
 'd1071n',
 'cacna1f',
 's241t',
 'syt6',
 'k666m',
 'mat1a',
 '?',
 'pou6f2',
 'r304*',
 't75m',
 'kcnq2',
 'best1',
 's70fsx93',
 '981_1028splice',
 'npc1',
 'l301f',
 'k641n',
 'n1100y',
 'tjp2',
 'cartpt',
 'aipl1',
 'g67w',
 'cilp',
 'clcnkb',
 's1164i',
 'r631c',
 't1025a',
 'g52r',
 'f384y',
 'znf41',
 'l617m',
 'e946*',
 'tnfrsf13b',
 'r2418g',
 'i668v',
 'cd40lg',
 'lhx4',
 'ccnd1-igh',
 'vps13b',
 'l607i',
 'ndp',
 '<SOSent>',
 'p551_e554del',
 't582p',
 'r711*',
 'r698w',
 'mfrp',
 'i111r',
 'emg1',
 'y599_d600inseyeyeyey',
 'd357y',
 'rp1l1',
 'm535i',
 'y1414c',
 'p291qfs*51',
 'k575m',
 'tnfrsf11b',
 'dnaaf1',
 'bbs2',
 'd29n',
 'i563_l576del',
 'alg9',
 'v555_l576del',
 'rp1',
 'kcnj2',
 'c1483w',
 'kcnma1',
 'g583e',
 'c554w',
 'd600_l601insfreyeyd',
 'v600d_k601insfglat',
 'l747_t751delinsp',
 's746fs',
 'pex5',
 't574instqlpyd',
 'unc13d',
 'l493v',
 'abca4',
 'i18v',
 'r886w',
 'h284p',
 'n542_e543del',
 'm168t',
 'slc10a2',
 'g93w',
 'dync2h1',
 'x434_splice',
 'guca1b',
 'tspan7',
 'bbs12',
 'agrp',
 'prpf3',
 ...}

In [80]:
%autoreload
import global_utils
fasttext_vec_file="/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec"
wvs = global_utils.get_corpus_wvs_from_ft(fasttext_vec_file, 200, vocab_words)
wvs.shape


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-80-c83bc136c868> in <module>()
      2 import global_utils
      3 fasttext_vec_file="/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec"
----> 4 wvs = global_utils.get_corpus_wvs_from_ft(fasttext_vec_file, 200, vocab_words)
      5 wvs.shape

~/Projects/dsotc/lib/global_utils.py in get_corpus_wvs_from_ft(fasttext_vec_file, dim, vocab_words)
     43       word = str(str_list[0].strip())
     44       vec = np.array([np.float(f) for f in str_list[1:]])
---> 45       assert dim == len(vec), "fast text some vectors doesn't match dimensions"+str(dim)+" != "+str(len(vec))
     46       ft_wvs_dict[word] = vec
     47 

AssertionError: fast text some vectors doesn't match dimensions200 != 20

saving all trained fast text vectors


In [99]:
%ll /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors


total 550348
-rwxrwxr-x 1 bicepjai 563552080 Sep 24 11:30 biolab_updated_wvs.npy*

In [103]:
len(vocab_words)


Out[103]:
352220

In [104]:
%autoreload
import global_utils
ft_vector_files = [
                   (100,"ft_cbow_100d_20e"),(200,"ft_cbow_200d_20e"),(200,"ft_cbow_300d_20e"),
                   (100,"ft_sg_100d_20e"),(200,"ft_sg_200d_20e"),(200,"ft_sg_300d_20e"),
                   (100,"ft_cbow_100d_50e"),(200,"ft_cbow_200d_50e"),(200,"ft_cbow_300d_50e"),
                   (100,"ft_sg_100d_50e"),(200,"ft_sg_200d_50e"),(200,"ft_sg_300d_50e"),
                   (100,"ft_cbow_100d_100e"),(200,"ft_cbow_200d_100e"),(200,"ft_cbow_300d_100e"),
                   (100,"ft_sg_100d_100e"),(200,"ft_sg_200d_100e"),(200,"ft_sg_300d_100e")
                  ]

for dim_file_name in ft_vector_files:
    file_path = "/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/"+dim_file_name[1]+".vec"
    dim = dim_file_name[0]
    if not os.path.exists(file_path):
        print("file doesnt exist",file_path)
        continue
    ft_vec = global_utils.get_corpus_wvs_from_ft(file_path, dim, vocab_words)
    print(ft_vector_file,ft_vec.shape)
    np.save("processed/stage1/pretrained_word_vectors/"+dim_file_name[1]+".npy", ft_vec)


file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_100d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_300d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_100d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_300d_20e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_100d_50e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_50e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_300d_50e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_100d_50e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_300d_50e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_100d_100e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_100e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_300d_100e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_100d_100e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_100e.vec
file doesnt exist /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_300d_100e.vec

In [32]:



Out[32]:
(367260, 200)

Viewing word vectors


In [9]:
%autoreload
import global_utils

In [14]:
WORD_EMB_SIZE=200
ft_file_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec"
trained_embeddings = global_utils.get_embeddings_from_ft(ft_file_path, WORD_EMB_SIZE, corpus_vocab_list)
trained_embeddings.shape


Out[14]:
(352220, 200)

In [16]:
tb_vocab_size=5000

In [17]:
tb_vocab_biolab = list(trained_embeddings)[:tb_vocab_size]
with open("view_wvs_tb/tb_vocab.tsv", "w") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(corpus_vocab_list)

tb_word_vectors = np.random.randn(tb_vocab_size, 200)
for i,word in enumerate(tb_vocab_biolab):
    tb_word_vectors[i] = trained_embeddings[i]

In [22]:
%autoreload
from utils import visualize_embeddings_in_tensorboard
visualize_this_embedding = tb_word_vectors
print(visualize_this_embedding.shape)
metadata_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/view_wvs_tb/tb_vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/view_wvs_tb")


(5000, 200)

In [ ]:


In [ ]: