In [1]:
import sys
import os
import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../lib')
import gc
import random
import smart_open
import h5py
import csv
import tensorflow as tf
import gensim
import datetime as dt
from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
random_state_number = 967898
In [2]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
get_available_gpus()
Out[2]:
In [3]:
%pylab
%matplotlib inline
%load_ext autoreload
%autoreload
In [4]:
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
color = sns.color_palette()
In [13]:
corpus_vocab_list, corpus_vocab_wordidx = None, None
with open('processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:
    (corpus_vocab_list, corpus_vocab_wordidx) = pickle.load(f)
print(len(corpus_vocab_list), len(corpus_vocab_wordidx))

# later cells refer to the corpus vocabulary as vocab_words / vocab_wordidx
vocab_words, vocab_wordidx = corpus_vocab_list, corpus_vocab_wordidx
In [7]:
store = pd.HDFStore('processed/stage1/data_frames.h5')
train_df = store['train_df']
test_df = store['test_df']
In [9]:
from gensim.models.keyedvectors import KeyedVectors
biolab_keyed_vectors_pubmed_pmc_wiki = KeyedVectors.load_word2vec_format('external/biolab_wvs/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
In [10]:
biolab_words_pubmed_pmc_wiki = biolab_keyed_vectors_pubmed_pmc_wiki.vocab.keys()
biolab_words = set(biolab_words_pubmed_pmc_wiki)
len(biolab_words)
Out[10]:
In [12]:
vocab_biolab = set(biolab_words) & set(vocab_words)
print(len(vocab_biolab))
vocab_biolab
Out[12]:
In [14]:
vocab_not_in_biolab = set(vocab_words) - set(biolab_words)
print(len(vocab_not_in_biolab))
vocab_not_in_biolab
Out[14]:
We don't need a word-to-index dict here since these vectors are indexed by the words themselves.
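Since the vectors are keyed by the word string itself, a lookup needs no separate index mapping; a minimal sketch (the query word is just whatever happens to be in the shared vocabulary):
In [ ]:
# direct lookup by word string; no word-to-index dict is needed
example_word = next(iter(vocab_biolab))  # any word present in both vocabularies
print(example_word, biolab_keyed_vectors_pubmed_pmc_wiki.word_vec(example_word).shape)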
In [17]:
undesirable_ascii_characters = list(range(32))
undesirable_ascii_characters.remove(10)  # keep newline (ordinal 10) since the sentence tokenizer may rely on it
undesirable_charmap = dict.fromkeys(undesirable_ascii_characters)
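As a quick sanity check of the translation table (the sample string below is made up), control characters are dropped while the newline survives:
In [ ]:
# str.translate() deletes every ordinal mapped to None in the charmap
sample_text = "brca1\tmutation\x07 line1\nline2"
print(repr(sample_text.translate(undesirable_charmap)))  # tab and bell removed, '\n' kept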
In [20]:
from nltk import word_tokenize
from utils import custom_word_tokenizer, apply_custom_regx
custom_tokenized_biolab_pubmed_pmc_wiki_wv = {}
for word in vocab_biolab:
    # copy so the in-place averaging below does not touch the KeyedVectors matrix
    vector = biolab_keyed_vectors_pubmed_pmc_wiki.word_vec(word).copy()
    custom_tokenized_biolab_pubmed_pmc_wiki_wv[word.lower()] = vector
    # clean the word the same way the corpus text was cleaned before tokenizing
    word = word.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore')
    word = str(word).translate(undesirable_charmap)
    word = apply_custom_regx(word)
    word = word.replace('\\t', '')
    # average the current vector into any sub-token that already has a vector
    for part in word_tokenize(word):
        if part in custom_tokenized_biolab_pubmed_pmc_wiki_wv:
            custom_tokenized_biolab_pubmed_pmc_wiki_wv[part] += vector
            custom_tokenized_biolab_pubmed_pmc_wiki_wv[part] /= 2
In [21]:
len(custom_tokenized_biolab_pubmed_pmc_wiki_wv)
Out[21]:
In [27]:
tb_vocab_size=5000
In [38]:
tb_vocab_biolab = list(vocab_biolab)[:tb_vocab_size]
with open("view_wvs_tb/tb_vocab.tsv", "w") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(tb_vocab_biolab)
tb_word_vectors = np.random.randn(tb_vocab_size, 200)
for i, word in enumerate(tb_vocab_biolab):
    tb_word_vectors[i] = custom_tokenized_biolab_pubmed_pmc_wiki_wv[word]
In [40]:
%autoreload
from utils import visualize_embeddings_in_tensorboard
visualize_this_embedding = tb_word_vectors
print(visualize_this_embedding.shape)
metadata_path = "/home/bicepjai/Projects/dsotc/data_prep/view_wvs_tb/tb_vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, "/home/bicepjai/Projects/dsotc/data_prep/view_wvs_tb")
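visualize_embeddings_in_tensorboard comes from the local utils module; for reference, a self-contained sketch of the same idea using the TF 1.x projector API (an assumption about what the helper does, not its actual code):
In [ ]:
# illustrative sketch only; the real helper lives in utils.visualize_embeddings_in_tensorboard
from tensorflow.contrib.tensorboard.plugins import projector

def tb_embedding_sketch(embedding_array, metadata_path, log_dir):
    tf.reset_default_graph()
    embedding_var = tf.Variable(embedding_array, name="word_embedding", dtype=tf.float32)
    with tf.Session() as s:
        s.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(log_dir)
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        embedding.metadata_path = metadata_path
        projector.visualize_embeddings(writer, config)
        # produces visual_embed.ckpt-1, the name referenced in the checkpoint file below
        tf.train.Saver([embedding_var]).save(s, os.path.join(log_dir, "visual_embed.ckpt"), 1)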
In [35]:
del tb_word_vectors
In [22]:
corpus_word_vectors = np.random.randn(len(vocab_words), 200)
corpus_word_vectors.shape
Out[22]:
Fill in the pretrained biolab vectors where they are available.
In [23]:
for word in vocab_biolab:
    dataset_corpus_word_index = vocab_wordidx[word]
    corpus_word_vectors[dataset_corpus_word_index] = custom_tokenized_biolab_pubmed_pmc_wiki_wv[word]
Total words not updated with pretrained biolab vectors:
In [24]:
words_not_updated = set(vocab_words) - vocab_biolab
len(words_not_updated)
Out[24]:
In [25]:
words_not_updated
Out[25]:
In [26]:
np.save("processed/stage1/biolab_updated_wvs.npy", corpus_word_vectors)
In [14]:
dataset_corpus_words_list = np.load("dataset_corpus_words_list.npy")
corpus_word_vectors = np.load("corpus_word_vectors.npy")
In [15]:
tb_vocab_size = 10000
In [ ]:
local_tb_dir = "/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/gcloud/"
In [34]:
with open(local_tb_dir+"/vocab.tsv", "w") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(dataset_corpus_words_list[:tb_vocab_size])
For http://projector.tensorflow.org/, the vectors need to be in TSV form.
In [13]:
# np.savetxt("model_wv_visualize/word_vectors.tsv",corpus_word_vectors[:tb_vocab_size], delimiter='\t')
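For the web projector, both the vectors and the matching word metadata can be exported as TSV files; a minimal sketch (file names are illustrative, and it assumes dataset_corpus_words_list holds plain strings):
In [ ]:
# vectors: one row per word, tab-separated floats, no header
np.savetxt(local_tb_dir + "/word_vectors.tsv", corpus_word_vectors[:tb_vocab_size], delimiter='\t')
# metadata: one word per line, in the same order as the vector rows
with open(local_tb_dir + "/word_metadata.tsv", "w") as f:
    f.write('\n'.join(dataset_corpus_words_list[:tb_vocab_size]))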
Write the checkpoint file so TensorBoard's projector can locate the embedding checkpoint.
In [30]:
!rm $local_tb_dir/checkpoint
!ls $local_tb_dir
In [32]:
from word2vec import visualize_embeddings_in_tensorboard
visualize_this_embedding = corpus_word_vectors[:tb_vocab_size]
print(visualize_this_embedding.shape)
# path for gcloud tensorboard
metadata_path = "/home/bicepjai/projects/tb_visual/vocab.tsv"
# metadata_path = "/home/bicepjai/Projects/ml-compete/kaggle/mskrct/data_prep_2_ft/model_wv_visualize/vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, local_tb_dir)
In [33]:
checkpoint_txt = "model_checkpoint_path: \"/home/bicepjai/projects/tb_visual/visual_embed.ckpt-1\"\n\
all_model_checkpoint_paths: \"/home/bicepjai/projects/tb_visual/visual_embed.ckpt-1\""
with open(local_tb_dir+"/checkpoint","w") as f:
f.seek(0)
f.truncate()
f.write(checkpoint_txt)
In [ ]:
fasttext skipgram -minCount 1 -dim 200 -epoch 10 -input corpus_text_for_fast_text.txt -output ft_wvs_200d_10e
fasttext cbow -minCount 1 -dim 200 -epoch 10 -input corpus_text_for_fast_text.txt -output ft_wvs_200d_10e
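The resulting .vec files use the plain word2vec text format, so as an alternative to parsing them by hand (as done below) gensim can load them directly; a minimal sketch:
In [ ]:
# fastText's .vec output is word2vec text format; gensim reads it as-is
ft_kv = KeyedVectors.load_word2vec_format("processed/stage2/pretrained_word_vectors/ft_sg_200d_10e.vec", binary=False)
print(len(ft_kv.vocab), ft_kv.vector_size)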
In [32]:
fasttext_vec_file = "processed/stage2/pretrained_word_vectors/ft_sg_200d_10e.vec"
In [33]:
ft_lines = None
with open(fasttext_vec_file,"r") as f:
ft_lines = f.readlines()
In [34]:
print(ft_lines[0])
print(type(ft_lines), len(ft_lines))
ft_shape = tuple([int(i.strip()) for i in ft_lines[0].split()])
ft_shape
Out[34]:
In [35]:
print(len(ft_lines[1].split()))
ft_lines[1]
Out[35]:
In [36]:
ft_vocab_size=ft_shape[0]
ft_vocab_size
Out[36]:
In [37]:
ft_word_vectors = np.random.randn(ft_vocab_size, ft_shape[1])
ft_words = []
In [38]:
for i, line in enumerate(ft_lines[1:]):
    str_list = line.split()
    ft_words.append(str_list[0].strip())
    vec = np.array([float(f) for f in str_list[1:]])
    ft_word_vectors[i] = vec
In [39]:
ft_word_vectors.shape
Out[39]:
In [40]:
a = list(ft_words)
a.sort(key=len, reverse=True)
print(a[:10])
del a
In [41]:
ft_wordidx = {w:i for i,w in enumerate(ft_words)}
ft_vocab_size, len(ft_wordidx)
Out[41]:
In [42]:
len(set(vocab_words) - set(ft_words))
Out[42]:
In [43]:
set(vocab_words) - set(ft_words)
Out[43]:
In [80]:
%autoreload
import global_utils
fasttext_vec_file="/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/ft_cbow_200d_20e.vec"
wvs = global_utils.get_corpus_wvs_from_ft(fasttext_vec_file, 200, vocab_words)
wvs.shape
In [99]:
%ll /home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors
In [103]:
len(vocab_words)
Out[103]:
In [104]:
%autoreload
import global_utils
ft_vector_files = [
(100,"ft_cbow_100d_20e"),(200,"ft_cbow_200d_20e"),(200,"ft_cbow_300d_20e"),
(100,"ft_sg_100d_20e"),(200,"ft_sg_200d_20e"),(200,"ft_sg_300d_20e"),
(100,"ft_cbow_100d_50e"),(200,"ft_cbow_200d_50e"),(200,"ft_cbow_300d_50e"),
(100,"ft_sg_100d_50e"),(200,"ft_sg_200d_50e"),(200,"ft_sg_300d_50e"),
(100,"ft_cbow_100d_100e"),(200,"ft_cbow_200d_100e"),(200,"ft_cbow_300d_100e"),
(100,"ft_sg_100d_100e"),(200,"ft_sg_200d_100e"),(200,"ft_sg_300d_100e")
]
for dim, file_name in ft_vector_files:
    file_path = "/home/bicepjai/Projects/dsotc/data_prep/processed/stage1/pretrained_word_vectors/" + file_name + ".vec"
    if not os.path.exists(file_path):
        print("file doesn't exist", file_path)
        continue
    ft_vec = global_utils.get_corpus_wvs_from_ft(file_path, dim, vocab_words)
    print(file_name, ft_vec.shape)
    np.save("processed/stage1/pretrained_word_vectors/" + file_name + ".npy", ft_vec)
In [9]:
%autoreload
import global_utils
In [14]:
WORD_EMB_SIZE=200
ft_file_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec"
trained_embeddings = global_utils.get_embeddings_from_ft(ft_file_path, WORD_EMB_SIZE, corpus_vocab_list)
trained_embeddings.shape
Out[14]:
In [16]:
tb_vocab_size=5000
In [17]:
tb_vocab_words = corpus_vocab_list[:tb_vocab_size]
with open("view_wvs_tb/tb_vocab.tsv", "w") as fp:
    wr = csv.writer(fp, delimiter='\n')
    wr.writerow(tb_vocab_words)
tb_word_vectors = np.random.randn(tb_vocab_size, 200)
for i, word in enumerate(tb_vocab_words):
    tb_word_vectors[i] = trained_embeddings[i]
In [22]:
%autoreload
from utils import visualize_embeddings_in_tensorboard
visualize_this_embedding = tb_word_vectors
print(visualize_this_embedding.shape)
metadata_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/view_wvs_tb/tb_vocab.tsv"
visualize_embeddings_in_tensorboard(visualize_this_embedding, metadata_path, "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/view_wvs_tb")