In [2]:
import re
import os
import numpy as np
import json
import pickle
import datetime
import spacy
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize,sent_tokenize
from load_squad_wiki_data import get_squad_data, get_squad_wiki_data
from gensim.models import Word2Vec
nlp = spacy.load('en', parser=False, entity=False, matcher=False, add_vectors=False)

In [ ]:
class Embeddings:
    def __init__(self, size, window, min_count, workers):
        
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        base_file_name = '_'.join([str(number) for number in [size, window, min_count, workers]])
        self.path_word2vec_model = '../data/word2vec_model_{0}.pickle'.format(base_file_name)
        self.path_indexed_sentences = '../data/indexed_sentences_{0}.json'.format(base_file_name)
        self.path_word_embeddings = '../data/word_embeddings_{0}.npz'.format(base_file_name)
        self.path_indexed_vocabulary = '../data/indexed_vocabulary_{0}.json'.format(base_file_name)
        self.path_pos_categorical_indexed_sentences = '../data/pos_categorical_indexed_sentences_{0}.json'.format(base_file_name)
        self.path_pos_indexed_vocabulary = '../data/pos_indexed_vocabulary_{0}.json'.format(base_file_name)
        self.load_embeddings()
    
    def tokenize_sentence(self, raw_text):
        sentences = sent_tokenize(raw_text)
        sentences = [re.sub(r'[^\w\'\+\-\=\*\s\^]', '', sent) for sent in sentences]
        tokenized_sentences = [word_tokenize(sent) for sent in sentences]
        return tokenized_sentences
    
    def tokenize_index_sentence(self, sentence):
        word2index, index2word = self.get_vocabulary()
        tokenized_sentences = self.tokenize_sentence(sentence.lower())
        # skip tokens missing from the word2vec vocabulary to avoid a KeyError
        indexed_sentences = [[word2index[word] for word in sent if word in word2index]
                             for sent in tokenized_sentences]
        return indexed_sentences

    def tag_sentence(self, text):
        tokenized_sentences = self.tokenize_sentence(text.lower())
        tokenized_pos_sentences = self.find_POS(tokenized_sentences)
        return tokenized_pos_sentences
    
    def find_POS(self, tokenized_sentences):
        final_pos_sents = []
        for sent in tokenized_sentences:
            doc = nlp(' '.join(sent))
            pos = []
            for word in doc:
                pos.append(word.pos_)
            final_pos_sents.append(pos)
        return final_pos_sents 

        
    def preprocessor(self, raw_text, size, window, min_count, workers):  
        tokenized_sentences = self.tokenize_sentence(raw_text)
        tokenized_pos_sentences = self.find_POS(tokenized_sentences)
        # fixed set of spaCy part-of-speech tags used as the POS vocabulary
        vocab = ['PUNCT','SYM','X','ADJ','VERB','CONJ','NUM','DET','ADV','PROPN','NOUN','PART','INTJ','CCONJ','SPACE','ADP','SCONJ','AUX','PRON']
        vocab = dict((word, index) for index, word in enumerate(vocab))
        with open(self.path_pos_indexed_vocabulary, 'w') as outfile:
            json.dump(vocab, outfile)
        # initialize and train the word2vec model on the tokenized sentences
        model = Word2Vec(sentences=tokenized_sentences, size=size, window=window, min_count=min_count, workers=workers)
        # build the word-to-index vocabulary learned by the model
        vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
        # store the word-to-index vocabulary in a separate file
        with open(self.path_indexed_vocabulary, 'w') as outfile:
            json.dump(vocab, outfile)
        # extract the gensim embedding weights
        weights = model.wv.syn0
        # store the weights in the word_embeddings .npz file
        np.save(open(self.path_word_embeddings, 'wb'), weights)
        # pickle the trained word2vec model
        with open(self.path_word2vec_model, 'wb') as output:
            pickle.dump(model, output)

    def get_raw_text(self, dataset):
        # build the corpus from the question text only; the passage text is intentionally left out
        raw_text = ""
        #passage_text = "" 
        question_text = ""
        #passage_text_list = []
        question_text_list = []
        for data in dataset:
            #passage_text_list.append(data['Paragraph'])
            question_text_list.extend(data['Question'])                
        #passage_text = "".join(passage_text_list)
        question_text = " ".join(question_text_list)
        #raw_text = passage_text + " " + question_text
        raw_text = question_text
        raw_text = raw_text.lower()
        return raw_text
    
    def load_embeddings(self):
        if not (os.path.isfile(self.path_word2vec_model) and
                os.path.isfile(self.path_word_embeddings) and
                os.path.isfile(self.path_indexed_vocabulary) and
                os.path.isfile(self.path_pos_indexed_vocabulary)):
            print("Embedding cache not found, training word2vec....")
            dataset = get_squad_wiki_data()
            raw_text = self.get_raw_text(dataset)
            self.preprocessor(raw_text, self.size, self.window, self.min_count, self.workers)
        print("Loading the embeddings from the cache")
        if not (os.path.isfile(self.path_pos_categorical_indexed_sentences) and
                os.path.isfile(self.path_indexed_sentences)):
            print("Tokenizing and POS tagging the SQuAD data....")
            squad_data = get_squad_data()
            raw_text = self.get_raw_text(squad_data)
            self.create_tokenized_squad_corpus(raw_text)
            self.create_pos_tokenized_squad_corpus(raw_text)
        
        

    # Load and return the weights from the cached word embeddings .npz file
    def get_weights(self):
        weights = np.load(open(self.path_word_embeddings,'rb'))
        return weights

    # Returns word2Index and index2word
    def get_vocabulary(self):
        with open(self.path_indexed_vocabulary, 'r') as f:
            data = json.load(f)
        word2idx = data
        idx2word = dict([(v, k) for k, v in data.items()])
        return word2idx, idx2word

    # Returns the pickled model
    def get_model(self):
        with open(self.path_word2vec_model,'rb') as output:
            model = pickle.load(output)
        return model

    # Returns the tokenized sentences
    def get_tokenized_indexed_sentences(self):
        with open(self.path_indexed_sentences, 'r') as f:
            tokenized_sentences = json.load(f)
        return tokenized_sentences
    
    # Returns pos2Index and index2pos
    def get_pos_vocabulary(self):
        with open(self.path_pos_indexed_vocabulary, 'r') as f:
            data = json.load(f)
        pos2idx = data
        idx2pos = dict([(v, k) for k, v in data.items()])
        return pos2idx, idx2pos
    
    # Returns the tokenized pos sentences
    def get_pos_categorical_indexed_sentences(self):
        with open(self.path_pos_categorical_indexed_sentences, 'r') as f:
            tokenized_pos_sentences = json.load(f)
        return tokenized_pos_sentences
    
    def create_tokenized_squad_corpus(self, squad_corpus):
        print("Creating Tokenized Squad Corpus")
        tokenized_indexed_sentences = self.tokenize_index_sentence(squad_corpus)
        with open(self.path_indexed_sentences, "w") as f:
            json.dump(tokenized_indexed_sentences, f)
        
    def create_pos_tokenized_squad_corpus(self, squad_corpus):
        print("Creating POS Tokenized Squad Corpus")
        tokenized_pos_sentences = self.tag_sentence(squad_corpus)
        pos2idx, idx2pos = self.get_pos_vocabulary()
        # one-hot encode each POS tag so every sentence becomes a (length, num_tags) matrix
        categorical_pos_sentences = [to_categorical([pos2idx[word] for word in sent],
                                                    num_classes=len(pos2idx)).tolist()
                                     for sent in tokenized_pos_sentences]
        with open(self.path_pos_categorical_indexed_sentences, "w") as f:
            json.dump(categorical_pos_sentences, f)
            
    def load_google_word2vec_model(self):
        #google_word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('../model/GoogleNews-vectors-negative300.bin',binary = True, encoding = 'utf8')
        print("LOADING TRAINED SQUAD AND WIKI WORD2VEC MODEL.....")
        wiki_word2vec_model = self.get_model()
        print("INTERSECTING GOOGLE'S WORD2VEC MODEL WITH THE TRAINED MODEL")
        wiki_word2vec_model.intersect_word2vec_format(fname='../model/GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)
        # return the intersected model so the caller can use or re-save it
        return wiki_word2vec_model

In [3]:
# start_date = datetime.datetime.now()
print("EMBEDDING(100,4,1,4) STARTED .....")
e = Embeddings(100, 4, 1, 4)
print("EMBEDDING(100,4,1,4) COMPLETED .....")
# end_date = datetime.datetime.now()
# print("TOTAL TIME ELAPSED IN EMBEDDINGS:", end_date - start_date)
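
In [ ]:
# Usage sketch, not part of the original pipeline: feed the cached word2vec weights
# into a Keras Embedding layer. Freezing the layer (trainable=False) is an assumption
# made here for illustration, not a choice taken from this notebook.
from keras.layers import Embedding
weights = e.get_weights()                      # (vocab_size, 100) weight matrix
embedding_layer = Embedding(input_dim=weights.shape[0],
                            output_dim=weights.shape[1],
                            weights=[weights],
                            trainable=False)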

In [ ]:
# print("CALLING INTERSECT FUNCTION OF EMBEDDING .....")
# e.load_google_word2vec_model()
# print("WORD2VEC INTERSECTION DONE.....")

In [4]:
# e.get_model()

In [5]:
# e.get_weights()

In [6]:
# e.get_vocabulary()

In [7]:
# e.get_tokenized_indexed_sentences()
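
In [ ]:
# Sketch, not in the original notebook: the indexed sentences above are variable-length
# lists, so a downstream model would normally pad them to a common length first;
# keras.preprocessing.sequence.pad_sequences is the standard Keras utility for that.
from keras.preprocessing.sequence import pad_sequences
sentences = e.get_tokenized_indexed_sentences()
padded = pad_sequences(sentences, padding='post')   # (num_sentences, max_len) int array
print(padded.shape)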

In [8]:
# print(e.tokenize_index_sentence("this is Nikola Tesla"))
# e.tag_sentence("this is nikola tesla")
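
In [ ]:
# Illustration, not original output: how a tagged sentence maps onto the one-hot
# matrices that create_pos_tokenized_squad_corpus stores; the example tags are only
# a guess at what spaCy would return for this sentence.
pos2idx, idx2pos = e.get_pos_vocabulary()
tags = e.tag_sentence("this is nikola tesla")[0]     # e.g. ['DET', 'VERB', 'PROPN', 'PROPN']
one_hot = to_categorical([pos2idx[t] for t in tags], num_classes=len(pos2idx))
print(one_hot.shape)                                 # (sentence_length, 19)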
