In [1]:
import re
import os
import numpy as np
import json
import pickle
import datetime
import spacy
from keras.utils import to_categorical
from nltk.tokenize import word_tokenize,sent_tokenize
from load_squad_wiki_data import get_squad_data, get_squad_wiki_data
from gensim.models import Word2Vec
from spacy.en import English
nlp = spacy.load('en', parser=False, matcher=False, add_vectors=False)
nlp_en = English()


Using TensorFlow backend.

In [2]:
# Wraps a generator function so the corpus can be re-iterated (e.g., Word2Vec makes multiple passes over its input).
class MakeIter(object):
    def __init__(self, generator_func, **kwargs):
        self.generator_func = generator_func
        self.kwargs = kwargs
    def __iter__(self):
        return self.generator_func(**self.kwargs)
    
class Embeddings:
    def __init__(self, size, window, min_count, workers):
        
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        base_file_name = '_'.join([str(number) for number in [size, window, min_count, workers]])
        self.path_word2vec_model = '../data/word2vec_model_{0}.pickle'.format(base_file_name)
        self.path_word_tokenized_sentence = '../data/word_tokenized_sentence_{0}.json'.format(base_file_name)
        self.path_indexed_sentences = '../data/indexed_sentences_{0}.json'.format(base_file_name)
        self.path_vocabulary = '../data/vocabulary_{0}.json'.format(base_file_name)
        self.path_google_intersected = '../data/google_intersected_model_{0}.pickle'.format(base_file_name)
        # Build the indexed-sentence file on construction if it is not already on disk.
        self.index_sentences()
    
    def tokenize_sentences(self):
        tokenized_sentences = []
        embeddings_generator = self.load_embeddings()
        for sentence in embeddings_generator:
            tokenized_sentence = word_tokenize(sentence.lower())
            tokenized_sentences.append(tokenized_sentence)
        with open(self.path_word_tokenized_sentence, "w") as outfile:
            json.dump(tokenized_sentences, outfile)
        

    def create_embeddings(self):
        sentences = self.get_tokenized_sentences()
        word2vec_model = Word2Vec(sentences, size=self.size, window=self.window, min_count=self.min_count, workers=self.workers)
        word2index = dict([(k, v.index) for k, v in word2vec_model.wv.vocab.items()])
        with open(self.path_vocabulary, "w") as output:
            json.dump(word2index, output)
        with open(self.path_word2vec_model, 'wb') as output:
            pickle.dump(word2vec_model, output)
    
    def create_google_intersected_embeddings(self):
        word2vec_model = self.get_model()
        intersected_model = self.load_google_word2vec_model(word2vec_model) 
        with open(self.path_google_intersected, "wb") as output:
            pickle.dump(intersected_model, output)
            
        
    def index_sentences(self):
        if not os.path.isfile(self.path_indexed_sentences):
            tokenized_sentences = self.get_tokenized_sentences()
            # Called for its side effects: ensures the word2vec model, vocabulary and
            # Google-intersected model have been built and written to disk.
            self.get_intersected_model()
            word2index, index2word = self.get_vocabulary()
            # Skip out-of-vocabulary tokens (possible when min_count > 1), as get_indexed_query does.
            indexed_sentences = [[word2index[word] for word in sent if word in word2index] for sent in tokenized_sentences]
            with open(self.path_indexed_sentences, "w") as outfile:
                json.dump(indexed_sentences, outfile)

    def get_raw_text(self, dataset):
        # Yields each question with multi-word entities joined by underscores,
        # punctuation stripped, and SQUADSTART/SQUADEND markers added.
        for data in dataset:
            for question in data['Question']:
                question = self.noun_chunkers(question)
                question = "SQUADSTART " + re.sub(r'[^\w\'\+\-\=\*\s\^]', '', question) + " SQUADEND"
                yield question
    
    def load_embeddings(self):
        print("Loading embeddings....")
        dataset = get_squad_wiki_data()
        return self.get_raw_text(dataset)

    # Returns word2Index and index2word
    def get_vocabulary(self):
        with open(self.path_vocabulary, 'r') as f:
            data = json.load(f)
        word2idx = data
        idx2word = dict([(v, k) for k, v in data.items()])
        return word2idx, idx2word

    # Returns the pickled model
    def get_model(self):
        if not os.path.isfile(self.path_word2vec_model):
            print("Creating Embeddings...")
            self.create_embeddings()
        print("Loading Embeddings...")
        with open(self.path_word2vec_model, 'rb') as f:
            model = pickle.load(f)
        return model
    
    def get_tokenized_sentences(self):
        if not os.path.isfile(self.path_word_tokenized_sentence):
            print("Creating Tokenized Sentences...")
            self.tokenize_sentences()
        print("Loading Indexed Sentences...")
        with open(self.path_word_tokenized_sentence, "r") as file:
            tokenized_sentences = json.load(file)
        return tokenized_sentences

    def get_indexed_sentences(self):
        if not os.path.isfile(self.path_indexed_sentences):
            print("Creating Indexed Sentences...")
            self.index_sentences()
        print("Loading Indexed Sentences...")
        with open(self.path_indexed_sentences, 'r') as f:
            indexed_sentences = json.load(f)
        return indexed_sentences
        
    def load_google_word2vec_model(self, model):
        print("INTERSECTING GOOGLES WORD2VEC MODEL WITH ORIGINAL WORD2VEC MODEL")
        model.intersect_word2vec_format(fname = '../model/GoogleNews-vectors-negative300.bin' , lockf = 1.0, binary = True)        
        return model
    
    def get_intersected_model(self):
        if not os.path.isfile(self.path_google_intersected):
            self.create_google_intersected_embeddings()
        with open(self.path_google_intersected, "rb") as output:
            intersected_model = pickle.load(output)
        return intersected_model
            
    def noun_chunkers(self, raw_text):
        doc = nlp_en(raw_text)
        for entity in doc.ents:
            raw_text = raw_text.replace(str(entity), "_".join(str(entity).split()))
        return raw_text
    
    def get_indexed_query(self, query):
        query = self.noun_chunkers(query)
        query = "SQUADSTART " + re.sub(r'[^\w\'\+\-\=\*\s\^]', '', query)
        word_tokenized_query = word_tokenize(query.lower())
        word2index, index2word = self.get_vocabulary()
        indexed_query = [word2index[word] for word in word_tokenized_query if word in word2index]
        return indexed_query
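
The getters above are lazy: each builds its artifact on first use and loads it from disk afterwards, and __init__ already kicks off index_sentences(). A minimal usage sketch of that chain (not part of the original notebook), assuming load_squad_wiki_data and ../model/GoogleNews-vectors-negative300.bin are available at the hard-coded paths:

# Hedged sketch: the constructor triggers the full build if the files are missing.
emb = Embeddings(300, 4, 1, 4)
indexed = emb.get_indexed_sentences()             # tokenize -> train Word2Vec -> intersect with Google vectors -> index
word2index, index2word = emb.get_vocabulary()     # written as a side effect of create_embeddings()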

In [3]:
e = Embeddings(300, 4, 1, 4)

In [4]:
e.noun_chunkers("when was Super Bowl 50. Indian Space Research Organization is known as ISRO. World  Health Organization.")


Out[4]:
'when was Super_Bowl 50. Indian_Space_Research_Organization is known as ISRO. World_Health_Organization.'
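
Joining entity tokens with underscores keeps multi-word names together through NLTK's word_tokenize, so each entity later maps to a single word2vec vector. A quick illustrative check (not from the original run):

word_tokenize("Indian_Space_Research_Organization is known as ISRO".lower())
# -> ['indian_space_research_organization', 'is', 'known', 'as', 'isro']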

In [5]:
a,b = e.get_vocabulary()


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-5-2745c42eed2d> in <module>()
----> 1 a,b = e.get_vocabulary()

<ipython-input-2-058f925c0c99> in get_vocabulary(self)
     71     # Returns word2Index and index2word
     72     def get_vocabulary(self):
---> 73         with open(self.path_vocabulary, 'r') as f:
     74             data = json.load(f)
     75         word2idx = data

FileNotFoundError: [Errno 2] No such file or directory: '../data/vocabulary_300_4_1_4.json'
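
The vocabulary JSON is written only by create_embeddings(), which runs inside get_model(); since the constructor found an existing indexed-sentences file, that chain was skipped and ../data/vocabulary_300_4_1_4.json was never produced. A hedged sketch of one way to recover, assuming the training data and Google vectors are present:

# Building the word2vec model writes the vocabulary file as a side effect.
model = e.get_model()
word2index, index2word = e.get_vocabulary()
indexed_query = e.get_indexed_query("when was Super Bowl 50")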

In [ ]:
# def preprocessor(self, raw_text, size, window, min_count, workers):  
# #         tokenized_sentences = self.tokenize_sentence(raw_text)
#         print("STOREING RAW TEXT AFTER REGEX AND WORD TOKENIZATION ")
#         with open("../data/tokenized_sentences_after_regex.json","w") as outfile:
#             json.dump(tokenized_sentences,outfile)
        
#         tokenized_pos_sentences = self.find_POS(tokenized_sentences)
#         vocab = ['PUNCT','SYM','X','ADJ','VERB','CONJ','NUM','DET','ADV','PROPN','NOUN','PART','INTJ','CCONJ','SPACE','ADP','SCONJ','AUX', 'PRON']
#         vocab = dict((word, index) for index, word in enumerate(vocab))
#         with open(self.path_pos_indexed_vocabulary,'w') as outfile:
#             json.dump(vocab, outfile)
#         # initialize word2vector model
#         model = Word2Vec(sentences = tokenized_sentences, size = size, window = window, min_count = min_count, workers = workers)
#         intersected_model = self.load_google_word2vec_model(model)
#         # finding out the vocabulary of raw_text with index     
#         vocab = dict([(k, v.index) for k, v in intersected_model.wv.vocab.items()])
#         # Storing the vocab2index in a separate file
#         with open(self.path_indexed_vocabulary,'w') as outfile:
#             json.dump(vocab, outfile)
#          # finding gensim weights
#         weights = intersected_model.wv.syn0
#         # storing weights in wordembeddings.npz file
#         np.save(open(self.path_word_embeddings, 'wb'), weights)
#         # dump the word2vec model in dump file word2vec_model