In [27]:
# Input: top 5k Dice search keywords, one per line.
# Set to None to use phrases only.
KEY_WORDS_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt"
# Input: extracted phrases file, one phrase per line.
# Set to None to use keywords only.
PHRASES_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt"
# Output: "term=>idx|value ..." lines written by vectors_to_file below.
# NOTE(review): absolute local paths — consider a configurable DATA_DIR.
VECTOR_SYNONYMS_FILE  = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/vector_synonyms.txt"
# Input: trained gensim Word2Vec model, loaded via Word2Vec.load below.
MODEL_FILE     = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v"

In [15]:
import numpy as np
#Shared
def load_stop_words(stop_words_file):
    """Load a set of lower-cased terms from a file, one term per line.

    Lines starting with '#' are treated as comments and skipped.
    Despite the name, this is also used to load the phrases file.

    :param stop_words_file: path to a text file, one term per line
    :return: set of stripped, lower-cased terms
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # skip blank lines (the original `word[0]` raised IndexError
            # on an empty line) as well as '#' comment lines
            if word and not word.startswith("#"):
                stop_words.add(word.lower())
    return stop_words

def get_vector(item, model):
    """Return the raw (un-normalized) embedding row stored for ``item``.

    :param item: token to look up; must be present in ``model.vocab``
    :param model: word2vec-style model exposing ``vocab`` and ``syn0``
    :return: the embedding row of ``item`` from ``model.syn0``
    """
    row = model.vocab[item].index
    return model.syn0[row]

def get_norm_vector(item, model):
    """Return the unit-length (L2-normalized) embedding for ``item``.

    Returns None when ``item`` is not in the model vocabulary.  The norm
    is computed on the fly because deserialized models do not carry the
    pre-computed norm vectors; a zero vector is returned unchanged to
    avoid dividing by zero.
    """
    if item not in model.vocab:
        return None
    vec = get_vector(item, model)
    length = np.linalg.norm(vec)
    return vec if length == 0 else vec / length

In [29]:
#functions
def is_valid_search_keyword(kw):
    """Return True when ``kw`` looks like a plain keyword, False otherwise.

    Rejected: keywords containing parentheses or boolean-query fragments
    (and/or/not/true/false), single-character keywords that are not
    alphabetic, and keywords containing a negated ("-" prefixed) term.
    """
    padded = " " + kw + " "
    # boolean-query fragments that mark a full query rather than a keyword
    for fragment in "(,), and , or , not , true , TRUE , false , FALSE ".split(","):
        if fragment in padded:
            return False

    tokens = kw.split(" ")

    # a single-character keyword is only kept if alphabetic (e.g. "c", "r")
    if len(tokens) == 1 and len(tokens[0]) == 1:
        return tokens[0].isalpha()

    # drop queries containing negated terms
    return not any(token.strip().startswith("-") for token in tokens)

def map_keyword(kw):
    """Collapse a multi-word keyword into one token by joining on '_'."""
    return "_".join(kw.split(" "))

def vectors_to_file(fname, terms, model):
    """Write one line per in-vocabulary term: ``term=>000|v0 001|v1 ...``.

    Component indices are zero-padded to 3 characters so each index/value
    pair has a fixed width.  Terms absent from the model are skipped.

    :param fname: output file path (overwritten)
    :param terms: iterable of terms to export
    :param model: word2vec model used to look up normalized vectors
    """
    with open(fname, "w+") as out:
        for term in terms:
            norm_vec = get_norm_vector(term, model)
            if norm_vec is None:
                continue  # term not in the model's vocabulary
            out.write("%s=>" % term)
            for idx, component in enumerate(norm_vec):
                out.write("%s|%f " % (str(idx).rjust(3, "0"), component))
            out.write("\n")

In [10]:
import gensim, time
from gensim.models.word2vec import Word2Vec

# Load the pre-trained keyword word2vec model from disk.
# NOTE(review): the `gensim` and `time` imports appear unused in the
# visible cells — confirm before removing.
model = Word2Vec.load(MODEL_FILE)

In [17]:
# The phrases file shares the one-term-per-line format, so reuse the loader.
if PHRASES_FILE is None:
    phrases = set()
else:
    phrases = load_stop_words(PHRASES_FILE)
len(phrases)


Out[17]:
24785

In [21]:
# Read the top-keywords file, keeping only non-empty lines that pass the
# keyword validity filter (no boolean syntax, no negated terms, etc.).
un_keywords = set()
if KEY_WORDS_FILE is not None:
    with open(KEY_WORDS_FILE) as f:
        for raw_line in f:
            keyword = raw_line.strip()
            if keyword and is_valid_search_keyword(keyword):
                un_keywords.add(keyword)
    print("%i keywords loaded from %s" % (len(un_keywords), KEY_WORDS_FILE))


4047 keywords loaded from /Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt

In [26]:
# Combine keywords and phrases into the final set of terms to vectorize.
all_terms = phrases.union(un_keywords)
print(len(all_terms), "total terms")


(27125, 'total terms')

In [31]:
vectors_to_file(VECTOR_SYNONYMS_FILE, all_terms, model)