In [27]:
#top 5k dice keywords
# set to none to use phrases only
KEY_WORDS_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt"
# set to none to use keywords only
PHRASES_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt"
# output file: one "term=>NNN|value ..." line per in-vocabulary term (written by vectors_to_file)
VECTOR_SYNONYMS_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/vector_synonyms.txt"
# pre-trained gensim Word2Vec model (accessed below via the legacy vocab/syn0 attributes)
MODEL_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v"
# NOTE(review): absolute local paths — consider a configurable DATA_DIR so the
# notebook is runnable on other machines.
In [15]:
import numpy as np
#Shared
#just used to load phrases file
def load_stop_words(stop_words_file):
    """Load a newline-delimited term list, skipping blanks and '#' comment lines.

    Despite the name, this is also used to load the phrases file.

    Parameters
    ----------
    stop_words_file : str
        Path to a text file with one term per line.

    Returns
    -------
    set of str
        The lower-cased terms from the file.
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # Guard against blank lines: the original `word[0]` raised
            # IndexError when a stripped line was empty.
            if word and word[0] != "#":
                stop_words.add(word.lower())
    return stop_words
def get_vector(item, model):
    """Return the raw embedding vector for `item`.

    Uses the legacy gensim word2vec API: `model.vocab` maps a term to a
    vocab entry whose `index` selects a row of the `model.syn0` matrix.
    Raises KeyError when `item` is not in the vocabulary.
    """
    entry = model.vocab[item]
    return model.syn0[entry.index]
def get_norm_vector(item, model):
    """Return the L2-normalised embedding for `item`, or None if out of vocab.

    Deserialised gensim models do not retain the precomputed norm vectors,
    so the norm is recomputed here. A zero vector is returned unchanged to
    avoid division by zero.
    """
    if item not in model.vocab:
        return None
    raw = get_vector(item, model)
    length = np.linalg.norm(raw)
    return raw / length if length != 0 else raw
In [29]:
#functions
def is_valid_search_keyword(kw):
    """Return True if `kw` looks like a plain search keyword.

    Rejects anything containing parentheses or boolean-query operators
    (and/or/not/true/false in the exact cases listed below), negated
    "-term" tokens, and single-character keywords unless they are
    alphabetic (so language names like "c" and "r" survive).
    """
    padded = " " + kw + " "
    # Boolean-query markers disqualify the keyword outright.
    if any(marker in padded
           for marker in "(,), and , or , not , true , TRUE , false , FALSE ".split(",")):
        return False
    tokens = kw.split(" ")
    # A single one-character token is kept only when it is a letter.
    if len(tokens) == 1 and len(tokens[0]) == 1:
        return tokens[0].isalpha()
    # Reject queries containing negations like "-senior".
    return not any(t.strip().startswith("-") for t in tokens)
def map_keyword(kw):
    """Collapse a multi-word keyword into a single token, space -> underscore."""
    return "_".join(kw.split(" "))
def vectors_to_file(fname, terms, model):
    """Write normalised term vectors to `fname`.

    Each in-vocabulary term produces one line of the form
    ``term=>000|0.123456 001|-0.045678 ...``; terms missing from the
    model's vocabulary are silently skipped.
    """
    with open(fname, "w+") as out:
        for term in terms:
            norm_vec = get_norm_vector(term, model)
            # None means the term is out of vocabulary — skip it.
            if norm_vec is None:
                continue
            out.write("%s=>" % term)
            for idx, component in enumerate(norm_vec):
                # Zero-pad the index so every component has the same width.
                out.write("%s|%f " % (str(idx).rjust(3, "0"), component))
            out.write("\n")
In [10]:
import gensim, time
from gensim.models.word2vec import Word2Vec
# NOTE(review): `time` appears unused in this notebook chunk.
# Load the pre-trained keyword embedding model from disk.
model = Word2Vec.load(MODEL_FILE)
In [17]:
# Load the phrases file when configured; otherwise use an empty set.
# (load_stop_words is just a generic "one lower-cased term per line" loader.)
phrases = load_stop_words(PHRASES_FILE) if PHRASES_FILE is not None else set()
# Last expression displays the phrase count in the notebook output.
len(phrases)
Out[17]:
In [21]:
# Load candidate keywords, keeping only entries that pass the
# boolean-query/negation filter in is_valid_search_keyword.
un_keywords = set()
if KEY_WORDS_FILE is not None:
    with open(KEY_WORDS_FILE) as f:
        for raw_line in f:
            candidate = raw_line.strip()
            if candidate and is_valid_search_keyword(candidate):
                un_keywords.add(candidate)
print("%i keywords loaded from %s" % (len(un_keywords), KEY_WORDS_FILE))
In [26]:
# Combine filtered keywords and phrases into the full term set for vector export.
all_terms = un_keywords.union(phrases)
print(len(all_terms), "total terms")
In [31]:
vectors_to_file(VECTOR_SYNONYMS_FILE, all_terms, model)