In [1]:
# Configuration: every input/output path lives under one directory, so moving
# the data only requires editing DATA_DIR.
DATA_DIR = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk"

# top 5k dice keywords, one per line (merged into `keywords` further down)
KEY_WORDS_FILE = DATA_DIR + "/top_5k_keywords.txt"
# Maximum number of synonyms written per keyword
TOPN = 30
# Output: keyword => "synonym|similarity" expansions (passed as expand_fname)
PAYLOAD_SYNONYMS_FILE = DATA_DIR + ("/top%i_keyword_synonyms.txt" % TOPN)
# Output: keyword => underscore-joined form (passed as map_fname)
SYNONYMS_FILE = DATA_DIR + "/keywords.txt"
# Input: phrase list read with load_stop_words
PHRASES_FILE = DATA_DIR + "/Phrases.txt"
# Input: saved model read with Word2Vec.load
MODEL_FILE = DATA_DIR + "/keyword_model.w2v"
In [2]:
#Shared
#just used to load phrases file
def load_stop_words(stop_words_file):
    """Read a one-entry-per-line word list into a set of lower-cased strings.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping, and lines starting with "#", are skipped.

    :param stop_words_file: path of the text file to read
    :return: set of the remaining lines, lower-cased
    """
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # Blank lines must be skipped before looking at the first
            # character: indexing "" with [0] raises IndexError.
            if not word or word.startswith("#"):
                continue
            stop_words.add(word.lower())
    return stop_words
In [3]:
#functions
def map_keyword(kw):
    """Return `kw` with every space character turned into an underscore."""
    pieces = kw.split(" ")
    return "_".join(pieces)
def write_most_similar_synonyms(topn, key_words, model, expand_fname, map_fname,
                                min_sim=0.01, candidate_factor=10):
    """Write keyword synonym files derived from a similarity model.

    For each keyword found in ``model.vocab`` the model is asked for its most
    similar terms; only terms that are themselves keywords and whose
    similarity exceeds ``min_sim`` are kept, up to ``topn`` per keyword.

    Two files are written:

    * ``expand_fname`` - one line per keyword that has at least one kept
      synonym: ``keyword=>syn1|sim1 syn2|sim2 `` (synonyms passed through
      ``map_keyword``, similarities formatted with ``%f``).
    * ``map_fname`` - one line ``term=>map_keyword(term)`` for every term that
      appears in the expansion file, sorted.

    :param topn: maximum number of synonyms kept per keyword
    :param key_words: iterable of keywords (de-duplicated internally)
    :param model: object exposing ``vocab`` (supports ``in``) and
        ``most_similar(positive=..., topn=...)`` yielding (term, similarity)
    :param expand_fname: path of the expansion file to write
    :param map_fname: path of the mapping file to write
    :param min_sim: similarity a candidate must exceed to be kept
    :param candidate_factor: ``topn * candidate_factor`` candidates are
        requested from the model, because candidates that are not keywords
        are filtered out afterwards
    :return: tuple ``(all_syns, missing, no_sim)`` of sets - terms written,
        keywords absent from the model vocabulary, and keywords with no
        qualifying synonym
    """
    key_words = set(key_words)
    missing = set()
    no_sim = set()
    all_syns = set()

    with open(expand_fname, "w") as exp_f:
        # Iterate in sorted order so repeated runs produce identical files
        # (plain set iteration order is arbitrary).
        for word in sorted(key_words):
            if word not in model.vocab:
                missing.add(word)
                continue

            top_matches = model.most_similar(positive=word, topn=topn * candidate_factor)
            valid = []
            for term, sim in top_matches:
                if term in key_words and sim > min_sim:
                    valid.append((term, sim))
                    if len(valid) >= topn:
                        break

            if not valid:
                no_sim.add(word)
                continue

            all_syns.add(word)
            exp_f.write("%s=>" % word)
            for term, sim in valid:
                all_syns.add(term)
                exp_f.write("%s|%f " % (map_keyword(term), sim))
            exp_f.write("\n")

    with open(map_fname, "w") as map_f:
        for syn in sorted(all_syns):
            map_f.write("%s=>%s\n" % (syn, map_keyword(syn)))

    return all_syns, missing, no_sim
In [4]:
import gensim, time
from gensim.models.word2vec import Word2Vec
# Load the saved Word2Vec model stored at MODEL_FILE.
model = Word2Vec.load(MODEL_FILE)
In [5]:
# Seed the keyword set with the phrases file; load_stop_words lower-cases each
# entry and drops "#" comment lines.
keywords = load_stop_words(PHRASES_FILE)
# Displayed as the cell output: number of phrases loaded.
len(keywords)
Out[5]:
In [6]:
# Merge every non-blank line of KEY_WORDS_FILE (whitespace-stripped, case left
# as-is) into the existing `keywords` set.
with open(KEY_WORDS_FILE) as kw_file:
    stripped_lines = (raw.strip() for raw in kw_file)
    keywords.update(entry for entry in stripped_lines if entry)
print("%i keywords loaded" % (len(keywords)))
In [7]:
# Generate both synonym files; the expansion file receives the weighted
# synonyms and the mapping file receives the keyword => underscore form.
all_syns, missing, no_sim = write_most_similar_synonyms(
    topn=TOPN,
    key_words=keywords,
    model=model,
    expand_fname=PAYLOAD_SYNONYMS_FILE,
    map_fname=SYNONYMS_FILE,
)
In [8]:
# Summary counts: keywords missing from the model vocabulary, keywords with no
# qualifying synonym, and total keywords. Written as a single formatted string
# so the output is the same space-separated line under Python 2 and Python 3.
print("%i %i %i" % (len(missing), len(no_sim), len(keywords)))