In [1]:
# top 5k Dice keywords
KEY_WORDS_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt"
TOPN           = 30
PAYLOAD_SYNONYMS_FILE  = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top%i_keyword_synonyms.txt" % TOPN
SYNONYMS_FILE  = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keywords.txt"
PHRASES_FILE = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt"
MODEL_FILE     = "/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v"

In [2]:
#Shared helper
#reused here to load the phrases file (one entry per line, '#' marks a comment)
def load_stop_words(stop_words_file):
    stop_words = set()
    with open(stop_words_file) as f:
        for line in f:
            word = line.strip()
            # skip blank lines and comment lines
            if len(word) > 0 and word[0] != "#":
                word = word.lower()
                stop_words.add(word)
    return stop_words

In [3]:
#functions
def map_keyword(kw):
    return kw.replace(" ", "_")

def write_most_similar_synonyms(topn, key_words, model, expand_fname, map_fname):
    """For each keyword found in the model vocabulary, write up to topn similar
    keywords (with similarity payloads) to expand_fname, and write a keyword ->
    underscored-token mapping for every emitted keyword to map_fname."""
    key_words = set(key_words)
    missing = set()    # keywords not in the word2vec vocabulary
    no_sim = set()     # keywords with no sufficiently similar keyword matches
    all_syns = set()   # every keyword that ends up in the synonyms file
    with open(expand_fname, "w+") as exp_f:
        for word in key_words:
            if word not in model.vocab:
                missing.add(word)
                continue

            # over-fetch similar terms, then keep only those that are themselves keywords
            top_matches = model.most_similar(positive=word, topn=topn * 10)
            valid = []
            for t, sim in top_matches:
                if t in key_words and sim > 0.01:
                    valid.append((t, sim))
                    if len(valid) >= topn:
                        break
                
            if len(valid) > 0:
                all_syns.add(word)
                # one line per keyword: word=>syn1|score1 syn2|score2 ...
                exp_f.write("%s=>" % word)
                for key, val in valid:
                    all_syns.add(key)
                    kw = map_keyword(key)
                    exp_f.write("%s|%f " % (kw, val))
                exp_f.write("\n")
            else:
                no_sim.add(word)
                #print("No matching similar terms in word2vec model for term: %s" % word)
    with open(map_fname, "w+") as f:
        for syn in sorted(all_syns):
            f.write("%s=>%s\n" % (syn, map_keyword(syn)))
    return all_syns, missing, no_sim
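
The expand file is written in a payload-style synonym format: one line per keyword, the keyword followed by "=>", then up to TOPN space-separated synonym|similarity pairs, with multi-word synonyms underscored by map_keyword. The map file maps every emitted keyword to its underscored form. Illustrative lines (hypothetical keywords and scores, not actual output):

    hadoop=>big_data|0.812345 hdfs|0.798765 map_reduce|0.765432
    big data=>big_data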

In [4]:
from gensim.models.word2vec import Word2Vec

model = Word2Vec.load(MODEL_FILE)
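
Before generating the synonym files, a quick sanity check on the loaded model can be useful. This is a sketch using the same pre-1.0 gensim API as the code above; "java" is a hypothetical keyword assumed to be in the trained vocabulary:

    # hypothetical keyword - only query if it is actually in the vocabulary
    if "java" in model.vocab:
        print(model.most_similar(positive="java", topn=5))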

In [5]:
keywords = load_stop_words(PHRASES_FILE)
len(keywords)


Out[5]:
21278

In [6]:
with open(KEY_WORDS_FILE) as f:
    for line in f:
        kw = line.strip()
        if len(kw) > 0:
            keywords.add(kw)
print("%i keywords loaded" % (len(keywords)))


24114 keywords loaded

In [7]:
all_syns, missing, no_sim = write_most_similar_synonyms(TOPN, keywords, model, PAYLOAD_SYNONYMS_FILE, SYNONYMS_FILE)
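
To spot-check the generated payload synonyms file, printing the first few lines is usually enough (a sketch; the actual lines depend on the model and keyword list):

    with open(PAYLOAD_SYNONYMS_FILE) as f:
        for i, line in enumerate(f):
            print(line.strip())
            if i >= 4:
                break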

In [8]:
print("%i %i %i" % (len(missing), len(no_sim), len(keywords)))


4962 26 24114