In [1]:
import word2vec
from string import punctuation
import codecs, re
import cPickle as pickle
In [12]:
# Character-level translation table used to normalize the raw sentences:
# every ASCII punctuation mark maps to a single space, and every digit maps
# to its English name padded with spaces so it becomes a separate token.
trans = {s: ' ' for s in punctuation}
trans.update({
    '0': ' zero ',
    '1': ' one ',
    '2': ' two ',
    '3': ' three ',
    '4': ' four ',
    '5': ' five ',
    '6': ' six ',
    '7': ' seven ',
    '8': ' eight ',
    '9': ' nine ',
})

# Rewrite the sentence list into data/text, one input row at a time.
# Both files are managed by `with` so they are closed even if a row fails,
# and the input is opened first so a missing CSV does not truncate an
# existing data/text. Rows are streamed instead of loaded via readlines().
with codecs.open('data/stanford_sentence_list.csv', 'r', 'utf-8') as fin, \
        codecs.open('data/text', 'w', 'utf-8') as fout:
    for row in fin:
        # Characters in the table are replaced; everything else is lowercased.
        cleaned = ''.join(trans.get(c, c.lower()) for c in row)
        # The replacements introduce runs of spaces; collapse each to one.
        fout.write(re.sub(' +', ' ', cleaned))
In [24]:
# Run word2vec.word2phrase over data/text (the normalized sentences written
# earlier in this notebook). The second path, data/criteria_phrases, is the
# file the word2vec.word2vec call in this notebook reads, so it is presumably
# the output of this step.
word2vec.word2phrase('data/text',
'data/criteria_phrases', verbose=True)
In [25]:
# Run word2vec.word2vec on data/criteria_phrases; data/criteria.bin is the
# file later passed to word2vec.load, so it is presumably the trained model.
# size=100 is presumably the vector dimensionality -- TODO confirm.
word2vec.word2vec('data/criteria_phrases',
'data/criteria.bin', size=100, verbose=True)
In [26]:
# Run word2vec.word2clusters; data/criteria-clusters.txt is the file later
# passed to word2vec.load_clusters. The positional 100 is presumably the
# number of clusters -- TODO confirm.
# NOTE(review): this reads data/text, not data/criteria_phrases (the input
# used for the vector model). The clusters are later attached to that model,
# so confirm the two vocabularies are meant to line up.
word2vec.word2clusters('data/text',
'data/criteria-clusters.txt', 100, verbose=True)
In [29]:
# Load data/criteria.bin into the module-level `model`, which get_syn()
# reads as a global.
model = word2vec.load('data/criteria.bin')
In [18]:
def get_syn(word, n=20):
    """Return the top ``n`` matches for ``word`` from the loaded model.

    Queries the module-level ``model`` with ``model.cosine`` and turns the
    result of ``model.generate_response`` into a plain Python list.

    Args:
        word: Entry to look up in ``model``.
        n: Number of matches requested from ``model.cosine``. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        The ``.tolist()`` conversion of
        ``model.generate_response(indexes, metrics)``.
    """
    indexes, metrics = model.cosine(word, n=n)
    return model.generate_response(indexes, metrics).tolist()
In [19]:
# Look up 'drug' with get_syn and keep the result for inspection.
syns = get_syn('drug')
In [17]:
# IPython-only syntax: `??` asks IPython to show help/source for
# model.cosine. This line is not valid in a plain Python script.
model.cosine??
In [20]:
# Bare expression: in a notebook this displays the current value of syns.
syns
Out[20]:
In [27]:
# Load data/criteria-clusters.txt into the module-level `clusters`, which
# get_cluster() reads as a global.
clusters = word2vec.load_clusters('data/criteria-clusters.txt')
In [28]:
def get_cluster(word, n=20):
    """Return up to ``n`` words that share a cluster with ``word``.

    Looks ``word`` up in the module-level ``clusters`` to get its cluster
    index, then slices the result of ``clusters.get_words_on_cluster``.

    Args:
        word: Entry to look up in ``clusters``.
        n: Maximum number of words to return. Defaults to 20, the value
            that was previously hard-coded.

    Returns:
        The first ``n`` items of ``clusters.get_words_on_cluster(index)``.
    """
    cluster_ix = clusters[word]
    return clusters.get_words_on_cluster(cluster_ix)[:n]
In [33]:
# Fetch the leading words of the cluster that contains 'drug'.
clust = get_cluster('drug')
In [34]:
# Bare expression: in a notebook this displays the current value of clust.
clust
Out[34]:
Add the clusters to the vector model
In [35]:
# Attach the loaded clusters object to the model.
# NOTE(review): presumably model.generate_response then includes cluster
# information in its output -- confirm against the word2vec package.
model.clusters = clusters
In [36]:
# Repeat the 'drug' lookup now that model.clusters has been set, then
# display the result (bare expression, shown by the notebook).
syns = get_syn('drug')
syns
Out[36]:
In [ ]: