In [1]:
import word2vec
from string import punctuation
import codecs, re
import cPickle as pickle

In [12]:
# Normalization table for the raw criteria text: every punctuation
# character maps to a space, and each digit maps to its spelled-out word
# (padded with spaces) so numbers become trainable tokens for word2vec
# instead of an open-ended vocabulary of distinct numerals.
trans = {s: ' ' for s in punctuation}
trans.update({
    '0': ' zero ', '1': ' one ', '2': ' two ', '3': ' three ',
    '4': ' four ', '5': ' five ', '6': ' six ', '7': ' seven ',
    '8': ' eight ', '9': ' nine ',
})

# Precompiled once instead of recompiling the pattern on every line.
MULTISPACE = re.compile('  +')

def _normalize(line):
    """Lower-case `line`, replace punctuation/digits via `trans`, and
    collapse runs of two or more spaces into a single space.

    Returns the normalized unicode string (trailing newline preserved,
    matching the original cell's behavior).
    """
    translated = ''.join(trans[c] if c in trans else c.lower() for c in line)
    return MULTISPACE.sub(' ', translated)

# `with` guarantees both handles are closed even on error (the original
# never closed the input handle), and iterating the file streams it line
# by line instead of materializing the whole corpus with readlines().
with codecs.open('data/stanford_sentence_list.csv', 'r', 'utf-8') as fin, \
     codecs.open('data/text', 'w', 'utf-8') as fout:
    for row in fin:
        fout.write(_normalize(row))

In [24]:
# Learn frequent bigrams from the cleaned corpus: co-occurring word pairs
# are joined with '_' in the output file (package defaults apply —
# min-count 5, threshold 100, as echoed in the command line below).
word2vec.word2phrase('data/text', 'data/criteria_phrases', verbose=True)


[u'word2phrase', u'-train', u'data/text', u'-output', u'data/criteria_phrases', u'-min-count', u'5', u'-threshold', u'100', u'-debug', u'2']
Starting training using file data/text
Words processed: 33000K     Vocab size: 2178K  
Vocab size (unigrams + bigrams): 1309404
Words in train file: 33058339
Words written: 33000K

In [25]:
# Train 100-dimensional word vectors on the phrase-joined corpus and write
# them in binary format to data/criteria.bin.
word2vec.word2vec('data/criteria_phrases', 'data/criteria.bin', size=100, verbose=True)


Starting training using file data/criteria_phrases
Vocab size: 68486
Words in train file: 31616280
Alpha: 0.000002  Progress: 100.00%  Words/thread/sec: 270.88k  

In [26]:
# Partition the vocabulary into 100 clusters. NOTE(review): this trains on
# the un-phrased 'data/text', not 'data/criteria_phrases', so its vocabulary
# (33611 words, per the log below) differs from the vector model's (68486)
# — confirm this mismatch is intentional.
word2vec.word2clusters('data/text', 'data/criteria-clusters.txt', 100, verbose=True)


Starting training using file data/text
Vocab size: 33611
Words in train file: 35575077
Alpha: 0.000002  Progress: 100.00%  Words/thread/sec: 278.17k  

Predictions


In [29]:
# Load the trained binary vectors produced above into memory.
model = word2vec.load('data/criteria.bin')

In [18]:
def get_syn(word, n=20):
    """Return the nearest neighbours of `word` in the vector model.

    Parameters
    ----------
    word : str
        A token (phrases are joined with '_') present in the model vocabulary.
    n : int, optional
        Number of neighbours to return. Defaults to 20, matching the
        original hard-coded value, so existing callers are unaffected.

    Returns
    -------
    list of tuples
        (word, cosine similarity, ...) tuples as produced by
        `model.generate_response` — the third element appears to be a
        cluster id once clusters are attached to the model (see outputs
        below); verify against the word2vec package docs.
    """
    indexes, metrics = model.cosine(word, n=n)
    return model.generate_response(indexes, metrics).tolist()

In [19]:
# Nearest neighbours of 'drug' in the trained embedding space.
syns = get_syn('drug')

In [17]:
# NOTE(review): leftover IPython introspection ('??' displays the source of
# model.cosine) — harmless, but debugging residue like this should be
# removed from the finished notebook.
model.cosine??

In [20]:
# Display the neighbour list; each tuple is (word, cosine similarity,
# third value — presumably a cluster id, TODO confirm).
syns


Out[20]:
[(u'medication', 0.7640904744172379, 4),
 (u'drugs', 0.7292337499487276, 16),
 (u'investigational_product', 0.6865529320955848, 68),
 (u'agent', 0.669449733039265, 28),
 (u'test_article', 0.6170045578361203, 26),
 (u'imp', 0.6159925378784474, 32),
 (u'medicinal_product', 0.6044398848766634, 24),
 (u'medications', 0.6015288249942685, 30),
 (u'investigational_drug', 0.5847365742309929, 72),
 (u'vaccine', 0.5732292125758749, 64),
 (u'prescription_drugs', 0.5701140632570989, 45),
 (u'compound', 0.5602940358727715, 39),
 (u'substance', 0.5527357010502594, 91),
 (u'otc_medications', 0.5454989533700731, 72),
 (u'medicinal_products', 0.5437592873164172, 23),
 (u'product', 0.5435930865996139, 8),
 (u'medicines', 0.5353486761877477, 76),
 (u'ip', 0.533222274131262, 36),
 (u'illicit_drug', 0.528962979888314, 8),
 (u'investigational_compound', 0.528536367787627, 66)]

Clusters


In [27]:
# Load the word -> cluster assignments produced by word2clusters above.
clusters = word2vec.load_clusters('data/criteria-clusters.txt')

In [28]:
def get_cluster(word, n=20):
    """Return up to `n` words from the cluster that contains `word`.

    Parameters
    ----------
    word : str
        A token present in the loaded cluster vocabulary (KeyError-style
        failure otherwise, per `clusters[word]` lookup).
    n : int, optional
        Maximum number of cluster members to return. Defaults to 20,
        matching the original hard-coded slice, so existing callers see
        identical behavior.

    Returns
    -------
    The first `n` words of the cluster, in whatever order
    `get_words_on_cluster` yields them (an object array per Out[34]).
    """
    cluster_ix = clusters[word]
    return clusters.get_words_on_cluster(cluster_ix)[:n]

In [33]:
# Words assigned to the same cluster as 'drug'.
clust = get_cluster('drug')

In [34]:
# Display the first 20 cluster members.
clust


Out[34]:
array(['any', 'treatment', 'therapy', 'drug', 'medication', 'drugs',
       'medications', 'agents', 'another', 'vaccine', 'products', 'agent',
       'product', 'antibiotics', 'therapies', 'treatments', 'vaccines',
       'compounds', 'trials', 'medicine'], dtype=object)

Add the clusters to the vector model


In [35]:
# Attach the cluster assignments to the vector model so that subsequent
# neighbour queries can report cluster information alongside similarity.
model.clusters = clusters

In [36]:
# Re-run the neighbour query now that clusters are attached and display
# the result (tuples of word, cosine similarity, and a third value —
# presumably the neighbour's cluster id, TODO confirm).
syns = get_syn('drug')
syns


Out[36]:
[(u'medication', 0.7640904744172379, 4),
 (u'drugs', 0.7292337499487276, 16),
 (u'investigational_product', 0.6865529320955848, 68),
 (u'agent', 0.669449733039265, 28),
 (u'test_article', 0.6170045578361203, 26),
 (u'imp', 0.6159925378784474, 32),
 (u'medicinal_product', 0.6044398848766634, 24),
 (u'medications', 0.6015288249942685, 30),
 (u'investigational_drug', 0.5847365742309929, 72),
 (u'vaccine', 0.5732292125758749, 64)]

In [ ]: