WORD2VEC MODEL


In [2]:
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from nltk.corpus import genesis
import matplotlib.pyplot as plt
from textblob import TextBlob
from pprint import pprint
import pandas as pd
import numpy as np
import logging
import csv
import re
%matplotlib inline
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

BIBLE (Genesis)


In [31]:
sentences = genesis.sents(fileids='english-kjv.txt')
print len(sentences)
pprint(sentences[0])


1467
[u'In',
 u'the',
 u'beginning',
 u'God',
 u'created',
 u'the',
 u'heaven',
 u'and',
 u'the',
 u'earth',
 u'.']
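
Each sentence comes pre-tokenized, which is exactly the input Word2Vec expects: an iterable of token lists.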

In [49]:
w2v = Word2Vec(sentences, min_count=1, workers=2)  # train; min_count=1 keeps every word
labels = w2v.index2word   # vocabulary, ordered by frequency
vectors = w2v.syn0        # learned word vectors, one row per word
tsne = TSNE(n_components=2, random_state=42)
vectors2d = tsne.fit_transform(vectors)  # project the 100-d vectors down to 2-d for plotting
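
t-SNE compresses the 100-dimensional embedding space down to 2 dimensions, so that words which are close in embedding space end up (roughly) close together in the plot.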

In [50]:
print 'number of unique words = {}'.format(vectors.shape[0])
print 'number of dimensions = {}'.format(vectors.shape[1])
print 'man | king ==> semantic similarity: {}'.format(w2v.similarity('man','king'))


number of unique words = 2789
number of dimensions = 100
man | king ==> semantic similarity: 0.99983951168
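
A similarity this close to 1.0 is an artifact of the tiny corpus: with only 1,467 sentences and min_count=1, most vectors are barely trained and crowd together. Compare the Google News numbers below.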

In [55]:
# scatter the first 1500 words in the 2-d t-SNE space, annotated with the word itself
plt.figure(figsize=(15, 15))
for i, label in enumerate(labels):
    if i > 1500:
        break
    x, y = vectors2d[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom', color='b')
plt.grid(True)
plt.show()


GOOGLE NEWS

Google's own pretrained Word2Vec model, trained on about 100 billion words of the Google News dataset. It covers a vocabulary of 3 million words and phrases, with 300-dimensional vectors.


In [3]:
# load the pretrained binary vectors from the downloaded Google News model
w2v = Word2Vec.load_word2vec_format(fname='models/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [ ]:
# t-SNE on all 3 million Google News vectors is intractable, so embed only the
# top words (the Google News file stores words roughly in frequency order)
n_words = 2000
labels = w2v.index2word[:n_words]
vectors = w2v.syn0[:n_words]
tsne = TSNE(n_components=2, random_state=42)
vectors2d = tsne.fit_transform(vectors)

In [7]:
print 'man | king ==> semantic similarity:{}'.format(w2v.similarity('man','king'))
print 'man | queen ==> semantic similarity:{}'.format(w2v.similarity('man','queen'))
print 'man | woman ==> semantic similarity:{}'.format(w2v.similarity('man','woman'))
print 'woman | king ==> semantic similarity:{}'.format(w2v.similarity('woman','king'))
print 'influenza | virus ==> semantic similarity:{}'.format(w2v.similarity('influenza','virus'))


man | king ==> semantic similarity:0.229426704576
man | queen ==> semantic similarity:0.16658202971
man | woman ==> semantic similarity:0.7664012231
woman | king ==> semantic similarity:0.12847973557
influenza | virus ==> semantic similarity:0.661505429389
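
The pretrained vectors also support the classic analogy test. A minimal sketch using gensim's most_similar, which adds the positive vectors and subtracts the negative ones (king - man + woman should land near queen):

In [ ]:
pprint(w2v.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))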

In [ ]:
# same t-SNE scatter as above, now over the Google News subset
plt.figure(figsize=(15, 15))
for i, label in enumerate(labels):
    if i > 1500:
        break
    x, y = vectors2d[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom', color='b')
plt.grid(True)
plt.show()

GYANT
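
Same pipeline, now on GYANT's own data: the raw diagnosis text in 161207_ZikaLabels.csv.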


In [176]:
df = pd.read_csv('data/161207_ZikaLabels.csv')
sentences = []
for i in range(1000):
    # normalize to plain ASCII, strip commas, lowercase, and tokenize on whitespace
    sent = df.diagnosisRAW[i].decode('ISO-8859-2').encode('ASCII', 'ignore').replace(',', '').lower().split()
    sentences.append(sent)

In [178]:
w2v = Word2Vec(sentences, min_count=1, workers=2)
labels = w2v.index2word
vectors = w2v.syn0
tsne = TSNE(n_components=2, random_state=42)
vectors2d = tsne.fit_transform(vectors)

In [208]:
plt.figure(figsize=(15, 15))
for i, label in enumerate(labels):
    if i > 1000:
        break
    x, y = vectors2d[i, :]
    plt.scatter(x, y)
    # highlight the various spellings of "zika" in red
    if label in ['zica', 'zika', 'zoka']:
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom', color='r')
    else:
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom', color='k')
plt.show()
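
If the embedding has picked up anything useful, the misspellings should also show up as near neighbours of 'zika'. A quick check (assuming 'zika' made it into the vocabulary; with a corpus this small, treat the results as suggestive rather than reliable):

In [ ]:
pprint(w2v.most_similar('zika', topn=10))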