In [2]:
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from nltk.corpus import genesis
import matplotlib.pyplot as plt
from textblob import TextBlob
from pprint import pprint
import pandas as pd
import numpy as np
import logging
import csv
import re
% matplotlib inline
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [31]:
# Read the 'english-kjv.txt' file of the NLTK genesis corpus as a sequence of
# sentences (presumably each sentence is a list of word tokens -- see the
# pprint output below to confirm).
sentences = genesis.sents(fileids='english-kjv.txt')
# Parenthesised single-argument print behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(len(sentences))
# Show the first sentence as a sanity check of the corpus format.
pprint(sentences[0])
In [49]:
# Train a Word2Vec model on the sentences loaded above.
# min_count=1 presumably keeps every word regardless of frequency, and
# workers=2 requests two training threads -- confirm against the gensim docs.
w2v = Word2Vec(sentences, min_count=1, workers=2)
# NOTE(review): `index2word` and `syn0` are read directly off the model here;
# later gensim releases expose these under `w2v.wv` with different names --
# verify against the installed gensim version.
# Vocabulary words, used later as point labels in the scatter plot.
labels = w2v.index2word
# Embedding matrix; its shape is printed in the next cell as
# (number of unique words, number of dimensions).
vectors = w2v.syn0
# Project the embeddings down to 2 components for plotting; random_state is
# fixed so the t-SNE layout uses a fixed seed.
tisney = TSNE(n_components=2, random_state=42)
vectors2d = tisney.fit_transform(vectors)
In [50]:
# Report the size of the trained embedding matrix and one example similarity.
# Parenthesised single-argument print behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('number of unique words = {}'.format(vectors.shape[0]))
print('number of dimensions = {}'.format(vectors.shape[1]))
print('man | king ==> semantic similarity: {}'.format(w2v.similarity('man','king')))
In [55]:
# Draw the 2-D t-SNE projection: one scatter point plus a text label per word.
plt.figure(figsize=(15, 15))
# Only the first 1501 entries (indices 0..1500) are drawn, to keep the
# figure readable.
for label, (x, y) in zip(labels[:1501], vectors2d[:1501]):
    plt.scatter(x, y)
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
        color='b',
    )
plt.grid(True)
plt.show()
In [3]:
# Replace the model trained above with pretrained vectors read from a gzipped
# binary word2vec-format file (path is relative to the notebook's working
# directory).
# NOTE(review): in later gensim releases this loader is provided by
# KeyedVectors rather than Word2Vec -- confirm against the installed version.
w2v = Word2Vec.load_word2vec_format(fname='models/GoogleNews-vectors-negative300.bin.gz', binary=True)
In [ ]:
# Vocabulary words and embedding matrix of the pretrained model loaded above.
labels = w2v.index2word
vectors = w2v.syn0
# Fixed random_state so the t-SNE layout uses a fixed seed.
tisney = TSNE(n_components=2, random_state=42)
# Only embed the rows that the plotting cell actually draws (its loop stops
# after index 1500). Running t-SNE over the whole pretrained vocabulary is
# presumably far too expensive in time and memory to finish, and every row
# past the plotted range would be discarded anyway.
vectors2d = tisney.fit_transform(vectors[:1501])
In [7]:
# Print pairwise similarities from the pretrained model for a few word pairs.
# Parenthesised single-argument print behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('man | king ==> semantic similarity:{}'.format(w2v.similarity('man','king')))
print('man | queen ==> semantic similarity:{}'.format(w2v.similarity('man','queen')))
print('man | woman ==> semantic similarity:{}'.format(w2v.similarity('man','woman')))
print('woman | king ==> semantic similarity:{}'.format(w2v.similarity('woman','king')))
print('influenza | virus ==> semantic similarity:{}'.format(w2v.similarity('influenza','virus')))
In [ ]:
# Draw the 2-D t-SNE projection: one scatter point plus a text label per word.
plt.figure(figsize=(15, 15))
# Only the first 1501 entries (indices 0..1500) are drawn, to keep the
# figure readable.
for label, (x, y) in zip(labels[:1501], vectors2d[:1501]):
    plt.scatter(x, y)
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
        color='b',
    )
plt.grid(True)
plt.show()
In [176]:
# Load the labelled diagnosis records.
df = pd.read_csv('data/161207_ZikaLabels.csv')
# Use at most the first 1000 rows. Slicing (rather than indexing 0..999)
# avoids a lookup error when the file is shorter, and dropna() skips missing
# diagnoses, which are not strings and cannot be decoded.
raw_diagnoses = df.diagnosisRAW.iloc[:1000].dropna()
# Turn each raw diagnosis into a list of lowercase tokens:
#   1. decode the bytes as ISO-8859-2,
#   2. drop every non-ASCII character,
#   3. remove commas, lowercase, and split on whitespace.
# NOTE(review): `.decode` assumes the column holds byte strings, as under
# Python 2; on Python 3 the values would already be text -- confirm the
# target interpreter before porting.
sentences = [
    raw.decode('ISO-8859-2').encode('ASCII', 'ignore').replace(',', '').lower().split()
    for raw in raw_diagnoses
]
In [178]:
# Train a fresh Word2Vec model on the tokenised diagnosis sentences built in
# the previous cell (this rebinds `w2v`, replacing the pretrained model).
# min_count=1 presumably keeps every word regardless of frequency, and
# workers=2 requests two training threads -- confirm against the gensim docs.
w2v = Word2Vec(sentences, min_count=1, workers=2)
# NOTE(review): `index2word` and `syn0` are read directly off the model here;
# later gensim releases expose these under `w2v.wv` with different names --
# verify against the installed gensim version.
# Vocabulary words, used later as point labels in the scatter plot.
labels = w2v.index2word
# Embedding matrix fed to t-SNE below.
vectors = w2v.syn0
# Project the embeddings down to 2 components for plotting; random_state is
# fixed so the t-SNE layout uses a fixed seed.
tisney = TSNE(n_components=2, random_state=42)
vectors2d = tisney.fit_transform(vectors)
In [208]:
# Draw the 2-D t-SNE projection of the diagnosis vocabulary, highlighting the
# spelling variants of "zika" in red and every other word in black.
plt.figure(figsize=(15, 15))
for i, label in enumerate(labels):
    # Only the first 1001 entries (indices 0..1000) are drawn.
    if i > 1000:
        break
    x, y = vectors2d[i, :]
    plt.scatter(x, y)
    # Choose the colour once and annotate exactly once per word, so a
    # highlighted label is never over-drawn by a second black annotation.
    color = 'r' if label in ['zica', 'zika', 'zoka'] else 'k'
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom', color=color)
plt.show()