In [1]:
# Auto-reload edited local modules (e.g. gloveSim, imported below) without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
import numpy as np
import csv
def read_csv(filename, has_header=True):
    """Read a CSV file into a list of rows (each row a list of strings).

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    has_header : bool, optional
        When True (default), the first row is skipped as a header.

    Returns
    -------
    list of list of str
        All data rows, in file order.
    """
    items = []
    # 'r' instead of the deprecated 'rU' mode ('rU' raises in Python 3.11+).
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        if has_header:
            # next(reader, None) works on both Python 2.6+ and 3 (unlike the
            # Python-2-only reader.next()), and does not raise StopIteration
            # on an empty file.
            next(reader, None)
        for row in reader:
            items.append(row)
    return items
In [27]:
# Load the collapsed theme/prop word lists.  NOTE(review): relative paths --
# this cell only works when the notebook is launched next to topicWords/.
themes = read_csv('../topicWords/wedding_themes_collapsed_4.csv')
props = read_csv('../topicWords/wedding_props_collapsed_4.csv')
# Python 2 print statements: sanity-check the row count and row layout
# (get_glove_vec below reads the phrase from column index 1).
print len(themes)
print themes[0]
In [4]:
# NOTE(review): hardcoded absolute, user-specific paths -- move these to a
# config cell / environment variable so the notebook runs on other machines.
GLOVE_NAMES = '/Users/paopow/Data/iis-dev/Data/GloVe/6B.300d.names'
GLOVE_VECS = '/Users/paopow/Data/iis-dev/Data/GloVe/6B.300d.npy'
# gloveSim is a project-local module (picked up live by %autoreload).
from gloveSim import VectorSpace
# 300-dimensional GloVe vector space; presumably a large load -- run once.
vector_space = VectorSpace.from_glovedata(300, GLOVE_NAMES, GLOVE_VECS)
In [31]:
def get_glove_vec(word_list, space=None):
    """Look up a GloVe vector for the phrase in column 1 of each item.

    Parameters
    ----------
    word_list : sequence
        Rows (e.g. from read_csv) whose element at index 1 is the phrase to
        embed.
    space : optional
        Object with a ``vec_for_sentence(str)`` method.  Defaults to the
        module-level ``vector_space`` loaded above (backward-compatible).

    Returns
    -------
    (ndarray, list, list)
        A (num_found, dim) array of vectors, the phrases that were found
        (same order as the rows of the array), and the phrases that were
        discarded because no vector exists.
    """
    if space is None:
        space = vector_space  # global VectorSpace built in the cell above
    word_vecs = []
    word_keys = []
    discard_words = []
    for t in word_list:
        v = space.vec_for_sentence(t[1])
        if len(v.shape) == 0:
            # a 0-d result means the phrase is out of vocabulary
            discard_words.append(t[1])
        else:
            word_vecs.append(np.reshape(v, (1, len(v))))
            word_keys.append(t[1])
    if word_vecs:
        word_vec_array = np.concatenate(word_vecs, axis=0)
    else:
        # original code crashed here (np.concatenate of an empty list) when
        # every phrase was discarded; return an empty array instead
        word_vec_array = np.empty((0, 0))
    return (word_vec_array, word_keys, discard_words)
# Stack GloVe vectors for every theme/prop phrase; phrases with no vector
# come back separately in *_discard_words.
theme_word_vec_array, theme_word_keys, theme_discard_words = get_glove_vec(themes)
prop_word_vec_array, prop_word_keys, prop_discard_words = get_glove_vec(props)
In [67]:
# Number of phrases that actually had a GloVe vector.
num_valid_theme = len(theme_word_keys)
num_valid_prop = len(prop_word_keys)
In [34]:
# Python 2 prints: phrases dropped because no GloVe vector was found.
print theme_discard_words
In [35]:
print prop_discard_words
In [37]:
from sklearn.metrics.pairwise import pairwise_distances
# Full square (n x n) distance matrices; sklearn's default metric is
# euclidean.  Only consumed by the np.max(...) cell further down.
theme_distances = pairwise_distances(theme_word_vec_array)
prop_distances = pairwise_distances(prop_word_vec_array)
In [52]:
from scipy.spatial.distance import pdist
# Cosine similarity (1 - cosine distance) for every unordered pair.
# NOTE: pdist returns a *condensed* 1-D array of length n*(n-1)/2, NOT a
# square matrix -- downstream cells must not index it as sim[i][j].
theme_sim = 1 - pdist(theme_word_vec_array, 'cosine')
prop_sim = 1 - pdist(prop_word_vec_array, 'cosine')
In [71]:
# Summary statistics of the condensed pairwise cosine similarities.
# Single-argument print(...) behaves identically as a Python 2 print
# statement (the parentheses just group the expression) and as the Python 3
# function; the 'THEME;' label typo (';' for ':') is also fixed.
print('THEME:')
print('Mean:')
print(np.mean(theme_sim))
print('SD:')
print(np.std(theme_sim))
print('PROP:')
print('Mean:')
print(np.mean(prop_sim))
print('SD:')
print(np.std(prop_sim))
In [60]:
from itertools import combinations
from scipy.spatial.distance import squareform

# theme_sim / prop_sim are *condensed* 1-D arrays from pdist, so the original
# theme_sim[i][j] indexing treated a scalar as a matrix row and raised.
# Expand to a square (n x n) matrix first, then collect the upper-triangle
# entries per (i, j) pair.  (The stray debug `print i, j` is also removed.)
theme_sim_sq = squareform(theme_sim)
theme_valid_sim = []
for i, j in combinations(range(len(theme_word_keys)), 2):
    theme_valid_sim.append(theme_sim_sq[i, j])

prop_sim_sq = squareform(prop_sim)
prop_valid_sim = []
for i, j in combinations(range(len(prop_word_keys)), 2):
    prop_valid_sim.append(prop_sim_sq[i, j])
In [43]:
# Fix typo: the variable defined in the pairwise_distances cell is
# `theme_distances`; `theme_distansces` raised a NameError.
np.max(theme_distances)
Out[43]:
In [56]:
# Cell-local imports so this cell survives Restart & Run All: the original
# relied on bokeh names imported by a *later* cell (out-of-order execution).
from bokeh.io import output_file, show
from bokeh.plotting import figure

# Density histogram of pairwise prop cosine similarities -> prop_sim_hist.html.
hist, edges = np.histogram(prop_sim, density=True, bins=50)
output_file('prop_sim_hist.html')
p = figure()
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="#036564", line_color="#033649")
show(p)
In [55]:
# Cell-local imports so this cell survives Restart & Run All: the original
# relied on bokeh names imported by a *later* cell (out-of-order execution).
from bokeh.io import output_file, show
from bokeh.plotting import figure

# Density histogram of pairwise theme cosine similarities -> theme_sim_hist.html.
hist, edges = np.histogram(theme_sim, density=True, bins=50)
output_file('theme_sim_hist.html')
p = figure()
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="#036564", line_color="#033649")
show(p)
In [32]:
from sklearn.manifold import TSNE

# Project the 300-d GloVe arrays to 2-D for plotting.  t-SNE is stochastic:
# fix random_state so the embeddings are reproducible across runs (the
# original had no seed, so every run produced a different layout).
model = TSNE(random_state=0)
theme_embedding = model.fit_transform(theme_word_vec_array)
prop_embedding = model.fit_transform(prop_word_vec_array)
In [33]:
from bokeh.io import output_notebook, show, output_file
from bokeh.models.glyphs import Text
from bokeh.plotting import figure
def plot_scatter(Y, texts, filename, labels=None):
    """Scatter-plot 2-D points with one text label per point.

    Writes the figure to ``filename + '.html'`` (and also targets notebook
    output) and opens it via bokeh's show().

    Parameters
    ----------
    Y : sequence of 2-element rows
        Point coordinates, e.g. a t-SNE embedding.
    texts : list of str
        Label drawn at each point; assumed to align with the rows of Y.
    filename : str
        Output file stem (``.html`` is appended).
    labels : optional
        Currently only toggles the circle styling (transparent outline when
        given) -- presumably a placeholder for per-point coloring; TODO
        confirm intended use.
    """
    output_notebook()
    output_file(filename + '.html')
    xs = [row[0] for row in Y]
    ys = [row[1] for row in Y]
    p = figure()
    if labels is None:
        p.circle(xs, ys)
    else:
        p.circle(xs, ys, fill_alpha=1., line_color=None)
    for px, py, label in zip(xs, ys, texts):
        p.add_glyph(Text(x=px, y=py, text=[label]))
    show(p)
# Render the labelled t-SNE maps; writes theme_tsne.html / prop_tsne.html.
plot_scatter(theme_embedding, theme_word_keys, 'theme_tsne')
plot_scatter(prop_embedding, prop_word_keys, 'prop_tsne')