In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import csv
def read_csv(filename, has_header = True):
    items = []
    with open(filename, 'rU') as f:
        reader = csv.reader(f)
        if has_header:
            reader.next()

        for row in reader:
            items.append(row)
    return items

In [27]:
themes = read_csv('../topicWords/wedding_themes_collapsed_4.csv')
props = read_csv('../topicWords/wedding_props_collapsed_4.csv')
print len(themes)
print themes[0]


482
['wedding-theme-A1QK90OHMNVT6N1', 'the 80s', 'second']

In [4]:
GLOVE_NAMES = '/Users/paopow/Data/iis-dev/Data/GloVe/6B.300d.names'
GLOVE_VECS = '/Users/paopow/Data/iis-dev/Data/GloVe/6B.300d.npy'
from gloveSim import VectorSpace
vector_space = VectorSpace.from_glovedata(300, GLOVE_NAMES, GLOVE_VECS)

In [31]:
def get_glove_vec(word_list):
    word_vecs = []
    word_keys = []
    discard_words = []
    
    for t in word_list:
        v = vector_space.vec_for_sentence(t[1])
        if len(v.shape)== 0:
            discard_words.append(t[1])
        else:
            word_vecs.append(np.reshape(v,(1,len(v))))
            word_keys.append(t[1])
    word_vec_array = np.concatenate(word_vecs, axis=0)
    return (word_vec_array, word_keys, discard_words)

theme_word_vec_array, theme_word_keys, theme_discard_words = get_glove_vec(themes)
prop_word_vec_array, prop_word_keys, prop_discard_words = get_glove_vec(props)

In [67]:
num_valid_theme = len(theme_word_keys)
num_valid_prop = len(prop_word_keys)

In [34]:
print theme_discard_words


['the 80s', '1800s', '1920s', '50s', '70s', 'Astonomy', 'Blooons', 'Cinco de Mayo', 'Karaoke', 'Moroccan', 'selfies', 'Sharknado', "the 60's", 'Whiteout']

In [35]:
print prop_discard_words


['Camoflague', 'glowstick', 'Kimono', 'memorabilia', 'monocole', 'Smile', 'stethescope', 'sunscreen', 'vegibles']

In [37]:
from sklearn.metrics.pairwise import pairwise_distances
theme_distances = pairwise_distances(theme_word_vec_array)
prop_distances = pairwise_distances(prop_word_vec_array)

In [52]:
from scipy.spatial.distance import pdist
theme_sim = 1 - pdist(theme_word_vec_array, 'cosine')
prop_sim = 1 - pdist(prop_word_vec_array, 'cosine')

In [71]:
print 'THEME;'
print 'Mean:'
print np.mean(theme_sim)
print 'SD:'
print np.std(theme_sim)

print 'PROP:'
print 'Mean:'
print np.mean(prop_sim)
print 'SD:'
print np.std(prop_sim)


THEME;
Mean:
0.111250152359
SD:
0.123640928544
PROP:
Mean:
0.141468741736
SD:
0.129488421166

In [60]:
from itertools import combinations
theme_valid_sim = []
for i,j in combinations(range(len(theme_word_keys)),2):
    print i,j
    theme_valid_sim.append(theme_sim[i][j])
    
prop_valid_sim = []
for i,j in combinations(range(len(prop_word_keys)),2):
    prop_valid_sim.append(prop_sim[i][j])


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-60-26644907408d> in <module>()
      3 for i,j in combinations(range(len(theme_word_keys)),2):
      4     print i,j
----> 5     theme_valid_sim.append(theme_sim[i][j])
      6 
      7 prop_valid_sim = []

IndexError: invalid index to scalar variable.
0 1

In [43]:
np.max(theme_distansces)


Out[43]:
12.010139

In [56]:
hist, edges = np.histogram(prop_sim, density=True, bins=50)
output_file('prop_sim_hist.html')
p = figure()
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],fill_color="#036564", line_color="#033649")
show(p)



In [55]:
hist, edges = np.histogram(theme_sim, density=True, bins=50)
output_file('theme_sim_hist.html')
p = figure()
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],fill_color="#036564", line_color="#033649")
show(p)



In [32]:
from sklearn.manifold import TSNE
model = TSNE()
theme_embedding = model.fit_transform(theme_word_vec_array)
prop_embedding = model.fit_transform(prop_word_vec_array)

In [33]:
from bokeh.io import output_notebook, show, output_file
from bokeh.models.glyphs import Text
from bokeh.plotting import figure

def plot_scatter(Y, texts, filename,labels=None):
    output_notebook()
    output_file(filename + '.html')
    x = [r[0] for r in Y]
    y = [r[1] for r in Y]
    p = figure()
    if labels is not None:
        p.circle(x,y, fill_alpha=1., line_color=None)
    else:
        p.circle(x,y)
    for i in range(len(texts)):
        t = Text(x=x[i],y=y[i], text=[texts[i]])
        p.add_glyph(t)

    show(p)
plot_scatter(theme_embedding, theme_word_keys, 'theme_tsne')
plot_scatter(prop_embedding, prop_word_keys, 'prop_tsne')


BokehJS successfully loaded.

Warning: BokehJS previously loaded

BokehJS successfully loaded.

Warning: BokehJS previously loaded