In [1]:
# import modules
import pandas as pd
import numpy as np

from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from sklearn import preprocessing

In [2]:
# Open the glossary workbook once; pd.ExcelFile parses the file a single time
# so multiple sheets could be read without re-parsing.
# NOTE(review): the traceback below shows 'Domains-and-glossary.xlsx' is not in
# the working directory — the path must point at the actual .xlsx location
# (or the file must be placed next to the notebook) before this cell can run.
xls_file = pd.ExcelFile('Domains-and-glossary.xlsx')

# List every sheet name so a reader can confirm the expected sheet exists.
print(xls_file.sheet_names)


# Parse the per-domain glossary sheet into a DataFrame used by later cells.
df = xls_file.parse('Glossary (by domain)')
df.head()


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-2-6b32a8b8bc99> in <module>()
----> 1 xls_file = pd.ExcelFile('Domains-and-glossary.xlsx')
      2 
      3 print(xls_file.sheet_names)
      4 
      5 

/usr/local/lib/python3.5/dist-packages/pandas/io/excel.py in __init__(self, io, **kwds)
    247             self.book = xlrd.open_workbook(file_contents=data)
    248         elif isinstance(io, compat.string_types):
--> 249             self.book = xlrd.open_workbook(io)
    250         else:
    251             raise ValueError('Must explicitly set engine if not passing in'

/usr/local/lib/python3.5/dist-packages/xlrd/__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
    114         peek = file_contents[:peeksz]
    115     else:
--> 116         with open(filename, "rb") as f:
    117             peek = f.read(peeksz)
    118     if peek == b"PK\x03\x04": # a ZIP file

FileNotFoundError: [Errno 2] No such file or directory: 'Domains-and-glossary.xlsx'

In [3]:
def load_w2v(word2vec_model_file):
    """Load a trained Word2Vec model from disk, with vectors L2-normalized.

    Parameters
    ----------
    word2vec_model_file : str
        Path to a gensim ``Word2Vec`` model saved with ``.save()``.

    Returns
    -------
    Word2Vec
        The loaded model with unit-length vectors.
    """
    model = Word2Vec.load(word2vec_model_file)
    # replace=True normalizes in place and frees the raw vectors to save
    # memory; the model cannot be trained further after this call.
    model.init_sims(replace=True)
    return model


# NOTE(review): absolute local path — breaks portability; consider a DATA_DIR.
word2vec_model_file = '/home/sonic/sonic/eosdb/data/eos/word2vec_model_all.model'
word2vec_model = load_w2v(word2vec_model_file)

In [4]:
# The 'Violence' column holds this domain's keyword list; skip the first
# row (positional slice) and drop empty cells.  For a Series, dropna()
# takes no useful `how` argument, so it is omitted.
keywords = df['Violence'].iloc[1:].dropna()
# print(keywords)

In [5]:
def document_vector(word2vec_model, doc):
    """Embed a document as the mean of its word vectors.

    Parameters
    ----------
    word2vec_model : gensim Word2Vec model
        Trained model; vectors are looked up through its ``wv`` attribute.
    doc : list of str
        Tokens to average.  Callers are expected to have filtered out
        out-of-vocabulary tokens (see the cells below), otherwise the
        lookup raises ``KeyError``.

    Returns
    -------
    numpy.ndarray
        1-D vector: the element-wise mean of the tokens' vectors.
    """
    # Use the explicit .wv accessor, consistent with the vocabulary checks
    # elsewhere in this notebook (word2vec_model.wv.vocab); the old
    # model[doc] __getitem__ form is deprecated and removed in gensim 4+.
    return np.mean(word2vec_model.wv[doc], axis=0)

In [7]:
# Embed every glossary keyword as the mean vector of its in-vocabulary
# tokens; keywords with no known token are skipped entirely.
keyword_w2v = []

for keyword in keywords:
    tokens = [tok for tok in keyword.split() if tok in word2vec_model.wv.vocab]
    if not tokens:
        continue
    keyword_w2v.append(document_vector(word2vec_model, tokens))

keyword_w2v = np.array(keyword_w2v)

# L2-normalized copy, kept for later experiments; the raw vectors are
# what gets printed below.
X_normalized = preprocessing.normalize(keyword_w2v, norm='l2')


print(len(keyword_w2v))
print(keyword_w2v[0])


76

In [186]:
def get_nmf_dynamic_topics(k, dictionary, topn=20,
                           path_template='../dynamic_nmf/data/windowbin/result/dynamic.df/dynamic_k%s.pkl'):
    """Load the dynamic-NMF topic table for a given k and return its topics.

    Parameters
    ----------
    k : int
        Number of topics; substituted into ``path_template`` to pick the
        pickled DataFrame to load.
    dictionary : gensim-style Dictionary or None
        When given, topic terms missing from ``dictionary.token2id`` are
        dropped before truncation; ``None`` keeps every term.
    topn : int, default 20
        Maximum number of terms kept per topic.
    path_template : str
        ``%s``-style template for the pickle path.  Defaults to the
        original hard-coded location, so existing callers are unaffected.

    Returns
    -------
    list of list
        One term list (length <= topn) per topic column.
    """
    topic_list = []
    topic_df = pd.read_pickle(path_template % (k,))

    # The first column is positional/rank metadata, not a topic — skip it.
    # (.ix was removed in pandas 1.0; .iloc keeps the positional semantics.)
    for c in topic_df.iloc[:, 1:].columns:
        if dictionary is not None:
            # Keep only tokens the dictionary knows (membership test on the
            # mapping itself — no need to materialize .keys()).
            topic = [x for x in topic_df[c].tolist() if x in dictionary.token2id]
        else:
            topic = topic_df[c].tolist()
        topic_list.append(topic[:topn])

    return topic_list


# Export the k=42 dynamic-NMF topics, then embed each topic as the mean
# word vector of its in-vocabulary terms (mirrors the keyword embedding
# above).
df = pd.DataFrame(get_nmf_dynamic_topics(42, None, 20))
df.to_csv('dynamic_nmf_42.csv')


topic_w2v = []

for _, row in df.iterrows():
    in_vocab = [term for term in row if term in word2vec_model.wv.vocab]
    if not in_vocab:
        continue
    topic_w2v.append(document_vector(word2vec_model, in_vocab))

topic_w2v = np.array(topic_w2v)

print(topic_w2v)

df.head()


[[ 0.03559574 -0.04429134  0.1150565  ..., -0.0234171   0.00550879
  -0.0983898 ]
 [-0.03052965 -0.11964951  0.07880859 ..., -0.017168   -0.10105554
  -0.03308823]
 [ 0.00585283 -0.10805272  0.07068779 ..., -0.01099878 -0.06338074
  -0.08180024]
 ..., 
 [-0.01316302 -0.16323963  0.04590257 ..., -0.02707528 -0.12101123
  -0.06573825]
 [ 0.08877812 -0.08373272  0.04539325 ..., -0.08314145 -0.0118684
  -0.08342692]
 [ 0.03137421 -0.09060578  0.10033997 ..., -0.03020529 -0.06571938
  -0.05896141]]
Out[186]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
0 can will one year go like get make people say time good world know see just now work think many
1 shelling damascus suburbs regime homs neighborhood idlib forces daraa report fierce hama town fsa city martyrs artillery al mortar army
2 syrian syria assad opposition talk geneva say foreign arab peace al_assad regime damascus government president_bashar meeting political conference support terrorism
3 israel israeli palestinian jerusalem palestinians hamas gaza netanyahu west_bank jewish israelis peace aqsa abbas palestine arab jews jordan benjamin_netanyahu state
4 turkey turkish erdogan ankara kurdish istanbul pkk border syria davutoglu kurds tayyip_erdogan syrian nato party ypg armenian recep_tayyip visit coup

In [ ]:
from sklearn.metrics.pairwise import pairwise_distances

# NOTE(review): `corpus` is not defined anywhere in this notebook — this cell
# depends on hidden kernel state and will fail under Restart & Run All.
# Each row of A is a single document index, so the metric can look the
# documents up by position.
A = np.array([[i] for i in range(len(corpus))])

print (A)

def f(x, y):
    # Word Mover's Distance between the two documents whose indices arrive
    # packed in the 1-element arrays x and y.
    return word2vec_model.wv.wmdistance(corpus[int(x)], corpus[int(y)])

# Full pairwise WMD matrix; n_jobs=-1 parallelizes across all cores.
# NOTE(review): O(n^2) WMD evaluations — expensive for large corpora.
X_wmd_distance_eos = pairwise_distances(A, metric=f, n_jobs=-1)

df_X_wmd_distance_eos = pd.DataFrame(X_wmd_distance_eos)