In [1]:
# import modules
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from sklearn import preprocessing
In [2]:
xls_file = pd.ExcelFile('Domains-and-glossary.xlsx')
print(xls_file.sheet_names)
df = xls_file.parse('Glossary (by domain)')
df.head()
In [3]:
def load_w2v(word2vec_model_file):
    # load the trained model from disk and precompute L2-normalised vectors
    word2vec_model = Word2Vec.load(word2vec_model_file)
    word2vec_model.init_sims(replace=True)
    return word2vec_model

word2vec_model_file = '/home/sonic/sonic/eosdb/data/eos/word2vec_model_all.model'
word2vec_model = load_w2v(word2vec_model_file)
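As a quick sanity check on the loaded model (a minimal sketch, assuming the gensim 3.x KeyedVectors API used elsewhere in this notebook and that 'violence' is actually in the vocabulary), nearest neighbours of a glossary term can be inspected directly:
In [ ]:
# sanity check: nearest neighbours of a sample term ('violence' is an illustrative choice)
if 'violence' in word2vec_model.wv.vocab:
    print(word2vec_model.wv.most_similar('violence', topn=5))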
In [4]:
# take the 'Violence' domain column, skip the header row, and drop empty cells
keywords = df['Violence'][1:]
keywords = keywords.dropna(how='any')
# print(keywords)
In [5]:
def document_vector(word2vec_model, doc):
    # average the word vectors of all tokens in doc into a single document vector
    return np.mean(word2vec_model.wv[doc], axis=0)
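For illustration (a sketch; the sample tokens are assumptions and may not be in this model's vocabulary), document_vector averages per-token vectors, so its output length equals the model's vector dimensionality:
In [ ]:
# illustrative check: average a small in-vocabulary token list and confirm the vector length
sample_doc = [w for w in ['war', 'conflict'] if w in word2vec_model.wv.vocab]
if sample_doc:
    print(document_vector(word2vec_model, sample_doc).shape)  # equals the model's vector_size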
In [7]:
keyword_w2v = []
for word in keywords:
    # keep only tokens that are in the word2vec vocabulary
    doc = [token for token in word.split() if token in word2vec_model.wv.vocab]
    if len(doc) == 0:
        continue
    keyword_w2v.append(document_vector(word2vec_model, doc))
keyword_w2v = np.array(keyword_w2v)

# L2-normalise the keyword vectors (alternative: preprocessing.scale for standardisation)
X_normalized = preprocessing.normalize(keyword_w2v, norm='l2')
print(len(keyword_w2v))
print(keyword_w2v[0])
# print(X_normalized[0])
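TSNE is imported above but not used in this cell; a minimal sketch of projecting the normalised keyword vectors to 2-D might look like this (the perplexity value is illustrative and must stay below the number of keyword vectors):
In [ ]:
# sketch: 2-D t-SNE projection of the normalised keyword vectors (parameters are illustrative)
tsne = TSNE(n_components=2, perplexity=5, random_state=0)
keyword_2d = tsne.fit_transform(X_normalized)
print(keyword_2d.shape)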
In [186]:
def get_nmf_dynamic_topics(k, dictionary, topn=20):
    topic_list = []
    topic_df = pd.read_pickle('../dynamic_nmf/data/windowbin/result/dynamic.df/dynamic_k%s.pkl' % (k))
    for c in topic_df.iloc[:, 1:].columns:
        # filter out any token not in the dictionary
        if dictionary is not None:
            topic = [x for x in topic_df[c].tolist() if x in dictionary.token2id.keys()]
        else:
            topic = topic_df[c].tolist()
        topic_list.append(topic[:topn])
    return topic_list

df = pd.DataFrame(get_nmf_dynamic_topics(42, None, 20))
df.to_csv('dynamic_nmf_42.csv')

topic_w2v = []
for index, row in df.iterrows():
    # keep only topic terms that are in the word2vec vocabulary
    doc = [word for word in row if word in word2vec_model.wv.vocab]
    if len(doc) == 0:
        continue
    topic_w2v.append(document_vector(word2vec_model, doc))
topic_w2v = np.array(topic_w2v)
print(topic_w2v)
df.head()
Out[186]:
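A natural follow-up (a sketch, not part of the original notebook) is to compare each dynamic-NMF topic vector against the Violence keyword vectors with cosine similarity and rank topics by their mean similarity to the keyword set:
In [ ]:
# sketch: rank topics by mean cosine similarity to the Violence keyword vectors
from sklearn.metrics.pairwise import cosine_similarity
topic_normalized = preprocessing.normalize(topic_w2v, norm='l2')
sim = cosine_similarity(topic_normalized, X_normalized)  # shape: (n_topics, n_keywords)
topic_ranking = sim.mean(axis=1).argsort()[::-1]
print(topic_ranking[:10])  # indices of the ten topics closest to the keyword set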
In [ ]:
# NOTE: `corpus` (a list of tokenised documents) must be defined in an earlier cell
from sklearn.metrics.pairwise import pairwise_distances

A = np.array([[i] for i in range(len(corpus))])
print(A)

def f(x, y):
    # Word Mover's Distance between the documents indexed by x and y
    return word2vec_model.wv.wmdistance(corpus[int(x)], corpus[int(y)])

X_wmd_distance_eos = pairwise_distances(A, metric=f, n_jobs=-1)
df_X_wmd_distance_eos = pd.DataFrame(X_wmd_distance_eos)
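Once the WMD matrix is computed, one simple use (a sketch, assuming the zero self-distances on the diagonal are the row minima) is to look up each document's nearest neighbour:
In [ ]:
# sketch: nearest neighbour of each document under WMD (column 0 of the argsort is the document itself)
nearest = np.argsort(X_wmd_distance_eos, axis=1)[:, 1]
print(nearest[:10])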