In [1]:
# import modules
import pandas as pd
import numpy as np

from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from sklearn import preprocessing

In [2]:
# Open the glossary workbook once; pd.ExcelFile parses the file a single time
# so multiple sheets could be read without re-parsing.
# NOTE(review): the traceback below shows 'Domains-and-glossary.xlsx' is not in
# the working directory — the path must point at the actual .xlsx location
# (or the file must be placed next to the notebook) before this cell can run.
xls_file = pd.ExcelFile('Domains-and-glossary.xlsx')

# List every sheet name so a reader can confirm the expected sheet exists.
print(xls_file.sheet_names)


# Parse the per-domain glossary sheet into a DataFrame used by later cells.
df = xls_file.parse('Glossary (by domain)')
df.head()


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-2-6b32a8b8bc99> in <module>()
----> 1 xls_file = pd.ExcelFile('Domains-and-glossary.xlsx')
      2 
      3 print(xls_file.sheet_names)
      4 
      5 

/usr/local/lib/python3.5/dist-packages/pandas/io/excel.py in __init__(self, io, **kwds)
    247             self.book = xlrd.open_workbook(file_contents=data)
    248         elif isinstance(io, compat.string_types):
--> 249             self.book = xlrd.open_workbook(io)
    250         else:
    251             raise ValueError('Must explicitly set engine if not passing in'

/usr/local/lib/python3.5/dist-packages/xlrd/__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
    114         peek = file_contents[:peeksz]
    115     else:
--> 116         with open(filename, "rb") as f:
    117             peek = f.read(peeksz)
    118     if peek == b"PK\x03\x04": # a ZIP file

FileNotFoundError: [Errno 2] No such file or directory: 'Domains-and-glossary.xlsx'

In [3]:
def load_w2v(word2vec_model_file):
    """Load a trained Word2Vec model from disk, with vectors L2-normalized.

    Parameters
    ----------
    word2vec_model_file : str
        Path to a gensim ``Word2Vec`` model saved with ``.save()``.

    Returns
    -------
    Word2Vec
        The loaded model with unit-length vectors.
    """
    model = Word2Vec.load(word2vec_model_file)
    # replace=True normalizes in place and frees the raw vectors to save
    # memory; the model cannot be trained further after this call.
    model.init_sims(replace=True)
    return model


# NOTE(review): absolute local path — breaks portability; consider a DATA_DIR.
word2vec_model_file = '/home/sonic/sonic/eosdb/data/eos/word2vec_model_all.model'
word2vec_model = load_w2v(word2vec_model_file)

In [4]:
# The 'Violence' column holds this domain's keyword list; skip the first
# row (positional slice) and drop empty cells.  For a Series, dropna()
# takes no useful `how` argument, so it is omitted.
keywords = df['Violence'].iloc[1:].dropna()
# print(keywords)

In [5]:
def document_vector(word2vec_model, doc):
    """Embed a document as the mean of its word vectors.

    Parameters
    ----------
    word2vec_model : gensim Word2Vec model
        Trained model; vectors are looked up through its ``wv`` attribute.
    doc : list of str
        Tokens to average.  Callers are expected to have filtered out
        out-of-vocabulary tokens (see the cells below), otherwise the
        lookup raises ``KeyError``.

    Returns
    -------
    numpy.ndarray
        1-D vector: the element-wise mean of the tokens' vectors.
    """
    # Use the explicit .wv accessor, consistent with the vocabulary checks
    # elsewhere in this notebook (word2vec_model.wv.vocab); the old
    # model[doc] __getitem__ form is deprecated and removed in gensim 4+.
    return np.mean(word2vec_model.wv[doc], axis=0)

In [7]:
# Embed every glossary keyword as the mean vector of its in-vocabulary
# tokens; keywords with no known token are skipped entirely.
keyword_w2v = []

for keyword in keywords:
    tokens = [tok for tok in keyword.split() if tok in word2vec_model.wv.vocab]
    if not tokens:
        continue
    keyword_w2v.append(document_vector(word2vec_model, tokens))

keyword_w2v = np.array(keyword_w2v)

# L2-normalized copy, kept for later experiments; the raw vectors are
# what gets printed below.
X_normalized = preprocessing.normalize(keyword_w2v, norm='l2')


print(len(keyword_w2v))
print(keyword_w2v[0])


76

In [186]:
def get_nmf_dynamic_topics(k, dictionary, topn=20,
                           path_template='../dynamic_nmf/data/windowbin/result/dynamic.df/dynamic_k%s.pkl'):
    """Load the dynamic-NMF topic table for a given k and return its topics.

    Parameters
    ----------
    k : int
        Number of topics; substituted into ``path_template`` to pick the
        pickled DataFrame to load.
    dictionary : gensim-style Dictionary or None
        When given, topic terms missing from ``dictionary.token2id`` are
        dropped before truncation; ``None`` keeps every term.
    topn : int, default 20
        Maximum number of terms kept per topic.
    path_template : str
        ``%s``-style template for the pickle path.  Defaults to the
        original hard-coded location, so existing callers are unaffected.

    Returns
    -------
    list of list
        One term list (length <= topn) per topic column.
    """
    topic_list = []
    topic_df = pd.read_pickle(path_template % (k,))

    # The first column is positional/rank metadata, not a topic — skip it.
    # (.ix was removed in pandas 1.0; .iloc keeps the positional semantics.)
    for c in topic_df.iloc[:, 1:].columns:
        if dictionary is not None:
            # Keep only tokens the dictionary knows (membership test on the
            # mapping itself — no need to materialize .keys()).
            topic = [x for x in topic_df[c].tolist() if x in dictionary.token2id]
        else:
            topic = topic_df[c].tolist()
        topic_list.append(topic[:topn])

    return topic_list


# Export the k=42 dynamic-NMF topics, then embed each topic as the mean
# word vector of its in-vocabulary terms (mirrors the keyword embedding
# above).
df = pd.DataFrame(get_nmf_dynamic_topics(42, None, 20))
df.to_csv('dynamic_nmf_42.csv')


topic_w2v = []

for _, row in df.iterrows():
    in_vocab = [term for term in row if term in word2vec_model.wv.vocab]
    if not in_vocab:
        continue
    topic_w2v.append(document_vector(word2vec_model, in_vocab))

topic_w2v = np.array(topic_w2v)

print(topic_w2v)

df.head()


[[ 0.03559574 -0.04429134  0.1150565  ..., -0.0234171   0.00550879
  -0.0983898 ]
 [-0.03052965 -0.11964951  0.07880859 ..., -0.017168   -0.10105554
  -0.03308823]
 [ 0.00585283 -0.10805272  0.07068779 ..., -0.01099878 -0.06338074
  -0.08180024]
 ..., 
 [-0.01316302 -0.16323963  0.04590257 ..., -0.02707528 -0.12101123
  -0.06573825]
 [ 0.08877812 -0.08373272  0.04539325 ..., -0.08314145 -0.0118684
  -0.08342692]
 [ 0.03137421 -0.09060578  0.10033997 ..., -0.03020529 -0.06571938
  -0.05896141]]
Out[186]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
0 can will one year go like get make people say time good world know see just now work think many
1 shelling damascus suburbs regime homs neighborhood idlib forces daraa report fierce hama town fsa city martyrs artillery al mortar army
2 syrian syria assad opposition talk geneva say foreign arab peace al_assad regime damascus government president_bashar meeting political conference support terrorism
3 israel israeli palestinian jerusalem palestinians hamas gaza netanyahu west_bank jewish israelis peace aqsa abbas palestine arab jews jordan benjamin_netanyahu state
4 turkey turkish erdogan ankara kurdish istanbul pkk border syria davutoglu kurds tayyip_erdogan syrian nato party ypg armenian recep_tayyip visit coup

In [ ]:
from sklearn.metrics.pairwise import pairwise_distances

# NOTE(review): `corpus` is not defined anywhere in this notebook — this cell
# depends on hidden kernel state and will fail under Restart & Run All.
# Each row of A is a single document index, so the metric can look the
# documents up by position.
A = np.array([[i] for i in range(len(corpus))])

print (A)

def f(x, y):
    # Word Mover's Distance between the two documents whose indices arrive
    # packed in the 1-element arrays x and y.
    return word2vec_model.wv.wmdistance(corpus[int(x)], corpus[int(y)])

# Full pairwise WMD matrix; n_jobs=-1 parallelizes across all cores.
# NOTE(review): O(n^2) WMD evaluations — expensive for large corpora.
X_wmd_distance_eos = pairwise_distances(A, metric=f, n_jobs=-1)

df_X_wmd_distance_eos = pd.DataFrame(X_wmd_distance_eos)