notebook.community

Edit and run



In [1]:

    
# LDA analysis Done



In [2]:

    
import glob
from datetime import datetime
import logging as log
import gensim
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim
from gensim.models import CoherenceModel
from sklearn.externals import joblib
import gzip
from multiprocessing import Pool
import time
import numpy as np


import pandas as pd
from collections import OrderedDict
from datetime import date
from IPython.display import display, HTML
log.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=log.INFO)

import matplotlib
matplotlib.style.use('ggplot')
%matplotlib notebook



In [3]:

    
class ModelSimilarity:

    # Uses a model (e.g. Word2Vec model) to calculate the similarity between two terms.
    
    def __init__(self, model):
        self.model = model

    def similarity(self, ranking_i, ranking_j):
        sim = 0.0
        pairs = 0
        for term_i in ranking_i:
            for term_j in ranking_j:
                try:
                    sim += self.model.similarity(term_i, term_j)
                    pairs += 1
                except:
                    # print "Failed pair (%s,%s)" % (term_i,term_j)
                    pass
        if pairs == 0:
            return 0.0
        return sim / pairs



In [4]:

    
class WithinTopicMeasure:
 
    # Measures within-topic coherence for a topic model, based on a set of term rankings.

    def __init__(self, metric):
        self.metric = metric

    def evaluate_ranking(self, term_ranking):
        return self.metric.similarity(term_ranking, term_ranking)

    def evaluate_rankings(self, term_rankings):
        scores = []
        overall = 0.0
        for topic_index in range(len(term_rankings)):
            score = self.evaluate_ranking(term_rankings[topic_index])
            scores.append(score)
            overall += score
        overall /= len(term_rankings)
        return overall



In [5]:

    
# To get the topic words from the model
def get_topics(ldamodel, num_topics, num_words):    
    top_words = [[word for word, _ in ldamodel.show_topic(topicno, topn=num_words)] 
                 for topicno in range(ldamodel.num_topics)]
#     print(top_words)
    return top_words



In [6]:

    
class MyDocuments(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        with gzip.open(self.dirname, 'rb') as f:
            for line in f:
                yield line.decode().split('\t')[1].split()



In [7]:

    
def execute_coherence(topic_model_path):
    
    
    model_list = glob.glob(topic_model_path)
    model_list.sort()

    model_path = 'data/eos/word2vec_model_all.model'
    log.info("Loading Word2Vec model from %s ..." % model_path)
    model = gensim.models.Word2Vec.load(model_path)
    
    metric = ModelSimilarity(model)
    validation_measure = WithinTopicMeasure(metric)

    indices =[]
    for fname in model_list:
        window = fname[-16:-9]
        indices.append(window)
        
    indices = list(set(indices))
    indices.sort()
#     print(indices)
    
    coherence_list = {}
    for window in indices:
        start_time = time.time()
        temp = []
        for fname in model_list:
            if (window == fname[-16:-9]):
                ldamodel = joblib.load(fname)
                topic_num = int(fname[-6:-4])
                truncated_term_rankings = get_topics(ldamodel, topic_num, 20)
                coherence_score = validation_measure.evaluate_rankings(truncated_term_rankings)
                log.info("Model coherence window=%s (k=%d) = %.4f" % (window, topic_num, coherence_score))
                temp.append ([topic_num, coherence_score])
        coherence_list[window] = temp
        
        elapsed_time = time.time() - start_time
        print('took %s to process %s' % (elapsed_time, topic_model_path) )
    

    return coherence_list, indices



In [8]:

    
def execute_coherence_gensim(topic_model_path):
    
    dictionary_filepath = 'data/eos/dic_bow/bigram_dict_%s.dict' 
    bow_filepath = 'data/eos/dic_bow/bigram_bow_corpus_%s.mm'    
    
    model_list = glob.glob(topic_model_path)
    model_list.sort()

    indices =[]
    for fname in model_list:
        window = fname[-16:-9]
        indices.append(window)
        
    indices = list(set(indices))
    indices.sort()
    print(indices)
    
    coherence_list = {}
    for window in indices:
        temp = []
        # Load to memory
        corpus = gensim.corpora.MmCorpus(bow_filepath % window)
        dictionary = gensim.corpora.Dictionary.load(dictionary_filepath % window)    
        corpus_text = MyDocuments('dynamic_nmf/data/windowbin/slices/tokenized_window_%s.gz' % window)   
        for fname in model_list:
            if (window == fname[-16:-9]):
                ldamodel = joblib.load(fname)
                topic_num = int(fname[-6:-4])
                cm = CoherenceModel(dictionary=dictionary, corpus=corpus, 
                                    texts=corpus_text, topics=get_topics(ldamodel, topic_num, 20), 
                                    coherence='c_v')
                #     print(cm)
                coherence_score = cm.get_coherence()                
                log.info("Model coherence window=%s (k=%d) = %.4f" % (window, topic_num, coherence_score))
                temp.append ([topic_num, coherence_score])
        coherence_list[window] = temp

        
    return coherence_list, indices



In [24]:

    
model_list = {
              'lda' : 'data/eos/lda/LDAmodel_*.pkl', 
              'lsi' : 'data/eos/lsi/LSImodel_*.pkl', 
              'mallet' : 'data/eos/mallet/Malletmodel_*.pkl'
             }



In [10]:

    
def sort_coherence(coherence_dict):
    
    y_coherence = []
    y_k = []
    
    for key in sorted(coherence_dict):
        coherence_dict[key].sort(key=lambda x: x[1], reverse=True)
#         print ("%s : %s" % (key, coherence_dict[key]))
        y_k.append(coherence_dict[key][0][0])
        y_coherence.append(coherence_dict[key][0][1])

#     print(y_coherence)
#     print(y_k)
    return y_coherence, y_k



In [11]:

    
%%time



df = pd.DataFrame()
# Run TC-W2V coherense score
for name, path in model_list.items():
    print (name, path) 
    coherence_list, indices = execute_coherence(path)
    y_coherence, y_k = sort_coherence(coherence_list)
    df['window'] = indices
    df['%s_TC_W2V' % name] = y_coherence
    df['%s_TC_W2V_k' % name] = y_k
    

df.set_index(df['window'], inplace=True)
display(df.head())
df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')









    



mallet data/eos/mallet/Malletmodel_*.pkl
took 244.52745938301086 to process data/eos/mallet/Malletmodel_*.pkl
lda data/eos/lda/LDAmodel_*.pkl
took 294.26973938941956 to process data/eos/lda/LDAmodel_*.pkl
lsi data/eos/lsi/LSImodel_*.pkl
took 251.3368363380432 to process data/eos/lsi/LSImodel_*.pkl






    






  
    
      
      window
      mallet_TC_W2V
      mallet_TC_W2V_k
      lda_TC_W2V
      lda_TC_W2V_k
      lsi_TC_W2V
      lsi_TC_W2V_k
    
    
      window
      
      
      
      
      
      
      
    
  
  
    
      2012_01
      2012_01
      0.373322
      22
      0.367360
      12
      0.344060
      18
    
    
      2012_02
      2012_02
      0.371982
      26
      0.356412
      14
      0.356953
      12
    
    
      2012_03
      2012_03
      0.376637
      28
      0.358276
      20
      0.354839
      10
    
    
      2012_04
      2012_04
      0.372540
      24
      0.356079
      10
      0.344663
      16
    
    
      2012_05
      2012_05
      0.385459
      16
      0.369615
      10
      0.385599
      10
    
  








    



CPU times: user 11min 24s, sys: 6.37 s, total: 11min 30s
Wall time: 13min 11s



In [25]:

    
%%time


df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')
display(df.tail())

# Run Gensim on LDA/LSA/Mallet
for name, path in model_list.items() :
    print (name, path)    
    coherence_list, indices = execute_coherence_gensim(path)
    y_coherence, y_k = sort_coherence(coherence_list)
    df['%s_Unify' % name] = y_coherence
    df['%s_Unify_k' % name] = y_k
    

display(df.head())
display(df.tail())
df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')









    






  
    
      
      Unnamed: 0
      window
      window.1
      mallet_TC_W2V
      mallet_TC_W2V_k
      lda_TC_W2V
      lda_TC_W2V_k
      lsi_TC_W2V
      lsi_TC_W2V_k
      NMF_TC_W2V
      NMF_TC_W2V_k
      NMF_Unify
      NMF_Unify_k
    
  
  
    
      62
      62
      2017_03
      2017_03
      0.368832
      22
      0.344859
      28
      0.336967
      24
      0.375299
      28
      0.690587
      22
    
    
      63
      63
      2017_04
      2017_04
      0.366006
      12
      0.341303
      20
      0.326046
      12
      0.374008
      24
      0.650894
      20
    
    
      64
      64
      2017_05
      2017_05
      0.360582
      22
      0.333149
      28
      0.319035
      12
      0.357726
      30
      0.677018
      14
    
    
      65
      65
      2017_06
      2017_06
      0.381813
      28
      0.335946
      12
      0.328623
      22
      0.373138
      18
      0.725945
      10
    
    
      66
      66
      2017_07
      2017_07
      0.349201
      22
      0.339650
      28
      0.322636
      30
      0.347636
      22
      0.694142
      16
    
  








    



mallet data/eos/mallet/Malletmodel_*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
lda data/eos/lda/LDAmodel_*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
lsi data/eos/lsi/LSImodel_*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']






    



/usr/local/lib/python3.5/dist-packages/numpy/core/fromnumeric.py:2909: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/usr/local/lib/python3.5/dist-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)






    






  
    
      
      Unnamed: 0
      window
      window.1
      mallet_TC_W2V
      mallet_TC_W2V_k
      lda_TC_W2V
      lda_TC_W2V_k
      lsi_TC_W2V
      lsi_TC_W2V_k
      NMF_TC_W2V
      NMF_TC_W2V_k
      NMF_Unify
      NMF_Unify_k
      mallet_Unify
      mallet_Unify_k
      lda_Unify
      lda_Unify_k
      lsi_Unify
      lsi_Unify_k
    
  
  
    
      0
      0
      2012_01
      2012_01
      0.373322
      22
      0.367360
      12
      0.344060
      18
      0.371245
      22
      0.686599
      16
      0.570021
      14
      0.371903
      18
      0.513629
      12
    
    
      1
      1
      2012_02
      2012_02
      0.371982
      26
      0.356412
      14
      0.356953
      12
      0.360304
      24
      0.658763
      12
      0.571140
      26
      0.448183
      22
      0.525276
      10
    
    
      2
      2
      2012_03
      2012_03
      0.376637
      28
      0.358276
      20
      0.354839
      10
      0.374951
      24
      0.662382
      24
      0.573819
      14
      0.350011
      30
      0.481642
      12
    
    
      3
      3
      2012_04
      2012_04
      0.372540
      24
      0.356079
      10
      0.344663
      16
      0.369469
      26
      0.627686
      20
      0.582972
      20
      0.392451
      26
      0.557337
      12
    
    
      4
      4
      2012_05
      2012_05
      0.385459
      16
      0.369615
      10
      0.385599
      10
      0.381602
      18
      0.615484
      30
      0.586302
      16
      0.356175
      20
      0.461101
      12
    
  








    






  
    
      
      Unnamed: 0
      window
      window.1
      mallet_TC_W2V
      mallet_TC_W2V_k
      lda_TC_W2V
      lda_TC_W2V_k
      lsi_TC_W2V
      lsi_TC_W2V_k
      NMF_TC_W2V
      NMF_TC_W2V_k
      NMF_Unify
      NMF_Unify_k
      mallet_Unify
      mallet_Unify_k
      lda_Unify
      lda_Unify_k
      lsi_Unify
      lsi_Unify_k
    
  
  
    
      62
      62
      2017_03
      2017_03
      0.368832
      22
      0.344859
      28
      0.336967
      24
      0.375299
      28
      0.690587
      22
      0.577780
      12
      0.369719
      26
      0.376879
      18
    
    
      63
      63
      2017_04
      2017_04
      0.366006
      12
      0.341303
      20
      0.326046
      12
      0.374008
      24
      0.650894
      20
      0.583312
      20
      0.468642
      14
      0.385933
      10
    
    
      64
      64
      2017_05
      2017_05
      0.360582
      22
      0.333149
      28
      0.319035
      12
      0.357726
      30
      0.677018
      14
      0.583450
      28
      0.320494
      26
      0.389233
      10
    
    
      65
      65
      2017_06
      2017_06
      0.381813
      28
      0.335946
      12
      0.328623
      22
      0.373138
      18
      0.725945
      10
      0.607344
      12
      0.301083
      24
      0.342559
      30
    
    
      66
      66
      2017_07
      2017_07
      0.349201
      22
      0.339650
      28
      0.322636
      30
      0.347636
      22
      0.694142
      16
      0.582325
      22
      0.335810
      28
      0.452428
      26
    
  








    



CPU times: user 2h 17min 30s, sys: 4min 4s, total: 2h 21min 35s
Wall time: 16h 8min 9s



In [ ]:

    
# score_df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')


# display(score_df.tail())
# df_nmf = pd.read_csv('dynamic_nmf/data/windowbin/csv/window_topic_coherence_results.csv') 


# # df = df.drop(df.columns[0], axis=1)

# display(df_nmf.ix[:, 1:-1])

# score_list =  {}
# for column in df_nmf.ix[:, 1:-1]:
#     scoretmp = []
# #     print(column)
# #     print(df[column][0].strip("{}").split(', '))
#     for score in (df_nmf[column][0].strip("{}").split(', ')):
#         scoretmp.append( [int(score.split(': ')[0]), float(score.split(': ')[1])] )

# #     sorted(scoretmp)
#     score_list[column[-7:]] = scoretmp

    
# y_coherence, y_k = sort_coherence(score_list)  

# print(y_coherence)
# score_df['nmf_TC_W2V'] = y_coherence
# score_df['nmf_TC_W2V_k'] = y_k
# display(score_df.tail())

# df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')



In [12]:

    
import os.path


def get_nmf_topics(window, k, dictionary, topn=20):
    
    topic_list = []
    topic_df = pd.read_pickle('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))

    for c in topic_df.ix[:,1:].columns:
        # filter out any token not in 
        if dictionary is not None:
            topic = [x for x in topic_df[c].tolist() if x in dictionary.token2id.keys()]
        else:
            topic = topic_df[c].tolist() 
        topic_list.append(topic[:topn])
        
#     print(topic_list)
    return topic_list



In [20]:

    
def execute_coherence_gensim_nmf(topic_model_path, actual_kmin=10, actual_kmax=30):
    
    dictionary_filepath = 'data/eos/dic_bow/bigram_dict_%s.dict' 
    bow_filepath = 'data/eos/dic_bow/bigram_bow_corpus_%s.mm'    
    
    model_list = glob.glob(topic_model_path)
    model_list.sort()

    indices = []
    for fname in model_list:
        indices.append(fname[-28:-21])
        
    indices = list(set(indices))
    indices.sort()
    print(indices)

    coherence_list = {}   
    for window in indices:
        temp = []
        # Load to memory
        corpus = gensim.corpora.MmCorpus(bow_filepath % window)
        dictionary = gensim.corpora.Dictionary.load(dictionary_filepath % window)  
        corpus_text = MyDocuments('dynamic_nmf/data/windowbin/slices/tokenized_window_%s.gz' % window)  
        
        for k in range(actual_kmin, actual_kmax + 2, 2):
            #             get_nmf_topics(window, k, dictionary, 10)
            if(not os.path.isfile('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))):
                print('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))
                coherence_score = np.nan
            else:         
                cm = CoherenceModel(dictionary=dictionary, corpus=corpus, 
                                    texts=corpus_text, topics=get_nmf_topics(window, k, dictionary, 20), 
                                    coherence='c_v')

                coherence_score = cm.get_coherence()                
            log.info("Model coherence window=%s (k=%d) = %.4f" % (window, k, coherence_score))
            temp.append ([k, coherence_score])
            
        coherence_list[window] = temp
    

    return coherence_list, indices



In [16]:

    
def execute_coherence_w2v_NMF(topic_model_path,actual_kmin=10, actual_kmax=30):
    start_time = time.time()
    
    model_list = glob.glob(topic_model_path)
    model_list.sort()

    model_path = 'data/eos/word2vec_model_all.model'
    log.info("Loading Word2Vec model from %s ..." % model_path)
    model = gensim.models.Word2Vec.load(model_path)
    
    metric = ModelSimilarity(model)
    validation_measure = WithinTopicMeasure(metric)

    indices = []
    for fname in model_list:
        indices.append(fname[-28:-21])
        
    indices = list(set(indices))
    indices.sort()
    print(indices)
    
    coherence_list = {}   
    for window in indices:
        temp = []

        for k in range(actual_kmin, actual_kmax + 2, 2):
            #             get_nmf_topics(window, k, dictionary, 10)
            if(not os.path.isfile('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))):
                print('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))
                coherence_score = np.nan
            else:
                coherence_score = validation_measure.evaluate_rankings(
                    get_nmf_topics(window, k, None, 20))
                
              
            log.info("Model coherence window=%s (k=%d) = %.4f" % (window, k, coherence_score))
            temp.append ([k, coherence_score])
            
        coherence_list[window] = temp
 
        
    elapsed_time = time.time() - start_time
    print('took %s to process %s' % (elapsed_time, topic_model_path) )
    return coherence_list, indices



In [21]:

    
%%time


model_list = {'NMF' : 'dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl'}

df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')
display(df.tail())

# Calculate NMF modele result -
for name, path in model_list.items():
    print (name, path)    
    coherence_list, indices = execute_coherence_gensim_nmf(path)
    y_coherence, y_k = sort_coherence(coherence_list)
    df['%s_Unify' % name] = y_coherence
    df['%s_Unify_k' % name] = y_k

display(df)    
df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')









    






  
    
      
      window
      window.1
      mallet_TC_W2V
      mallet_TC_W2V_k
      lda_TC_W2V
      lda_TC_W2V_k
      lsi_TC_W2V
      lsi_TC_W2V_k
      NMF_TC_W2V
      NMF_TC_W2V_k
    
  
  
    
      62
      2017_03
      2017_03
      0.368832
      22
      0.344859
      28
      0.336967
      24
      0.375299
      28
    
    
      63
      2017_04
      2017_04
      0.366006
      12
      0.341303
      20
      0.326046
      12
      0.374008
      24
    
    
      64
      2017_05
      2017_05
      0.360582
      22
      0.333149
      28
      0.319035
      12
      0.357726
      30
    
    
      65
      2017_06
      2017_06
      0.381813
      28
      0.335946
      12
      0.328623
      22
      0.373138
      18
    
    
      66
      2017_07
      2017_07
      0.349201
      22
      0.339650
      28
      0.322636
      30
      0.347636
      22
    
  








    



NMF dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k10.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k12.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k14.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k16.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k18.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k20.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k22.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k24.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k26.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k30.pkl






    



/usr/local/lib/python3.5/dist-packages/numpy/core/fromnumeric.py:2909: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/usr/local/lib/python3.5/dist-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)






    



dynamic_nmf/data/windowbin/result/window.df/window_2015_02_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_02_k30.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k16.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k18.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k20.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k22.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k24.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k26.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k30.pkl






    






  
    
      
      window
      window.1
      mallet_TC_W2V
      mallet_TC_W2V_k
      lda_TC_W2V
      lda_TC_W2V_k
      lsi_TC_W2V
      lsi_TC_W2V_k
      NMF_TC_W2V
      NMF_TC_W2V_k
      NMF_Unify
      NMF_Unify_k
    
  
  
    
      0
      2012_01
      2012_01
      0.373322
      22
      0.367360
      12
      0.344060
      18
      0.371245
      22
      0.686599
      16
    
    
      1
      2012_02
      2012_02
      0.371982
      26
      0.356412
      14
      0.356953
      12
      0.360304
      24
      0.658763
      12
    
    
      2
      2012_03
      2012_03
      0.376637
      28
      0.358276
      20
      0.354839
      10
      0.374951
      24
      0.662382
      24
    
    
      3
      2012_04
      2012_04
      0.372540
      24
      0.356079
      10
      0.344663
      16
      0.369469
      26
      0.627686
      20
    
    
      4
      2012_05
      2012_05
      0.385459
      16
      0.369615
      10
      0.385599
      10
      0.381602
      18
      0.615484
      30
    
    
      5
      2012_06
      2012_06
      0.376252
      22
      0.366449
      10
      0.365106
      10
      0.385673
      30
      0.630452
      22
    
    
      6
      2012_07
      2012_07
      0.376602
      20
      0.365431
      20
      0.356745
      16
      0.383768
      18
      0.593058
      18
    
    
      7
      2012_08
      2012_08
      0.385489
      10
      0.372908
      30
      0.362992
      10
      0.386008
      14
      0.603607
      28
    
    
      8
      2012_09
      2012_09
      0.369887
      22
      0.369904
      10
      0.338247
      10
      0.364662
      26
      0.612656
      24
    
    
      9
      2012_10
      2012_10
      0.385737
      20
      0.350987
      18
      0.353095
      20
      0.369448
      22
      0.598698
      10
    
    
      10
      2012_11
      2012_11
      0.387742
      14
      0.369008
      20
      0.359603
      10
      0.361450
      10
      0.569218
      30
    
    
      11
      2012_12
      2012_12
      0.367923
      12
      0.366609
      10
      0.367527
      10
      0.394530
      10
      0.622516
      14
    
    
      12
      2013_01
      2013_01
      0.383147
      20
      0.369593
      10
      0.378486
      12
      0.392638
      12
      0.756288
      12
    
    
      13
      2013_02
      2013_02
      0.386742
      10
      0.382875
      18
      0.388220
      10
      0.403955
      12
      0.698672
      30
    
    
      14
      2013_03
      2013_03
      0.376408
      28
      0.372112
      22
      0.369501
      10
      0.380131
      16
      0.690384
      16
    
    
      15
      2013_04
      2013_04
      0.379029
      24
      0.379843
      20
      0.365484
      18
      0.395827
      24
      0.695751
      30
    
    
      16
      2013_05
      2013_05
      0.380623
      20
      0.383695
      20
      0.374426
      10
      0.391170
      10
      0.732174
      10
    
    
      17
      2013_06
      2013_06
      0.379687
      30
      0.385105
      12
      0.359631
      10
      0.395412
      28
      0.748097
      12
    
    
      18
      2013_07
      2013_07
      0.377045
      12
      0.366235
      12
      0.379858
      12
      0.393422
      16
      0.716875
      12
    
    
      19
      2013_08
      2013_08
      0.372550
      18
      0.380078
      12
      0.367726
      10
      0.382074
      12
      0.730104
      10
    
    
      20
      2013_09
      2013_09
      0.370974
      18
      0.355240
      10
      0.341069
      10
      0.371867
      20
      0.702392
      14
    
    
      21
      2013_10
      2013_10
      0.418281
      14
      0.350561
      26
      0.374440
      14
      0.385737
      24
      0.682867
      12
    
    
      22
      2013_11
      2013_11
      0.372005
      24
      0.340547
      20
      0.357003
      30
      0.386085
      28
      0.764088
      12
    
    
      23
      2013_12
      2013_12
      0.377475
      28
      0.378226
      18
      0.367346
      10
      0.396046
      10
      0.711426
      24
    
    
      24
      2014_01
      2014_01
      0.378321
      28
      0.375230
      10
      0.365671
      10
      0.404594
      26
      0.668364
      10
    
    
      25
      2014_02
      2014_02
      0.374724
      26
      0.380598
      18
      0.378763
      10
      0.397362
      18
      0.641937
      28
    
    
      26
      2014_03
      2014_03
      0.387159
      14
      0.392522
      14
      0.380349
      10
      0.403873
      10
      0.700665
      22
    
    
      27
      2014_04
      2014_04
      0.378734
      30
      0.363843
      16
      0.362907
      10
      0.391023
      12
      0.692710
      30
    
    
      28
      2014_05
      2014_05
      0.382013
      18
      0.354075
      12
      0.341620
      30
      0.391467
      26
      0.673758
      30
    
    
      29
      2014_06
      2014_06
      0.389454
      28
      0.403460
      20
      0.397063
      10
      0.405224
      22
      0.632361
      18
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      37
      2015_02
      2015_02
      0.343132
      28
      0.374119
      18
      0.369051
      10
      0.370054
      10
      NaN
      10
    
    
      38
      2015_03
      2015_03
      0.358734
      14
      0.349748
      10
      0.368912
      10
      0.351930
      24
      0.694756
      14
    
    
      39
      2015_04
      2015_04
      0.345661
      16
      0.362625
      26
      0.334458
      10
      0.367630
      10
      0.553187
      10
    
    
      40
      2015_05
      2015_05
      0.357787
      14
      0.359777
      24
      0.344511
      18
      0.355867
      12
      0.650168
      16
    
    
      41
      2015_06
      2015_06
      0.358147
      10
      0.349219
      28
      0.337925
      12
      0.358493
      14
      0.584301
      10
    
    
      42
      2015_07
      2015_07
      0.361747
      10
      0.385091
      10
      0.338774
      10
      0.364008
      10
      0.558097
      18
    
    
      43
      2015_08
      2015_08
      0.351548
      10
      0.354375
      10
      0.322496
      10
      0.343917
      28
      0.635225
      10
    
    
      44
      2015_09
      2015_09
      0.358648
      10
      0.359002
      30
      0.359048
      10
      0.358920
      10
      0.579560
      18
    
    
      45
      2015_10
      2015_10
      0.390945
      26
      0.356480
      22
      0.377019
      14
      0.391810
      14
      0.684972
      12
    
    
      46
      2015_11
      2015_11
      0.381952
      26
      0.331720
      30
      0.355430
      10
      0.385261
      26
      0.672617
      24
    
    
      47
      2015_12
      2015_12
      0.377789
      30
      0.338472
      24
      0.356513
      14
      0.385321
      30
      0.703951
      26
    
    
      48
      2016_01
      2016_01
      0.371399
      24
      0.318332
      24
      0.353870
      16
      0.384108
      24
      0.756247
      28
    
    
      49
      2016_02
      2016_02
      0.377769
      26
      0.321813
      20
      0.364931
      18
      0.403619
      20
      0.706451
      24
    
    
      50
      2016_03
      2016_03
      0.372038
      28
      0.318642
      24
      0.372687
      20
      0.381604
      22
      0.711577
      20
    
    
      51
      2016_04
      2016_04
      0.370497
      20
      0.313523
      20
      0.345647
      18
      0.387416
      22
      0.698371
      30
    
    
      52
      2016_05
      2016_05
      0.378633
      30
      0.324715
      14
      0.386714
      10
      0.390143
      22
      0.661451
      28
    
    
      53
      2016_06
      2016_06
      0.369599
      24
      0.322562
      30
      0.341940
      28
      0.379794
      22
      0.658772
      18
    
    
      54
      2016_07
      2016_07
      0.372861
      24
      0.316271
      30
      0.334943
      30
      0.363780
      30
      0.707597
      18
    
    
      55
      2016_08
      2016_08
      0.370081
      26
      0.317142
      12
      0.341775
      20
      0.377458
      26
      0.653474
      26
    
    
      56
      2016_09
      2016_09
      0.381448
      26
      0.320212
      10
      0.343436
      18
      0.375838
      30
      0.650172
      22
    
    
      57
      2016_10
      2016_10
      0.377324
      16
      0.373840
      20
      0.356631
      20
      0.382469
      16
      0.631388
      22
    
    
      58
      2016_11
      2016_11
      0.381070
      12
      0.350985
      28
      0.372430
      14
      0.389218
      16
      0.665048
      28
    
    
      59
      2016_12
      2016_12
      0.365415
      30
      0.348647
      10
      0.336426
      16
      0.371006
      30
      0.715129
      14
    
    
      60
      2017_01
      2017_01
      0.363971
      22
      0.339760
      18
      0.332809
      20
      0.362297
      18
      0.716772
      26
    
    
      61
      2017_02
      2017_02
      0.360129
      18
      0.330765
      22
      0.324862
      14
      0.368217
      14
      0.714027
      14
    
    
      62
      2017_03
      2017_03
      0.368832
      22
      0.344859
      28
      0.336967
      24
      0.375299
      28
      0.690587
      22
    
    
      63
      2017_04
      2017_04
      0.366006
      12
      0.341303
      20
      0.326046
      12
      0.374008
      24
      0.650894
      20
    
    
      64
      2017_05
      2017_05
      0.360582
      22
      0.333149
      28
      0.319035
      12
      0.357726
      30
      0.677018
      14
    
    
      65
      2017_06
      2017_06
      0.381813
      28
      0.335946
      12
      0.328623
      22
      0.373138
      18
      0.725945
      10
    
    
      66
      2017_07
      2017_07
      0.349201
      22
      0.339650
      28
      0.322636
      30
      0.347636
      22
      0.694142
      16
    
  

67 rows × 12 columns







    



CPU times: user 45min 5s, sys: 1min 21s, total: 46min 27s
Wall time: 6h 10min 55s



In [15]:

    
%%time


model_list = {'NMF' : 'dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl'}

# TC-W2V NMF models 
for name, path in model_list.items():
    print (name, path)    
    coherence_list, indices = execute_coherence_w2v_NMF(path)
    y_coherence, y_k = sort_coherence(coherence_list)
    df['%s_TC_W2V' % name] = y_coherence
    df['%s_TC_W2V_k' % name] = y_k
#     break
    
display(df)    
df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')









    



NMF dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k10.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k12.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k14.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k16.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k18.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k20.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k22.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k24.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k26.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k30.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_02_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_02_k30.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k16.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k18.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k20.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k22.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k24.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k26.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k30.pkl
took 45.993279695510864 to process dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl






    






  
    
      
      window
      mallet_TC_W2V
      mallet_TC_W2V_k
      lda_TC_W2V
      lda_TC_W2V_k
      lsi_TC_W2V
      lsi_TC_W2V_k
      NMF_TC_W2V
      NMF_TC_W2V_k
    
    
      window
      
      
      
      
      
      
      
      
      
    
  
  
    
      2012_01
      2012_01
      0.373322
      22
      0.367360
      12
      0.344060
      18
      0.371245
      22
    
    
      2012_02
      2012_02
      0.371982
      26
      0.356412
      14
      0.356953
      12
      0.360304
      24
    
    
      2012_03
      2012_03
      0.376637
      28
      0.358276
      20
      0.354839
      10
      0.374951
      24
    
    
      2012_04
      2012_04
      0.372540
      24
      0.356079
      10
      0.344663
      16
      0.369469
      26
    
    
      2012_05
      2012_05
      0.385459
      16
      0.369615
      10
      0.385599
      10
      0.381602
      18
    
    
      2012_06
      2012_06
      0.376252
      22
      0.366449
      10
      0.365106
      10
      0.385673
      30
    
    
      2012_07
      2012_07
      0.376602
      20
      0.365431
      20
      0.356745
      16
      0.383768
      18
    
    
      2012_08
      2012_08
      0.385489
      10
      0.372908
      30
      0.362992
      10
      0.386008
      14
    
    
      2012_09
      2012_09
      0.369887
      22
      0.369904
      10
      0.338247
      10
      0.364662
      26
    
    
      2012_10
      2012_10
      0.385737
      20
      0.350987
      18
      0.353095
      20
      0.369448
      22
    
    
      2012_11
      2012_11
      0.387742
      14
      0.369008
      20
      0.359603
      10
      0.361450
      10
    
    
      2012_12
      2012_12
      0.367923
      12
      0.366609
      10
      0.367527
      10
      0.394530
      10
    
    
      2013_01
      2013_01
      0.383147
      20
      0.369593
      10
      0.378486
      12
      0.392638
      12
    
    
      2013_02
      2013_02
      0.386742
      10
      0.382875
      18
      0.388220
      10
      0.403955
      12
    
    
      2013_03
      2013_03
      0.376408
      28
      0.372112
      22
      0.369501
      10
      0.380131
      16
    
    
      2013_04
      2013_04
      0.379029
      24
      0.379843
      20
      0.365484
      18
      0.395827
      24
    
    
      2013_05
      2013_05
      0.380623
      20
      0.383695
      20
      0.374426
      10
      0.391170
      10
    
    
      2013_06
      2013_06
      0.379687
      30
      0.385105
      12
      0.359631
      10
      0.395412
      28
    
    
      2013_07
      2013_07
      0.377045
      12
      0.366235
      12
      0.379858
      12
      0.393422
      16
    
    
      2013_08
      2013_08
      0.372550
      18
      0.380078
      12
      0.367726
      10
      0.382074
      12
    
    
      2013_09
      2013_09
      0.370974
      18
      0.355240
      10
      0.341069
      10
      0.371867
      20
    
    
      2013_10
      2013_10
      0.418281
      14
      0.350561
      26
      0.374440
      14
      0.385737
      24
    
    
      2013_11
      2013_11
      0.372005
      24
      0.340547
      20
      0.357003
      30
      0.386085
      28
    
    
      2013_12
      2013_12
      0.377475
      28
      0.378226
      18
      0.367346
      10
      0.396046
      10
    
    
      2014_01
      2014_01
      0.378321
      28
      0.375230
      10
      0.365671
      10
      0.404594
      26
    
    
      2014_02
      2014_02
      0.374724
      26
      0.380598
      18
      0.378763
      10
      0.397362
      18
    
    
      2014_03
      2014_03
      0.387159
      14
      0.392522
      14
      0.380349
      10
      0.403873
      10
    
    
      2014_04
      2014_04
      0.378734
      30
      0.363843
      16
      0.362907
      10
      0.391023
      12
    
    
      2014_05
      2014_05
      0.382013
      18
      0.354075
      12
      0.341620
      30
      0.391467
      26
    
    
      2014_06
      2014_06
      0.389454
      28
      0.403460
      20
      0.397063
      10
      0.405224
      22
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      2015_02
      2015_02
      0.343132
      28
      0.374119
      18
      0.369051
      10
      0.370054
      10
    
    
      2015_03
      2015_03
      0.358734
      14
      0.349748
      10
      0.368912
      10
      0.351930
      24
    
    
      2015_04
      2015_04
      0.345661
      16
      0.362625
      26
      0.334458
      10
      0.367630
      10
    
    
      2015_05
      2015_05
      0.357787
      14
      0.359777
      24
      0.344511
      18
      0.355867
      12
    
    
      2015_06
      2015_06
      0.358147
      10
      0.349219
      28
      0.337925
      12
      0.358493
      14
    
    
      2015_07
      2015_07
      0.361747
      10
      0.385091
      10
      0.338774
      10
      0.364008
      10
    
    
      2015_08
      2015_08
      0.351548
      10
      0.354375
      10
      0.322496
      10
      0.343917
      28
    
    
      2015_09
      2015_09
      0.358648
      10
      0.359002
      30
      0.359048
      10
      0.358920
      10
    
    
      2015_10
      2015_10
      0.390945
      26
      0.356480
      22
      0.377019
      14
      0.391810
      14
    
    
      2015_11
      2015_11
      0.381952
      26
      0.331720
      30
      0.355430
      10
      0.385261
      26
    
    
      2015_12
      2015_12
      0.377789
      30
      0.338472
      24
      0.356513
      14
      0.385321
      30
    
    
      2016_01
      2016_01
      0.371399
      24
      0.318332
      24
      0.353870
      16
      0.384108
      24
    
    
      2016_02
      2016_02
      0.377769
      26
      0.321813
      20
      0.364931
      18
      0.403619
      20
    
    
      2016_03
      2016_03
      0.372038
      28
      0.318642
      24
      0.372687
      20
      0.381604
      22
    
    
      2016_04
      2016_04
      0.370497
      20
      0.313523
      20
      0.345647
      18
      0.387416
      22
    
    
      2016_05
      2016_05
      0.378633
      30
      0.324715
      14
      0.386714
      10
      0.390143
      22
    
    
      2016_06
      2016_06
      0.369599
      24
      0.322562
      30
      0.341940
      28
      0.379794
      22
    
    
      2016_07
      2016_07
      0.372861
      24
      0.316271
      30
      0.334943
      30
      0.363780
      30
    
    
      2016_08
      2016_08
      0.370081
      26
      0.317142
      12
      0.341775
      20
      0.377458
      26
    
    
      2016_09
      2016_09
      0.381448
      26
      0.320212
      10
      0.343436
      18
      0.375838
      30
    
    
      2016_10
      2016_10
      0.377324
      16
      0.373840
      20
      0.356631
      20
      0.382469
      16
    
    
      2016_11
      2016_11
      0.381070
      12
      0.350985
      28
      0.372430
      14
      0.389218
      16
    
    
      2016_12
      2016_12
      0.365415
      30
      0.348647
      10
      0.336426
      16
      0.371006
      30
    
    
      2017_01
      2017_01
      0.363971
      22
      0.339760
      18
      0.332809
      20
      0.362297
      18
    
    
      2017_02
      2017_02
      0.360129
      18
      0.330765
      22
      0.324862
      14
      0.368217
      14
    
    
      2017_03
      2017_03
      0.368832
      22
      0.344859
      28
      0.336967
      24
      0.375299
      28
    
    
      2017_04
      2017_04
      0.366006
      12
      0.341303
      20
      0.326046
      12
      0.374008
      24
    
    
      2017_05
      2017_05
      0.360582
      22
      0.333149
      28
      0.319035
      12
      0.357726
      30
    
    
      2017_06
      2017_06
      0.381813
      28
      0.335946
      12
      0.328623
      22
      0.373138
      18
    
    
      2017_07
      2017_07
      0.349201
      22
      0.339650
      28
      0.322636
      30
      0.347636
      22
    
  

67 rows × 9 columns







    



CPU times: user 46.1 s, sys: 156 ms, total: 46.3 s
Wall time: 46.3 s



In [ ]:



In [164]:

    
def graph_coherence_score(df, columns, title):
    
    ax = df[columns].plot()
    plt.xticks(rotation=90, fontsize=4)
    plt.xlabel('time windows')
    plt.ylabel('score')
    plt.title('%s coherence' % title)
    plt.legend(loc='best')
    plt.show()
    plt.savefig('data/eos/graphs/%s_%s_coherence_plot.png' % (title, 'all'), dpi=800)    

def graph_k(df, columns, title):
    ax = df[columns].plot(kind='bar', stacked=True)
    plt.xticks(rotation=90, fontsize=4)
    plt.xlabel('time windows')
    plt.ylabel('k')
    title= '%s top k' % title
    plt.title(title)
    plt.legend(loc='best')
    plt.show()
    plt.savefig('data/eos/graphs/%s_%s_topK_plot.png' % (title, 'all'), dpi=800)



In [167]:

    
%%time


df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')
print (df.columns.tolist())
df.set_index(df['window'], inplace=True)
df['NMF_TC_W2V'].fillna(method='bfill', inplace=True)
df['NMF_Unify'].fillna(method='bfill', inplace=True)


df['NMF_TC_W2V'].replace(0, df['NMF_TC_W2V'].mean(), inplace=True)
df['NMF_Unify'].replace(0, df['NMF_Unify'].mean(), inplace=True)

display(df.tail())



print (df.describe())


ax = None
title= 'TC_W2V probabilistic'
graph_coherence_score(df, ['mallet_TC_W2V', 'lda_TC_W2V', 'lsi_TC_W2V'], title)
graph_k(df, ['mallet_TC_W2V_k', 'lda_TC_W2V_k', 'lsi_TC_W2V_k'], title)

title= 'TC_W2V comparison'
graph_coherence_score(df, ['mallet_TC_W2V', 'NMF_TC_W2V'], title)
graph_k(df, ['mallet_TC_W2V_k', 'NMF_TC_W2V_k'], title)

title= 'Unify probabilistic'
graph_coherence_score(df, ['mallet_Unify', 'lda_Unify', 'lsi_Unify'], title)
graph_k(df, ['mallet_Unify_k', 'lda_Unify_k', 'lsi_Unify_k'], title)  

title= 'Unify comparison'
graph_coherence_score(df, ['mallet_Unify',  'NMF_Unify'], title)
graph_k(df, ['mallet_Unify_k', 'NMF_Unify_k'], title)









    



['Unnamed: 0', 'Unnamed: 0.1', 'window', 'window.1', 'mallet_TC_W2V', 'mallet_TC_W2V_k', 'lda_TC_W2V', 'lda_TC_W2V_k', 'lsi_TC_W2V', 'lsi_TC_W2V_k', 'NMF_TC_W2V', 'NMF_TC_W2V_k', 'NMF_Unify', 'NMF_Unify_k', 'mallet_Unify', 'mallet_Unify_k', 'lda_Unify', 'lda_Unify_k', 'lsi_Unify', 'lsi_Unify_k']






    






  
    
      
      Unnamed: 0
      Unnamed: 0.1
      window
      window.1
      mallet_TC_W2V
      mallet_TC_W2V_k
      lda_TC_W2V
      lda_TC_W2V_k
      lsi_TC_W2V
      lsi_TC_W2V_k
      NMF_TC_W2V
      NMF_TC_W2V_k
      NMF_Unify
      NMF_Unify_k
      mallet_Unify
      mallet_Unify_k
      lda_Unify
      lda_Unify_k
      lsi_Unify
      lsi_Unify_k
    
    
      window
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2017_03
      62
      62
      2017_03
      2017_03
      0.368832
      22
      0.344859
      28
      0.336967
      24
      0.375299
      28
      0.690587
      22
      0.577780
      12
      0.369719
      26
      0.376879
      18
    
    
      2017_04
      63
      63
      2017_04
      2017_04
      0.366006
      12
      0.341303
      20
      0.326046
      12
      0.374008
      24
      0.650894
      20
      0.583312
      20
      0.468642
      14
      0.385933
      10
    
    
      2017_05
      64
      64
      2017_05
      2017_05
      0.360582
      22
      0.333149
      28
      0.319035
      12
      0.357726
      30
      0.677018
      14
      0.583450
      28
      0.320494
      26
      0.389233
      10
    
    
      2017_06
      65
      65
      2017_06
      2017_06
      0.381813
      28
      0.335946
      12
      0.328623
      22
      0.373138
      18
      0.725945
      10
      0.607344
      12
      0.301083
      24
      0.342559
      30
    
    
      2017_07
      66
      66
      2017_07
      2017_07
      0.349201
      22
      0.339650
      28
      0.322636
      30
      0.347636
      22
      0.694142
      16
      0.582325
      22
      0.335810
      28
      0.452428
      26
    
  








    



       Unnamed: 0  Unnamed: 0.1  mallet_TC_W2V  mallet_TC_W2V_k  lda_TC_W2V  \
count   67.000000     67.000000      67.000000        67.000000   67.000000   
mean    33.000000     33.000000       0.374285        21.223881    0.354714   
std     19.485037     19.485037       0.012844         6.227556    0.020813   
min      0.000000      0.000000       0.342095        10.000000    0.313523   
25%     16.500000     16.500000       0.369215        16.000000    0.340154   
50%     33.000000     33.000000       0.376602        22.000000    0.356079   
75%     49.500000     49.500000       0.381630        26.000000    0.369604   
max     66.000000     66.000000       0.418281        30.000000    0.403460   

       lda_TC_W2V_k  lsi_TC_W2V  lsi_TC_W2V_k  NMF_TC_W2V  NMF_TC_W2V_k  \
count     67.000000   67.000000     67.000000   67.000000     67.000000   
mean      18.955224    0.355349     14.208955    0.381597     20.089552   
std        6.798914    0.026423      5.795864    0.016945      6.745888   
min       10.000000    0.210950     10.000000    0.343917     10.000000   
25%       12.000000    0.341698     10.000000    0.369761     14.000000   
50%       20.000000    0.357003     12.000000    0.382074     22.000000   
75%       24.000000    0.369276     18.000000    0.392224     26.000000   
max       30.000000    0.400399     30.000000    0.422887     30.000000   

       NMF_Unify  NMF_Unify_k  mallet_Unify  mallet_Unify_k  lda_Unify  \
count  67.000000    67.000000     67.000000       67.000000  67.000000   
mean    0.672085    18.865672      0.570267       21.582090   0.346530   
std     0.049464     7.008427      0.046503        5.934361   0.059256   
min     0.553187    10.000000      0.342099       10.000000   0.249121   
25%     0.642056    12.000000      0.563148       18.000000   0.304975   
50%     0.682867    18.000000      0.577800       22.000000   0.337970   
75%     0.707024    24.000000      0.597061       26.000000   0.374178   
max     0.764088    30.000000      0.627599       30.000000   0.592133   

       lda_Unify_k  lsi_Unify  lsi_Unify_k  
count    67.000000  66.000000    67.000000  
mean     21.432836   0.458723    13.492537  
std       6.731257   0.059270     6.162873  
min      10.000000   0.342559    10.000000  
25%      16.000000   0.409362    10.000000  
50%      22.000000   0.461874    10.000000  
75%      27.000000   0.497757    12.000000  
max      30.000000   0.579197    30.000000  






    














    











    














    











    














    











    














    











    














    











    














    











    














    











    














    











    



CPU times: user 10.4 s, sys: 1.66 s, total: 12.1 s
Wall time: 10.9 s



In [9]:

    
def get_nmf_dynamic_topics(k, dictionary, topn=20):
    
    topic_list = []
    topic_df = pd.read_pickle('dynamic_nmf/data/windowbin/result/dynamic.df/dynamic_k%s.pkl' % (k))

    for c in topic_df.ix[:,1:].columns:
        # filter out any token not in 
        if dictionary is not None:
            topic = [x for x in topic_df[c].tolist() if x in dictionary.token2id.keys()]
        else:
            topic = topic_df[c].tolist() 
        topic_list.append(topic[:topn])
        
#     print(topic_list)
    return topic_list



In [85]:

    
%time


log.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=log.INFO)

actual_kmin=10
actual_kmax=100
df_dynamic = pd.DataFrame(index=list(range(actual_kmin, actual_kmax + 2, 2)))

                          
model_path = 'data/eos/word2vec_model_all.model'
log.info("Loading Word2Vec model from %s ..." % model_path)
model = gensim.models.Word2Vec.load(model_path)

metric = ModelSimilarity(model)
validation_measure = WithinTopicMeasure(metric)


# TC-W2V NMF models 
name = 'NMF_dynamic_tc-w2v'
print (name)    

coherence = []
for k in range(actual_kmin, actual_kmax + 2, 2):
    get_nmf_dynamic_topics(k, None, 20)
    coherence_score = validation_measure.evaluate_rankings(
            get_nmf_dynamic_topics(k, None, 20))

    print("Model coherence dynamic (k=%d) = %.4f" % (k, coherence_score))
    coherence.append(coherence_score)

print(coherence)

df_dynamic['%s_TC_W2V' % name] = coherence

display(df_dynamic)    
df_dynamic.to_csv('dynamic_nmf/data/windowbin/csv/dynamic_coherence_k_performance.csv')









    



CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.58 µs
NMF_dynamic_tc-w2v
Model coherence dynamic (k=10) = 0.4356
Model coherence dynamic (k=12) = 0.4188
Model coherence dynamic (k=14) = 0.4212
Model coherence dynamic (k=16) = 0.4112
Model coherence dynamic (k=18) = 0.4116
Model coherence dynamic (k=20) = 0.4124
Model coherence dynamic (k=22) = 0.4111
Model coherence dynamic (k=24) = 0.4075
Model coherence dynamic (k=26) = 0.4048
Model coherence dynamic (k=28) = 0.4063
Model coherence dynamic (k=30) = 0.4078
Model coherence dynamic (k=32) = 0.4071
Model coherence dynamic (k=34) = 0.4111
Model coherence dynamic (k=36) = 0.4118
Model coherence dynamic (k=38) = 0.4119
Model coherence dynamic (k=40) = 0.4121
Model coherence dynamic (k=42) = 0.4127
Model coherence dynamic (k=44) = 0.4116
Model coherence dynamic (k=46) = 0.4085
Model coherence dynamic (k=48) = 0.4077
Model coherence dynamic (k=50) = 0.4119
Model coherence dynamic (k=52) = 0.4073
Model coherence dynamic (k=54) = 0.4057
Model coherence dynamic (k=56) = 0.4087
Model coherence dynamic (k=58) = 0.4054
Model coherence dynamic (k=60) = 0.4049
Model coherence dynamic (k=62) = 0.4037
Model coherence dynamic (k=64) = 0.4016
Model coherence dynamic (k=66) = 0.3992
Model coherence dynamic (k=68) = 0.4036
Model coherence dynamic (k=70) = 0.4035
Model coherence dynamic (k=72) = 0.4034
Model coherence dynamic (k=74) = 0.4036
Model coherence dynamic (k=76) = 0.4010
Model coherence dynamic (k=78) = 0.4025
Model coherence dynamic (k=80) = 0.3999
Model coherence dynamic (k=82) = 0.3989
Model coherence dynamic (k=84) = 0.3997
Model coherence dynamic (k=86) = 0.4008
Model coherence dynamic (k=88) = 0.4030
Model coherence dynamic (k=90) = 0.4003
Model coherence dynamic (k=92) = 0.3990
Model coherence dynamic (k=94) = 0.3993
Model coherence dynamic (k=96) = 0.4010
Model coherence dynamic (k=98) = 0.4000
Model coherence dynamic (k=100) = 0.3989
[0.43558251710222085, 0.41878171980054857, 0.42124753384467567, 0.41122089970597364, 0.41164816784264041, 0.41240486115331842, 0.41105816954271646, 0.40751686464697973, 0.40481427279611248, 0.40628028383916986, 0.40780652618912072, 0.40707285165923679, 0.41109877112004828, 0.41176407969384399, 0.41193576538593951, 0.41205235363049358, 0.41266146583333646, 0.4116385111469873, 0.40846269162858245, 0.4076680482684214, 0.41185767759162739, 0.40729751288400162, 0.40569684912949844, 0.40866860616530992, 0.40539166023667061, 0.40487143859536995, 0.40373536477744421, 0.40163646080418197, 0.39917533912041958, 0.40362254188464924, 0.40346109657009804, 0.40344426250858789, 0.40358695332550576, 0.40095118387119649, 0.40246291609985824, 0.39994385595474424, 0.39889532370122138, 0.39967259254734849, 0.40075715748832119, 0.4030340950441138, 0.40025185200399926, 0.39898919130985688, 0.39932140870381505, 0.40098986553158994, 0.40000084904370448, 0.39891480233048637]






    






  
    
      
      NMF_dynamic_tc-w2v_TC_W2V
    
  
  
    
      10
      0.435583
    
    
      12
      0.418782
    
    
      14
      0.421248
    
    
      16
      0.411221
    
    
      18
      0.411648
    
    
      20
      0.412405
    
    
      22
      0.411058
    
    
      24
      0.407517
    
    
      26
      0.404814
    
    
      28
      0.406280
    
    
      30
      0.407807
    
    
      32
      0.407073
    
    
      34
      0.411099
    
    
      36
      0.411764
    
    
      38
      0.411936
    
    
      40
      0.412052
    
    
      42
      0.412661
    
    
      44
      0.411639
    
    
      46
      0.408463
    
    
      48
      0.407668
    
    
      50
      0.411858
    
    
      52
      0.407298
    
    
      54
      0.405697
    
    
      56
      0.408669
    
    
      58
      0.405392
    
    
      60
      0.404871
    
    
      62
      0.403735
    
    
      64
      0.401636
    
    
      66
      0.399175
    
    
      68
      0.403623
    
    
      70
      0.403461
    
    
      72
      0.403444
    
    
      74
      0.403587
    
    
      76
      0.400951
    
    
      78
      0.402463
    
    
      80
      0.399944
    
    
      82
      0.398895
    
    
      84
      0.399673
    
    
      86
      0.400757
    
    
      88
      0.403034
    
    
      90
      0.400252
    
    
      92
      0.398989
    
    
      94
      0.399321
    
    
      96
      0.400990
    
    
      98
      0.400001
    
    
      100
      0.398915



In [86]:

    
display(df_dynamic.describe())









    






  
    
      
      NMF_dynamic_tc-w2v_TC_W2V
    
  
  
    
      count
      46.000000
    
    
      mean
      0.406725
    
    
      std
      0.006955
    
    
      min
      0.398895
    
    
      25%
      0.401152
    
    
      50%
      0.405544
    
    
      75%
      0.411190
    
    
      max
      0.435583



In [87]:

    
%%time

## DO NOT RUN AGAIN 2 days...
dictionary_filepath = 'data/eos/dictionary_EOS_all.dict'
bow_filepath = 'data/eos/corpus_EOS_all.mm'  

corpus = gensim.corpora.MmCorpus(bow_filepath)
print('here')
dictionary = gensim.corpora.Dictionary.load_from_text(dictionary_filepath)  
corpus_text = MyDocuments('dynamic_nmf/data/windowbin/slices/tokenized_window_all.gz') 

coherence = []


name = 'NMF_dynamic_unify'
print ('here')

for k in range(actual_kmin, actual_kmax + 2, 2):
    cm = CoherenceModel(dictionary=dictionary, corpus=corpus, 
                        texts=corpus_text, topics=get_nmf_dynamic_topics(k, dictionary, 20), 
                        coherence='c_v')

    coherence_score = cm.get_coherence()                
    print("Model coherence dynamic (k=%d) = %.4f" % (k, coherence_score))
    coherence.append(coherence_score)

print(coherence)

df_dynamic['%s_TC_W2V' % name] = coherence

display(df_dynamic)    
df_dynamic.to_csv('dynamic_nmf/data/windowbin/csv/dynamic_coherence_k_performance.csv')









    



here
here






    



Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 247, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 397, in _send_bytes
    self._send(header)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 247, in _feed
    send_bytes(obj)






    



Model coherence dynamic (k=10) = 0.5969
Model coherence dynamic (k=12) = 0.5724
Model coherence dynamic (k=14) = 0.5690
Model coherence dynamic (k=16) = 0.5471
Model coherence dynamic (k=18) = 0.5514
Model coherence dynamic (k=20) = 0.5519
Model coherence dynamic (k=22) = 0.5585
Model coherence dynamic (k=24) = 0.5574
Model coherence dynamic (k=26) = 0.5577
Model coherence dynamic (k=28) = 0.5662
Model coherence dynamic (k=30) = 0.5622
Model coherence dynamic (k=32) = 0.5714
Model coherence dynamic (k=34) = 0.5778
Model coherence dynamic (k=36) = 0.5809
Model coherence dynamic (k=38) = 0.5886
Model coherence dynamic (k=40) = 0.6010
Model coherence dynamic (k=42) = 0.6084
Model coherence dynamic (k=44) = 0.6066
Model coherence dynamic (k=46) = 0.5931
Model coherence dynamic (k=48) = 0.5866
Model coherence dynamic (k=50) = 0.6024
Model coherence dynamic (k=52) = 0.5971
Model coherence dynamic (k=54) = 0.5879
Model coherence dynamic (k=56) = 0.5952
Model coherence dynamic (k=58) = 0.5838
Model coherence dynamic (k=60) = 0.5825
Model coherence dynamic (k=62) = 0.5859
Model coherence dynamic (k=64) = 0.5850
Model coherence dynamic (k=66) = 0.5791
Model coherence dynamic (k=68) = 0.5863
Model coherence dynamic (k=70) = 0.5854
Model coherence dynamic (k=72) = 0.5803
Model coherence dynamic (k=74) = 0.5804
Model coherence dynamic (k=76) = 0.5761
Model coherence dynamic (k=78) = 0.5748
Model coherence dynamic (k=80) = 0.5771
Model coherence dynamic (k=82) = 0.5703
Model coherence dynamic (k=84) = 0.5692
Model coherence dynamic (k=86) = 0.5744
Model coherence dynamic (k=88) = 0.5746
Model coherence dynamic (k=90) = 0.5695
Model coherence dynamic (k=92) = 0.5716
Model coherence dynamic (k=94) = 0.5741
Model coherence dynamic (k=96) = 0.5736
Model coherence dynamic (k=98) = 0.5721
Model coherence dynamic (k=100) = 0.5711
[0.59685718083157935, 0.57243815670398845, 0.56895329132197026, 0.54707846683242112, 0.55137140918704242, 0.55186270140030058, 0.55845470837134059, 0.55743752327028473, 0.55767427855501883, 0.56620148424714889, 0.56222318265270921, 0.57137359173250646, 0.5778136078291054, 0.58087685945564449, 0.5885965236172247, 0.6010096272688884, 0.60836754481607058, 0.60656226372342048, 0.59306467556595066, 0.5865661419409226, 0.60237448166429564, 0.59707855499849294, 0.58793466474759892, 0.5952005474862172, 0.58381854164007219, 0.58251507820379977, 0.58593599795534312, 0.58496830541521672, 0.57908672406237216, 0.58634242321268726, 0.58542058245629314, 0.58032406219262689, 0.58036681626225683, 0.5761011045486214, 0.57478888810070772, 0.57706021208912905, 0.57027452178779592, 0.56916148291614554, 0.57439777567311323, 0.57457949881571801, 0.56949653726024396, 0.57157180018176712, 0.57414290114975475, 0.57357983234057686, 0.57209878558009886, 0.5711301616598794]






    






  
    
      
      NMF_dynamic_tc-w2v_TC_W2V
      NMF_dynamic_unify_TC_W2V
    
  
  
    
      10
      0.435583
      0.596857
    
    
      12
      0.418782
      0.572438
    
    
      14
      0.421248
      0.568953
    
    
      16
      0.411221
      0.547078
    
    
      18
      0.411648
      0.551371
    
    
      20
      0.412405
      0.551863
    
    
      22
      0.411058
      0.558455
    
    
      24
      0.407517
      0.557438
    
    
      26
      0.404814
      0.557674
    
    
      28
      0.406280
      0.566201
    
    
      30
      0.407807
      0.562223
    
    
      32
      0.407073
      0.571374
    
    
      34
      0.411099
      0.577814
    
    
      36
      0.411764
      0.580877
    
    
      38
      0.411936
      0.588597
    
    
      40
      0.412052
      0.601010
    
    
      42
      0.412661
      0.608368
    
    
      44
      0.411639
      0.606562
    
    
      46
      0.408463
      0.593065
    
    
      48
      0.407668
      0.586566
    
    
      50
      0.411858
      0.602374
    
    
      52
      0.407298
      0.597079
    
    
      54
      0.405697
      0.587935
    
    
      56
      0.408669
      0.595201
    
    
      58
      0.405392
      0.583819
    
    
      60
      0.404871
      0.582515
    
    
      62
      0.403735
      0.585936
    
    
      64
      0.401636
      0.584968
    
    
      66
      0.399175
      0.579087
    
    
      68
      0.403623
      0.586342
    
    
      70
      0.403461
      0.585421
    
    
      72
      0.403444
      0.580324
    
    
      74
      0.403587
      0.580367
    
    
      76
      0.400951
      0.576101
    
    
      78
      0.402463
      0.574789
    
    
      80
      0.399944
      0.577060
    
    
      82
      0.398895
      0.570275
    
    
      84
      0.399673
      0.569161
    
    
      86
      0.400757
      0.574398
    
    
      88
      0.403034
      0.574579
    
    
      90
      0.400252
      0.569497
    
    
      92
      0.398989
      0.571572
    
    
      94
      0.399321
      0.574143
    
    
      96
      0.400990
      0.573580
    
    
      98
      0.400001
      0.572099
    
    
      100
      0.398915
      0.571130
    
  








    



CPU times: user 2h 53min 14s, sys: 1min 46s, total: 2h 55min
Wall time: 2d 12h 24min 5s



In [169]:

    
# Graph

df_dynamic = pd.read_csv('dynamic_nmf/data/windowbin/csv/dynamic_coherence_k_performance.csv') 



# df_dynamic.describe()
df_dynamic.columns = ['k', 'NMF dynamic TC-W2V', 'NMF dynamic unify']
df_dynamic.set_index('k', inplace=True)

print('Dynamic tc-w2v')
# print(get_nmf_dynamic_topics(10, None, 20))
print(df_dynamic.sort_values(by=['NMF dynamic TC-W2V'],  ascending=False).head())

print('Dynamic unify')
print(get_nmf_dynamic_topics(42, None, 20))  
print(df_dynamic.sort_values(by=['NMF dynamic unify'],  ascending=False).head())
    
ax = df_dynamic[['NMF dynamic unify']].plot(xticks=df_dynamic.index)
plt.xticks(rotation=90, fontsize=4)
plt.xlabel('dynamic')

plt.ylabel('score')
plt.title('dynamic coherence')
plt.legend(loc='best')
plt.show()
plt.savefig('data/eos/graphs/dynamic_coherence_plot.png', dpi=800)   

df_dynamic.head()
# df_dynamic.describe()









    



Dynamic tc-w2v
    NMF dynamic TC-W2V  NMF dynamic unify
k                                        
10            0.435583           0.596857
14            0.421248           0.568953
12            0.418782           0.572438
42            0.412661           0.608368
20            0.412405           0.551863
Dynamic unify
[['can', 'will', 'one', 'year', 'go', 'like', 'get', 'make', 'people', 'say', 'time', 'good', 'world', 'know', 'see', 'just', 'now', 'work', 'think', 'many'], ['shelling', 'damascus', 'suburbs', 'regime', 'homs', 'neighborhood', 'idlib', 'forces', 'daraa', 'report', 'fierce', 'hama', 'town', 'fsa', 'city', 'martyrs', 'artillery', 'al', 'mortar', 'army'], ['syrian', 'syria', 'assad', 'opposition', 'talk', 'geneva', 'say', 'foreign', 'arab', 'peace', 'al_assad', 'regime', 'damascus', 'government', 'president_bashar', 'meeting', 'political', 'conference', 'support', 'terrorism'], ['israel', 'israeli', 'palestinian', 'jerusalem', 'palestinians', 'hamas', 'gaza', 'netanyahu', 'west_bank', 'jewish', 'israelis', 'peace', 'aqsa', 'abbas', 'palestine', 'arab', 'jews', 'jordan', 'benjamin_netanyahu', 'state'], ['turkey', 'turkish', 'erdogan', 'ankara', 'kurdish', 'istanbul', 'pkk', 'border', 'syria', 'davutoglu', 'kurds', 'tayyip_erdogan', 'syrian', 'nato', 'party', 'ypg', 'armenian', 'recep_tayyip', 'visit', 'coup'], ['iran', 'nuclear', 'iranian', 'tehran', 'sanction', 'talk', 'saudi_arabia', 'program', 'islamic', 'rouhani', 'foreign', 'power', 'israel', 'country', 'us', 'iranians', 'saudi', 'republic', 'zarif', 'united_states'], ['refugee', 'child', 'million', 'jordan', 'unhcr', 'syrian_refugee', '000', 'humanitarian', 'people', 'country', 'aid', 'flee', 'syria', 'un', 'syrians', 'conflict', 'need', 'camp', 'crisis', 'food'], ['iraq', 'iraqi', 'baghdad', 'sunni', 'maliki', 'shi', 'ite', 'government', 'kurdish', 'prime_minister', 'shiite', 'country', 'violence', 'sectarian', 'iraqis', 'kurds', 'sunnis', 'sadr', 'al', 'say'], ['russia', 'russian', 'putin', 'moscow', 'ukraine', 'lavrov', 'president_vladimir', 'kremlin', 'say', 'nato', 'syria', 'military', 'sanction', 'air', 'foreign', 'ukrainian', 'meeting', 'crimea', 'relation', 'russians'], ['oil', 'market', 'price', 'barrel', 'opec', 'production', 'crude', 'oil_price', 'percent', 'output', 'company', 'rise', 'energy', 'export', 'low', 'supply', 'cut', 'high', 'year', 'cent'], ['alert', 'click_here', 'disable', 'remove', 'add', 'daesh', 'afp', 'al', 'file', 'dubai', 'follow', 'united_nations', 'bashar_al', 'anadolu_agency', 'army', 'uae', 'business', 'al_qaeda', 'hamdan', 'dewa'], ['chemical', 'weapons', 'use', 'syria', 'opcw', 'syrian', 'attack', 'say', 'gas', 'destroy', 'prohibition', 'stockpile', 'assad', 'report', 'un', 'damascus', 'destruction', 'sarin', 'arsenal', 'mustard'], ['mosul', 'iraqi', 'forces', 'city', 'iraq', 'islamic_state', 'fallujah', 'troop', 'militant', 'daesh', 'retake', 'civilian', 'say', 'baghdad', 'offensive', 'operation', 'army', 'western', 'control', 'fighters'], ['say', 'police', 'suspect', 'man', 'attack', 'arrested', 'video', 'french', 'charge', 'terrorism', 'paris', 'terrorist', 'terror', 'attacks', 'court', 'group', 'authority', 'year_old', 'france', 'london'], ['un', 'council', 'resolution', 'syria', 'security', 'united_nations', 'say', 'annan', 'observer', 'violence', 'mission', 'secretary_general', 'international', 'humanitarian', 'league', 'peace', 'plan', 'arab', 'call', 'ceasefire'], ['full_story', 'channel_news', 'read', 'asia', '2014', 'syria', 'daily_mail', 'jerusalem', 'sunday', 'saturday', 'cbs_news', 'al_jazeera', 'monday', 'huffington_post', 'independent', 'friday', 'post', 'tuesday', 'radiofreeeurope', 'iraq'], ['quot', 'say', 'the', 'we', 'it', 'add', 'annan', 'tell', 'afp', 'there', 'this', 'reuters', 'assad', 'they', 'apos', 'if', 'but', 'terrorists', 'call', 'that'], ['window_click', 'share', 'new', 'opens', 'iraqinews', 'com', 'click', 'reddit_opens', 'google', 'twitter_opens', '_opens', 'window', 'email', 'add', 'print', 'facebook', 'archival_photo', 'source', 'baghdad', 'friend'], ['trump', 'donald_trump', 'clinton', 'president', 'republican', 'ban', 'say', 'mr', 'campaign', 'hillary_clinton', 'order', 'white_house', 'election', 'policy', 'muslim', 'candidate', 'debate', 'democratic', 'america', 'united_states'], ['lebanon', 'hezbollah', 'lebanese', 'beirut', 'syria', 'syrian', 'border', 'assad', 'tripoli', 'group', 'war', 'sunni', 'say', 'nasrallah', 'lebanese_army', 'rebel', 'shi', 'civil', 'arsal', 'fighter'], ['yemen', 'saudi', 'hadi', 'yemeni', 'saudi_arabia', 'saleh', 'al_qaeda', 'houthis', 'houthi', 'mansour', 'aden', 'sanaa', 'rebels', 'militant', 'riyadh', 'say', 'government', 'ali_abdullah', 'president_abd', 'official'], ['migrant', 'eu', 'refugee', 'europe', 'greece', 'border', 'germany', 'european', 'greek', 'say', 'european_union', 'boat', 'migration', 'turkey', 'asylum_seeker', 'country', 'merkel', 'island', 'italy', 'macedonia'], ['js', 'function', 'replyid', 'var', 'width', 'fjs', 'border', 'id', 'isloggedin', 'rferl', 'comment', 'height', 'ffffff', 'color', 'if', 'background', 'replycomment', 'solid', 'left', 'shwtimer'], ['photo', 'istanbul', 'photos', 'turkish', 'turkey', 'amid', 'curfew', 'police', 'protest', 'destruction', 'day', 'clashes', 'celebrate', 'sur', 'across', 'visitor', 'isil', 'aegean', 'photograph', 'depict'], ['apos', 'it', 'assad', 'amp', 'don', 'com', 'apo', 'au', 'reuters', 'they', 'that', 'we', 'he', 're', '2012', 've', 'ite', 'http', 'australia', 'shi'], ['egypt', 'egyptian', 'cairo', 'morsi', 'president', 'protest', 'muslim_brotherhood', 'brotherhood', 'sisi', 'islamist', 'mubarak', 'government', 'mursi', 'supporter', 'arab', 'political', 'state', 'say', 'interim', 'constitution'], ['isis', 'iraq', 'syria', 'group', 'fighters', 'video', 'terror', 'baghdadi', 'terrorist', 'attacks', 'jihadi', 'claim', 'daily_mail', 'join', 'extremist', 'independent', 'caliphate', 'leader', 'abu_bakr', 'capture'], ['match', 'seed', 'win', 'round', 'play', 'ivanovic', 'open', 'williams', 'game', 'final', 'player', 'set', 'second', 'first', 'beat', 'australian', 'tournament', 'champion', 'third', 'wimbledon'], ['provide', 'project', 'support', 'management', 'development', 'work', 'humanitarian', 'experience', 'program', 'ensure', 'need', 'programme', 'assistance', 'will', 'staff', 'health', 'activity', 'information', 'training', 'food'], ['killed', 'baghdad', 'wound', 'car', 'attack', 'bomb', 'people', 'suicide', 'police', 'attacks', 'least', 'blast', 'killing', 'bombing', 'security', 'say', 'source', 'bomber', 'capital', 'bombings'], ['canada', 'canadian', 'trudeau', 'say', 'canadians', 'ottawa', 'will', 'government', 'mission', 'nato', 'liberal', 'defence', 'minister', 'justin_trudeau', 'harper', 'liberals', 'prime_minister', 'military', 'isil', 'sajjan'], ['al', 'terrorist', 'army', 'unit', 'daraa', 'source', 'destroy', 'countryside', 'nusra', 'jabhat_al', 'group', 'area', 'province', 'terrorist_organization', 'sana', 'baghdadi', 'number', 'terrorism', 'vehicle', 'position'], ['obama', 'white_house', 'president_barack', 'president', 'washington', 'say', 'hagel', 'will', 'congress', 'us', 'american', 'republican', 'united_states', 'policy', 'administration', 'leader', 'secretary', 'bush', 'military', 'romney'], ['iraq', 'registration', 'article', 'news', 'please', 'p_articles', 'iraqupdates', 'monthly_basis', 'inbox', 'rss_feed', 'subscriber', 'relat', 'everyday', 'subscription', 'students', 'newsletter', 'register', 'archive', 'weekly', 'headline'], ['mr', 'labour', 'corbyn', 'vote', 'party', 'britain', 'mp', 'british', 'jeremy_corbyn', 'uk', 'prime_minister', 'election', 'leader', 'london', 'will', 'parliament', 'scotland', 'australia', 'cameron', 'campaign'], ['30', 'reuters', 'getty', '2016', '2017', '16', '13', '15', '14', 'image', '29', '17', '18', '19', '11', '12', '10', '27', '21', '22'], ['islamic_state', 'group', 'kurdish', 'fighters', 'say', 'militant', 'syria', 'raqqa', 'forces', 'strike', 'isil', 'coalition', 'air_strike', 'syrian', 'town', 'sdf', 'fighting', 'airstrike', 'northern', 'ypg'], ['2010', '2011', 'comment', '2009', 'arab', 'arabic', 'post', 'news', '2012', 'august', 'september', 'october', 'december', 'january', 'july', 'november', 'read', 'february', 'story', 'tags'], ['use', 'osc', 'english', 'original', 'dissemination', 'copyrighted_material', 'usage', 'reproduction', 'contain', 'purpose', 'product', 'subject', 'authorize', 'national', 'policy', 'copyright', 'united_states', 'security', 'government', 'lebanon'], ['aleppo', 'rebel', 'syrian', 'rebels', 'say', 'city', 'government', 'area', 'activist', 'killed', 'civilian', 'held', 'assad', 'syria', 'regime', 'opposition', 'syrian_observatory', 'army', 'observatory', 'human_rights'], ['china', 'chinese', 'beijing', 'north_korea', 'japan', 'xi', 'trade', 'global', 'summit', 'economic', 'asia', 'will', 'india', 'country', 'issue', 'international', 'cooperation', 'development', 'chen', 'nuclear'], ['us', 'afghanistan', 'afghan', 'military', 'taliban', 'troop', 'pakistan', 'nato', 'add', 'kabul', 'washington', 'american', 'war', 'iraq', 'united_states', 'forces', 'pentagon', 'karzai', 'force', 'defense']]
    NMF dynamic TC-W2V  NMF dynamic unify
k                                        
42            0.412661           0.608368
44            0.411639           0.606562
50            0.411858           0.602374
40            0.412052           0.601010
52            0.407298           0.597079






    














    











    Out[169]:






  
    
      
      NMF dynamic TC-W2V
      NMF dynamic unify
    
    
      k
      
      
    
  
  
    
      10
      0.435583
      0.596857
    
    
      12
      0.418782
      0.572438
    
    
      14
      0.421248
      0.568953
    
    
      16
      0.411221
      0.547078
    
    
      18
      0.411648
      0.551371



In [191]:

    
# Get the top 5 topic words in one window from LDA Mallet and NMF for comparison
nmf_model = 'dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_2016*.pkl'

mallet = 'data/eos/mallet/Malletmodel_%s_K_%s.pkl'
windows = ['2016_01', '2016_02', '2016_03', '2016_04','2016_05','2016_06'
#            '2016_07', '2016_09','2016_10','2016_11','2016_12'
          ]


df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')
df.set_index(df['window'], inplace=True)

df = df[df['window'].isin(windows)]
df.head()


for index, row in df.iterrows():
    print(index, 'NMF', row['NMF_TC_W2V_k'])
    print(get_nmf_topics(row['window'], row['NMF_TC_W2V_k'], None, 5))
    ldamodel = joblib.load(mallet % (row['window'], row['mallet_TC_W2V_k']))
    print(index, 'Mallet', row['mallet_TC_W2V_k'])
    print(get_topics(ldamodel, row['mallet_TC_W2V_k'], 5))
    

# ldamodel = joblib.load(fname)
# topic_num = 
# print(get_topics(ldamodel, topic_num, 5))









    



2016_01 NMF 24
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['child', 'humanitarian', 'un', 'aid', 'people'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['inmate', 'jail', 'hallock', 'escape', 'orange_county'], ['talk', 'opposition', 'syrian', 'geneva', 'peace'], ['iran', 'rouhani', 'sanction', 'iranian', 'nuclear'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['refugee', 'merkel', 'germany', 'border', 'migrant'], ['match', 'murray', 'australian', 'win', 'round'], ['russian', 'airspace', 'russia', 'turkey', 'turkish'], ['iran', 'saudi_arabia', 'saudi', 'shi', 'riyadh'], ['oil', 'opec', 'price', 'market', 'production'], ['iraq', 'islamic_state', 'strike', 'iraqi', 'coalition'], ['monastery', 'elijah', 'st', 'mosul', 'church'], ['video', 'paris', 'attacks', 'islamic_state', 'jihadi'], ['egypt', 'suspect', 'bomb', 'airport', 'source'], ['china', 'xi', 'chinese', 'egypt', 'president_xi'], ['migrant', 'greece', 'boat', 'turkey', 'europe'], ['turkey', 'kurdish', 'pkk', 'biden', 'ypg'], ['canada', 'canadian', 'canadians', 'defence', 'isis'], ['israel', 'israeli', 'palestinian', 'palestinians', 'jewish'], ['deir_al', 'zor', 'group', 'killed', 'syrian'], ['will', 'mr', 'say', 'can', 'one'], ['mansour', 'pickup_truck', 'car', 'crash', 'firefighter']]
2016_01 Mallet 24
[['russia', 'russian', 'nato', 'moscow', 'turkey'], ['turkey', 'turkish', 'kurdish', 'pkk', 'ankara'], ['british', 'report', 'mr', 'case', 'court'], ['refugee', 'migrant', 'europe', 'germany', 'border'], ['alert', 'click_here', 'add', 'remove', 'disable'], ['china', 'region', 'state', 'political', 'support'], ['oil', 'market', 'price', 'percent', 'low'], ['question', 'mr', 'party', 'trump', 'make'], ['jail', 'escape', 'man', 'charge', 'inmate'], ['police', 'islamic_state', 'video', 'group', '-attacks-'], ['day', 'film', 'time', 'family', 'life'], ['group', 'city', '-killed-', '-forces-', 'area'], ['britain', 'minister', 'prime_minister', 'mr', 'party'], ['israel', 'report', 'israeli', 'palestinian', '-war-'], ['iran', 'saudi_arabia', 'saudi', 'iranian', 'yemen'], ['work', 'school', 'service', 'project', 'student'], ['2016', 'jan', 'people', 'reply_alert', 'good'], ['people', 'child', 'refugee', 'million', 'aid'], ['-iraq-', 'u.s.', 'military', 'islamic_state', 'canada'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['iran', 'sanction', 'deal', 'nuclear', 'visit'], ['open', 'win', 'match', 'australian', 'time'], ['share', 'window_click', 'iraqi', '-iraq-', 'isis'], ['talk', 'syrian', 'opposition', 'peace', 'geneva']]
2016_02 NMF 20
[['photo', 'photos', 'istanbul', 'turkey', 'amid'], ['ceasefire', 'russia', 'syria', 'cessation', 'hostility'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['will', 'can', 'armenia', 'one', 'armenian'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['talk', 'geneva', 'opposition', 'syrian', 'hnc'], ['migrant', 'greece', 'refugee', 'border', 'europe'], ['turkey', 'turkish', 'kurdish', 'ypg', 'pyd'], ['iran', 'saudi_arabia', 'saudi', 'lebanon', 'troop'], ['canada', 'canadian', 'citizenship', 'canadians', 'trudeau'], ['aleppo', 'syrian', 'rebel', 'russian', 'city'], ['plane', 'passenger', 'hole', 'say', 'mogadishu'], ['trump', 'republican', 'rubio', 'bush', 'cruz'], ['baghdad', 'suicide', 'killed', 'people', 'attack'], ['oil', 'barrel', 'production', 'price', 'opec'], ['taliban', 'afghanistan', 'afghan', 'fraser', 'kabul'], ['refugee', 'jordan', 'million', 'provide', 'aid'], ['shakil', 'son', 'woman', 'say', 'judge'], ['islamic_state', 'libya', 'iraq', 'mosul', 'iraqi'], ['israel', 'israeli', 'palestinian', 'palestinians', 'jerusalem']]
2016_02 Mallet 26
[['refugee', 'migrant', 'border', 'europe', 'greece'], ['people', 'refugee', 'million', 'aid', 'jordan'], ['canada', 'canadian', 'plane', 'north', 'north_korea'], ['-killed-', 'people', '-attacks-', 'group', '-attack-'], ['child', 'family', 'woman', 'life', 'time'], ['syrian', 'group', 'talk', 'russia', 'opposition'], ['political', 'people', 'world', 'make', '-war-'], ['police', 'court', 'case', 'charge', 'group'], ['israel', 'israeli', 'afghanistan', 'palestinian', 'taliban'], ['2016', 'february', 'reuters', 'feb', 'report'], ['trump', 'mr', 'republican', 'candidate', 'win'], ['armenian', 'church', 'university', 'community', 'student'], ['turkey', 'russia', 'russian', 'turkish', 'kurdish'], ['international', 'security', 'state', 'minister', 'government'], ['day', 'photo', 'local', 'military', 'continue'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['win', 'move', 'team', 'world', 'make'], ['iran', 'military', 'u.s.', 'saudi_arabia', 'libya'], ['2016', 'feb', 'alert_moderator', 'people', 'government'], ['eu', 'britain', 'deal', 'prime_minister', 'uk'], ['oil', 'market', 'china', 'price', 'million'], ['syrian', 'aleppo', 'government', 'city', 'border'], ['day', 'continue', 'area', 'security', 'military'], ['alert', 'click_here', 'add', 'remove', 'disable'], ['work', 'service', 'include', 'provide', 'project'], ['isis', '-iraq-', 'share', 'islamic_state', 'iraqi']]
2016_03 NMF 22
[['photos', 'photo', 'istanbul', 'turkey', 'turkish'], ['islamic_state', 'say', 'strike', 'group', 'iraq'], ['brussels', 'belgian', 'attacks', 'paris', 'abdeslam'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['greece', 'migrant', 'refugee', 'eu', 'turkey'], ['talk', 'syrian', 'geneva', 'peace', 'say'], ['child', 'year', 'will', 'work', 'country'], ['palmyra', 'syrian', 'city', 'army', 'islamic_state'], ['trump', 'republican', 'rubio', 'donald_trump', 'say'], ['habib', 'girlfriend', 'gatineau', 'charge', 'court'], ['kurdish', 'turkey', 'pkk', 'attack', 'ankara'], ['putin', 'russian', 'russia', 'moscow', 'syria'], ['yemen', 'saudi', 'aden', 'houthis', 'houthi'], ['easter', 'francis', 'st', 'pope', 'jesus'], ['libya', 'tunisia', 'tunisian', 'guerdan', 'militant'], ['canada', 'canadian', 'trudeau', 'sajjan', 'war'], ['israel', 'israeli', 'palestinian', 'palestinians', 'biden'], ['iraqi', 'baghdad', 'mosul', 'iraq', 'dam'], ['north_korea', 'iran', 'sanction', 'nuclear', 'north_korean'], ['hezbollah', 'lebanon', 'saudi_arabia', 'arab', 'iran'], ['provide', 'support', 'food', 'assistance', 'aid']]
2016_03 Mallet 28
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['syrian', 'talk', 'government', 'peace', 'opposition'], ['work', 'support', 'international', 'provide', 'development'], ['government', 'party', 'parliament', 'freedom', 'journalist'], ['israel', 'israeli', 'council', 'security', 'palestinian'], ['group', 'u.s.', 'official', 'document', 'security'], ['yemen', 'saudi_arabia', 'arab', 'saudi', 'lebanon'], ['russia', 'russian', 'moscow', 'putin', 'military'], ['university', 'news', 'work', 'picture', 'book'], ['-attack-', 'turkey', '-killed-', 'turkish', 'kurdish'], ['police', 'charge', 'court', 'man', 'video'], ['armenian', 'armenia', 'world', 'church', 'genocide'], ['brussels', '-attacks-', 'paris', 'belgian', 'police'], ['alert', 'click_here', 'add', 'remove', 'palmyra'], ['oil', 'market', 'million', 'company', 'percent'], ['-iraq-', 'iraqi', 'islamic_state', '-forces-', 'group'], ['eu', 'turkey', 'europe', 'european', 'prime_minister'], ['child', 'family', 'canada', 'home', 'school'], ['islamic_state', 'group', 'libya', '-iraq-', 'military'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['world', '-war-', 'time', 'political', 'question'], ['trump', 'obama', 'u.s.', 'state', 'republican'], ['team', 'play', 'win', 'game', 'world'], ['iran', 'share', 'window_click', '-weapons-', 'nuclear'], ['march', '2016', 'report', 'reuters', 'post'], ['2016', 'mar', 'reply_alert', 'moderator', 'australia'], ['official', 'uae', 'plane', 'bank', 'system'], ['refugee', 'migrant', 'border', 'greece', 'turkey']]
2016_04 NMF 22
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['say', 'will', 'year', 'can', 'people'], ['window_click', 'share', 'new', 'iraqinews', 'opens'], ['aleppo', 'hospital', 'rebel', 'held', 'air_strike'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['obama', 'germany', 'merkel', 'trade', 'president_barack'], ['provide', 'carter', 'iraq', 'support', 'food'], ['sadr', 'green', 'parliament', 'zone', 'protester'], ['abu_sayyaf', 'ridsdel', 'philippine', 'hostage', 'ransom'], ['refugee', 'turkey', 'migrant', 'eu', 'greece'], ['soldiers', 'light', 'training', 'canadian', 'combat'], ['30', '2016', 'april', '21', '23'], ['trudeau', 'quantum_computing', 'reporter', 'canadian', 'abuzz'], ['russian', 'russia', 'putin', 'moscow', 'ukraine'], ['baghdad', 'shi', 'suicide', 'ite', 'wound'], ['chaiyakorn', 'man', 'thai', 'son', 'tourist'], ['talk', 'geneva', 'opposition', 'syrian', 'peace'], ['israel', 'israeli', 'golan_heights', 'netanyahu', 'golan'], ['armenian', 'azerbaijan', 'armenia', 'armenians', 'genocide'], ['trump', 'clinton', 'policy', 'republican', 'speech'], ['yemen', 'iran', 'saudi', 'saudi_arabia', 'kuwait'], ['isis', 'islamic_state', 'iraq', 'forces', 'syria']]
2016_04 Mallet 20
[['2016', 'april', '21', 'reuters', 'world'], ['police', '-attacks-', 'brussels', 'man', 'suspect'], ['isis', '-iraq-', 'islamic_state', 'group', '-forces-'], ['oil', 'million', 'company', 'percent', 'market'], ['child', 'family', 'woman', 'court', 'mother'], ['obama', 'president', 'mr', 'britain', 'eu'], ['world', 'state', 'international', 'member', 'call'], ['time', 'family', 'work', 'day', 'life'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['report', 'turkey', 'turkish', '2016', 'reuters'], ['alert', 'click_here', 'add', 'israel', 'remove'], ['syrian', 'aleppo', 'government', 'city', 'area'], ['share', 'military', 'troop', '-forces-', '-soldiers-'], ['development', 'agreement', 'support', 'national', 'future'], ['iran', 'saudi_arabia', 'saudi', 'obama', 'u.s.'], ['group', 'canadian', 'abu_sayyaf', 'government', 'hostage'], ['russia', 'russian', 'talk', 'peace', 'syrian'], ['2016', 'make', 'time', 'law', 'case'], ['refugee', 'migrant', 'turkey', 'child', 'europe'], ['parliament', 'police', 'man', 'government', 'protester']]
2016_05 NMF 22
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['baghdad', 'suicide', 'wound', 'killed', 'car'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['taliban', 'mansour', 'pakistan', 'afghan', 'afghanistan'], ['humanitarian', 'aid', 'international', 'health', 'world'], ['aleppo', 'syrian', 'syria', 'rebel', 'say'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['30', '2016', 'getty', 'april', 'may'], ['hezbollah', 'badreddine', 'commander', 'lebanese', 'group'], ['fallujah', 'iraqi', 'city', 'forces', 'iraq'], ['protester', 'zone', 'green', 'sadr', 'baghdad'], ['refugee', 'migrant', 'greece', 'europe', 'camp'], ['turkey', 'turkish', 'kurdish', 'pkk', 'erdogan'], ['egyptair', 'plane', 'egyptian', 'flight', 'cairo'], ['say', 'trump', 'will', 'go', 'can'], ['isis', '74', 'iraq', 'fighters', 'raqqa'], ['australia', 'police', 'man', 'boat', 'australian'], ['provide', 'fort_mcmurray', 'support', 'management', 'edmonton'], ['israel', 'israeli', 'jerusalem', 'palestinian', 'palestinians'], ['oil', 'iran', 'opec', 'saudi_arabia', 'barrel'], ['libya', 'government', 'sirte', 'islamic_state', 'say'], ['islamic_state', 'strike', 'kilis', 'syria', 'turkish']]
2016_05 Mallet 30
[['trump', 'reply_alert', 'australia', 'moderator', 'news'], ['share', 'libya', 'window_click', 'source', 'security'], ['humanitarian', 'work', 'provide', 'world', 'support'], ['israel', 'school', 'israeli', 'child', 'student'], ['city', 'area', 'month', 'home', 'local'], ['isis', 'iraqi', '-iraq-', 'city', '-forces-'], ['court', 'government', 'case', 'journalist', 'release'], ['wednesday', 'fire', 'fort_mcmurray', 'canada', 'city'], ['turkey', 'turkish', 'islamic_state', 'border', 'militant'], ['family', 'man', 'child', 'woman', 'picture'], ['party', 'president', 'political', 'government', 'election'], ['oil', 'company', 'million', 'market', 'business'], ['international', 'security', 'peace', 'state', 'council'], ['april', 'image', 'reuters', 'getty', 'london'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['iran', 'hezbollah', 'group', 'iranian', 'saudi_arabia'], ['-baghdad-', '-killed-', 'security', '-attacks-', '-iraq-'], ['alert', 'click_here', 'add', 'remove', 'disable'], ['britain', 'eu', 'uk', 'mr', 'british'], ['world', '-war-', 'make', 'time', 'fact'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['show', 'world', 'watch', 'video', 'film'], ['refugee', 'migrant', 'europe', 'turkey', 'border'], ['syrian', 'aleppo', 'government', 'city', 'civilian'], ['police', 'group', 'man', 'isis', 'terrorist'], ['plane', 'egyptian', 'flight', 'egypt', 'egyptair'], ['taliban', 'leader', '-mansour-', 'afghanistan', 'pakistan'], ['play', 'win', 'team', 'game', 'player'], ['russia', 'russian', 'military', 'nato', '-war-'], ['u.s.', '-iraq-', 'military', '-forces-', 'troop']]
2016_06 NMF 22
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['say', 'ali', 'go', 'can', 'will'], ['fallujah', 'iraqi', 'city', 'forces', 'iraq'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['mateen', 'orlando', 'fbi', 'say', 'pulse'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['aleppo', 'rebel', 'syrian', 'syria', 'aid'], ['cox', 'jo', 'mp', 'labour', 'mair'], ['airport', 'turkey', 'istanbul', 'attack', 'suicide'], ['refugee', 'unhcr', 'english', 'emergency', 'million'], ['30', '2016', 'june', 'getty', 'reuters'], ['israel', 'israeli', 'palestinian', 'palestinians', 'jerusalem'], ['isis', 'iraq', 'video', 'fighters', 'jihadi'], ['iran', 'yemen', 'saudi', 'saudi_arabia', 'say'], ['manbij', 'sdf', 'syrian', 'fighters', 'forces'], ['trump', 'clinton', 'obama', 'donald_trump', 'ban'], ['russia', 'russian', 'putin', 'assad', 'moscow'], ['yazidis', 'genocide', 'yazidi', 'say', 'crimes'], ['jordan', 'jordanian', 'border', 'attack', 'amman'], ['falluja', 'iraqi', 'baghdad', 'islamic_state', 'army'], ['project', 'development', 'work', 'program', 'provide'], ['eu', 'brexit', 'vote', 'britain', 'referendum']]
2016_06 Mallet 24
[['iraqi', 'city', '-forces-', 'fallujah', 'isis'], ['alert', 'click_here', 'add', 'remove', 'disable'], ['turkey', 'state', 'government', 'turkish', 'party'], ['-attack-', 'airport', 'turkey', 'istanbul', '-killed-'], ['russia', 'iran', 'russian', 'state', 'moscow'], ['military', 'u.s.', 'group', '-iraq-', 'official'], ['eu', 'ali', 'vote', 'britain', 'leave'], ['service', 'mr', 'community', 'nsw', 'significant'], ['june', 'report', 'news', 'file', 'publish'], ['company', 'market', 'million', 'business', 'percent'], ['trump', 'obama', 'u.s.', 'president', 'united_states'], ['israel', 'israeli', 'palestinian', 'arab', 'air'], ['family', 'child', 'work', 'day', 'school'], ['cox', 'jo', 'mp', 'labour', 'time'], ['syrian', 'area', 'aleppo', 'government', '-forces-'], ['mateen', 'orlando', 'shooting', 'nightclub', 'gay'], ['refugee', 'child', 'english', 'million', 'camp'], ['police', 'man', 'group', 'isis', 'suspect'], ['world', 'make', 'good', 'reply_alert', 'time'], ['share', 'al', 'window_click', 'dubai', 'yemen'], ['migrant', 'europe', 'germany', 'turkey', 'eu'], ['june', 'reuters', 'image', 'getty', 'ramadan'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['work', 'international', 'include', 'support', 'council']]



In [71]:

    
%%time


from gensim import corpora
corpus_text = MyDocuments('dynamic_nmf/data/windowbin/slices/tokenized_window_all.gz') 

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(corpus_text)
dictionary.save_as_text('data/eos/dictionary_EOS_all.dict')









    



CPU times: user 5min 29s, sys: 7.97 s, total: 5min 37s
Wall time: 5min 37s



In [72]:

    
%%time


# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in corpus_text]
corpora.MmCorpus.serialize('data/eos/corpus_EOS_all.mm', corpus)









    



CPU times: user 7min 54s, sys: 8.06 s, total: 8min 2s
Wall time: 8min 4s



In [ ]:



In [13]:

    
# print(get_nmf_dynamic_topics(42, None, 20))  
df = pd.DataFrame(get_nmf_dynamic_topics(42, None, 20))
df.to_csv('experiment/dynamic_nmf_42.csv')



In [ ]:

	window	mallet_TC_W2V	mallet_TC_W2V_k	lda_TC_W2V	lda_TC_W2V_k	lsi_TC_W2V	lsi_TC_W2V_k
window
2012_01	2012_01	0.373322	22	0.367360	12	0.344060	18
2012_02	2012_02	0.371982	26	0.356412	14	0.356953	12
2012_03	2012_03	0.376637	28	0.358276	20	0.354839	10
2012_04	2012_04	0.372540	24	0.356079	10	0.344663	16
2012_05	2012_05	0.385459	16	0.369615	10	0.385599	10

	Unnamed: 0	window	window.1	mallet_TC_W2V	mallet_TC_W2V_k	lda_TC_W2V	lda_TC_W2V_k	lsi_TC_W2V	lsi_TC_W2V_k	NMF_TC_W2V	NMF_TC_W2V_k	NMF_Unify	NMF_Unify_k
62	62	2017_03	2017_03	0.368832	22	0.344859	28	0.336967	24	0.375299	28	0.690587	22
63	63	2017_04	2017_04	0.366006	12	0.341303	20	0.326046	12	0.374008	24	0.650894	20
64	64	2017_05	2017_05	0.360582	22	0.333149	28	0.319035	12	0.357726	30	0.677018	14
65	65	2017_06	2017_06	0.381813	28	0.335946	12	0.328623	22	0.373138	18	0.725945	10
66	66	2017_07	2017_07	0.349201	22	0.339650	28	0.322636	30	0.347636	22	0.694142	16

	NMF_dynamic_tc-w2v_TC_W2V
10	0.435583
12	0.418782
14	0.421248
16	0.411221
18	0.411648
20	0.412405
22	0.411058
24	0.407517
26	0.404814
28	0.406280
30	0.407807
32	0.407073
34	0.411099
36	0.411764
38	0.411936
40	0.412052
42	0.412661
44	0.411639
46	0.408463
48	0.407668
50	0.411858
52	0.407298
54	0.405697
56	0.408669
58	0.405392
60	0.404871
62	0.403735
64	0.401636
66	0.399175
68	0.403623
70	0.403461
72	0.403444
74	0.403587
76	0.400951
78	0.402463
80	0.399944
82	0.398895
84	0.399673
86	0.400757
88	0.403034
90	0.400252
92	0.398989
94	0.399321
96	0.400990
98	0.400001
100	0.398915

	NMF_dynamic_tc-w2v_TC_W2V
count	46.000000
mean	0.406725
std	0.006955
min	0.398895
25%	0.401152
50%	0.405544
75%	0.411190
max	0.435583

	NMF_dynamic_tc-w2v_TC_W2V	NMF_dynamic_unify_TC_W2V
10	0.435583	0.596857
12	0.418782	0.572438
14	0.421248	0.568953
16	0.411221	0.547078
18	0.411648	0.551371
20	0.412405	0.551863
22	0.411058	0.558455
24	0.407517	0.557438
26	0.404814	0.557674
28	0.406280	0.566201
30	0.407807	0.562223
32	0.407073	0.571374
34	0.411099	0.577814
36	0.411764	0.580877
38	0.411936	0.588597
40	0.412052	0.601010
42	0.412661	0.608368
44	0.411639	0.606562
46	0.408463	0.593065
48	0.407668	0.586566
50	0.411858	0.602374
52	0.407298	0.597079
54	0.405697	0.587935
56	0.408669	0.595201
58	0.405392	0.583819
60	0.404871	0.582515
62	0.403735	0.585936
64	0.401636	0.584968
66	0.399175	0.579087
68	0.403623	0.586342
70	0.403461	0.585421
72	0.403444	0.580324
74	0.403587	0.580367
76	0.400951	0.576101
78	0.402463	0.574789
80	0.399944	0.577060
82	0.398895	0.570275
84	0.399673	0.569161
86	0.400757	0.574398
88	0.403034	0.574579
90	0.400252	0.569497
92	0.398989	0.571572
94	0.399321	0.574143
96	0.400990	0.573580
98	0.400001	0.572099
100	0.398915	0.571130