In [1]:
# LDA analysis Done

In [2]:
import glob
from datetime import datetime
import logging as log
import gensim
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim
from gensim.models import CoherenceModel
from sklearn.externals import joblib
import gzip
from multiprocessing import Pool
import time
import numpy as np


import pandas as pd
from collections import OrderedDict
from datetime import date
from IPython.display import display, HTML
log.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=log.INFO)

import matplotlib
matplotlib.style.use('ggplot')
%matplotlib notebook

In [3]:
class ModelSimilarity:

    # Uses a model (e.g. Word2Vec model) to calculate the similarity between two terms.
    
    def __init__(self, model):
        self.model = model

    def similarity(self, ranking_i, ranking_j):
        sim = 0.0
        pairs = 0
        for term_i in ranking_i:
            for term_j in ranking_j:
                try:
                    sim += self.model.similarity(term_i, term_j)
                    pairs += 1
                except:
                    # print "Failed pair (%s,%s)" % (term_i,term_j)
                    pass
        if pairs == 0:
            return 0.0
        return sim / pairs

In [4]:
class WithinTopicMeasure:
 
    # Measures within-topic coherence for a topic model, based on a set of term rankings.

    def __init__(self, metric):
        self.metric = metric

    def evaluate_ranking(self, term_ranking):
        return self.metric.similarity(term_ranking, term_ranking)

    def evaluate_rankings(self, term_rankings):
        scores = []
        overall = 0.0
        for topic_index in range(len(term_rankings)):
            score = self.evaluate_ranking(term_rankings[topic_index])
            scores.append(score)
            overall += score
        overall /= len(term_rankings)
        return overall

In [5]:
# To get the topic words from the model
def get_topics(ldamodel, num_topics, num_words):    
    top_words = [[word for word, _ in ldamodel.show_topic(topicno, topn=num_words)] 
                 for topicno in range(ldamodel.num_topics)]
#     print(top_words)
    return top_words

In [6]:
class MyDocuments(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        with gzip.open(self.dirname, 'rb') as f:
            for line in f:
                yield line.decode().split('\t')[1].split()

In [7]:
def execute_coherence(topic_model_path):
    
    
    model_list = glob.glob(topic_model_path)
    model_list.sort()

    model_path = 'data/eos/word2vec_model_all.model'
    log.info("Loading Word2Vec model from %s ..." % model_path)
    model = gensim.models.Word2Vec.load(model_path)
    
    metric = ModelSimilarity(model)
    validation_measure = WithinTopicMeasure(metric)

    indices =[]
    for fname in model_list:
        window = fname[-16:-9]
        indices.append(window)
        
    indices = list(set(indices))
    indices.sort()
#     print(indices)
    
    coherence_list = {}
    for window in indices:
        start_time = time.time()
        temp = []
        for fname in model_list:
            if (window == fname[-16:-9]):
                ldamodel = joblib.load(fname)
                topic_num = int(fname[-6:-4])
                truncated_term_rankings = get_topics(ldamodel, topic_num, 20)
                coherence_score = validation_measure.evaluate_rankings(truncated_term_rankings)
                log.info("Model coherence window=%s (k=%d) = %.4f" % (window, topic_num, coherence_score))
                temp.append ([topic_num, coherence_score])
        coherence_list[window] = temp
        
        elapsed_time = time.time() - start_time
        print('took %s to process %s' % (elapsed_time, topic_model_path) )
    

    return coherence_list, indices

In [8]:
def execute_coherence_gensim(topic_model_path):
    
    dictionary_filepath = 'data/eos/dic_bow/bigram_dict_%s.dict' 
    bow_filepath = 'data/eos/dic_bow/bigram_bow_corpus_%s.mm'    
    
    model_list = glob.glob(topic_model_path)
    model_list.sort()

    indices =[]
    for fname in model_list:
        window = fname[-16:-9]
        indices.append(window)
        
    indices = list(set(indices))
    indices.sort()
    print(indices)
    
    coherence_list = {}
    for window in indices:
        temp = []
        # Load to memory
        corpus = gensim.corpora.MmCorpus(bow_filepath % window)
        dictionary = gensim.corpora.Dictionary.load(dictionary_filepath % window)    
        corpus_text = MyDocuments('dynamic_nmf/data/windowbin/slices/tokenized_window_%s.gz' % window)   
        for fname in model_list:
            if (window == fname[-16:-9]):
                ldamodel = joblib.load(fname)
                topic_num = int(fname[-6:-4])
                cm = CoherenceModel(dictionary=dictionary, corpus=corpus, 
                                    texts=corpus_text, topics=get_topics(ldamodel, topic_num, 20), 
                                    coherence='c_v')
                #     print(cm)
                coherence_score = cm.get_coherence()                
                log.info("Model coherence window=%s (k=%d) = %.4f" % (window, topic_num, coherence_score))
                temp.append ([topic_num, coherence_score])
        coherence_list[window] = temp

        
    return coherence_list, indices

In [24]:
model_list = {
              'lda' : 'data/eos/lda/LDAmodel_*.pkl', 
              'lsi' : 'data/eos/lsi/LSImodel_*.pkl', 
              'mallet' : 'data/eos/mallet/Malletmodel_*.pkl'
             }

In [10]:
def sort_coherence(coherence_dict):
    
    y_coherence = []
    y_k = []
    
    for key in sorted(coherence_dict):
        coherence_dict[key].sort(key=lambda x: x[1], reverse=True)
#         print ("%s : %s" % (key, coherence_dict[key]))
        y_k.append(coherence_dict[key][0][0])
        y_coherence.append(coherence_dict[key][0][1])

#     print(y_coherence)
#     print(y_k)
    return y_coherence, y_k

In [11]:
%%time



df = pd.DataFrame()
# Run TC-W2V coherense score
for name, path in model_list.items():
    print (name, path) 
    coherence_list, indices = execute_coherence(path)
    y_coherence, y_k = sort_coherence(coherence_list)
    df['window'] = indices
    df['%s_TC_W2V' % name] = y_coherence
    df['%s_TC_W2V_k' % name] = y_k
    

df.set_index(df['window'], inplace=True)
display(df.head())
df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')


mallet data/eos/mallet/Malletmodel_*.pkl
took 244.52745938301086 to process data/eos/mallet/Malletmodel_*.pkl
lda data/eos/lda/LDAmodel_*.pkl
took 294.26973938941956 to process data/eos/lda/LDAmodel_*.pkl
lsi data/eos/lsi/LSImodel_*.pkl
took 251.3368363380432 to process data/eos/lsi/LSImodel_*.pkl
window mallet_TC_W2V mallet_TC_W2V_k lda_TC_W2V lda_TC_W2V_k lsi_TC_W2V lsi_TC_W2V_k
window
2012_01 2012_01 0.373322 22 0.367360 12 0.344060 18
2012_02 2012_02 0.371982 26 0.356412 14 0.356953 12
2012_03 2012_03 0.376637 28 0.358276 20 0.354839 10
2012_04 2012_04 0.372540 24 0.356079 10 0.344663 16
2012_05 2012_05 0.385459 16 0.369615 10 0.385599 10
CPU times: user 11min 24s, sys: 6.37 s, total: 11min 30s
Wall time: 13min 11s

In [25]:
%%time


df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')
display(df.tail())

# Run Gensim on LDA/LSA/Mallet
for name, path in model_list.items() :
    print (name, path)    
    coherence_list, indices = execute_coherence_gensim(path)
    y_coherence, y_k = sort_coherence(coherence_list)
    df['%s_Unify' % name] = y_coherence
    df['%s_Unify_k' % name] = y_k
    

display(df.head())
display(df.tail())
df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')


Unnamed: 0 window window.1 mallet_TC_W2V mallet_TC_W2V_k lda_TC_W2V lda_TC_W2V_k lsi_TC_W2V lsi_TC_W2V_k NMF_TC_W2V NMF_TC_W2V_k NMF_Unify NMF_Unify_k
62 62 2017_03 2017_03 0.368832 22 0.344859 28 0.336967 24 0.375299 28 0.690587 22
63 63 2017_04 2017_04 0.366006 12 0.341303 20 0.326046 12 0.374008 24 0.650894 20
64 64 2017_05 2017_05 0.360582 22 0.333149 28 0.319035 12 0.357726 30 0.677018 14
65 65 2017_06 2017_06 0.381813 28 0.335946 12 0.328623 22 0.373138 18 0.725945 10
66 66 2017_07 2017_07 0.349201 22 0.339650 28 0.322636 30 0.347636 22 0.694142 16
mallet data/eos/mallet/Malletmodel_*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
lda data/eos/lda/LDAmodel_*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
lsi data/eos/lsi/LSImodel_*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
/usr/local/lib/python3.5/dist-packages/numpy/core/fromnumeric.py:2909: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/usr/local/lib/python3.5/dist-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
Unnamed: 0 window window.1 mallet_TC_W2V mallet_TC_W2V_k lda_TC_W2V lda_TC_W2V_k lsi_TC_W2V lsi_TC_W2V_k NMF_TC_W2V NMF_TC_W2V_k NMF_Unify NMF_Unify_k mallet_Unify mallet_Unify_k lda_Unify lda_Unify_k lsi_Unify lsi_Unify_k
0 0 2012_01 2012_01 0.373322 22 0.367360 12 0.344060 18 0.371245 22 0.686599 16 0.570021 14 0.371903 18 0.513629 12
1 1 2012_02 2012_02 0.371982 26 0.356412 14 0.356953 12 0.360304 24 0.658763 12 0.571140 26 0.448183 22 0.525276 10
2 2 2012_03 2012_03 0.376637 28 0.358276 20 0.354839 10 0.374951 24 0.662382 24 0.573819 14 0.350011 30 0.481642 12
3 3 2012_04 2012_04 0.372540 24 0.356079 10 0.344663 16 0.369469 26 0.627686 20 0.582972 20 0.392451 26 0.557337 12
4 4 2012_05 2012_05 0.385459 16 0.369615 10 0.385599 10 0.381602 18 0.615484 30 0.586302 16 0.356175 20 0.461101 12
Unnamed: 0 window window.1 mallet_TC_W2V mallet_TC_W2V_k lda_TC_W2V lda_TC_W2V_k lsi_TC_W2V lsi_TC_W2V_k NMF_TC_W2V NMF_TC_W2V_k NMF_Unify NMF_Unify_k mallet_Unify mallet_Unify_k lda_Unify lda_Unify_k lsi_Unify lsi_Unify_k
62 62 2017_03 2017_03 0.368832 22 0.344859 28 0.336967 24 0.375299 28 0.690587 22 0.577780 12 0.369719 26 0.376879 18
63 63 2017_04 2017_04 0.366006 12 0.341303 20 0.326046 12 0.374008 24 0.650894 20 0.583312 20 0.468642 14 0.385933 10
64 64 2017_05 2017_05 0.360582 22 0.333149 28 0.319035 12 0.357726 30 0.677018 14 0.583450 28 0.320494 26 0.389233 10
65 65 2017_06 2017_06 0.381813 28 0.335946 12 0.328623 22 0.373138 18 0.725945 10 0.607344 12 0.301083 24 0.342559 30
66 66 2017_07 2017_07 0.349201 22 0.339650 28 0.322636 30 0.347636 22 0.694142 16 0.582325 22 0.335810 28 0.452428 26
CPU times: user 2h 17min 30s, sys: 4min 4s, total: 2h 21min 35s
Wall time: 16h 8min 9s

In [ ]:
# score_df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')


# display(score_df.tail())
# df_nmf = pd.read_csv('dynamic_nmf/data/windowbin/csv/window_topic_coherence_results.csv') 


# # df = df.drop(df.columns[0], axis=1)

# display(df_nmf.ix[:, 1:-1])

# score_list =  {}
# for column in df_nmf.ix[:, 1:-1]:
#     scoretmp = []
# #     print(column)
# #     print(df[column][0].strip("{}").split(', '))
#     for score in (df_nmf[column][0].strip("{}").split(', ')):
#         scoretmp.append( [int(score.split(': ')[0]), float(score.split(': ')[1])] )

# #     sorted(scoretmp)
#     score_list[column[-7:]] = scoretmp

    
# y_coherence, y_k = sort_coherence(score_list)  

# print(y_coherence)
# score_df['nmf_TC_W2V'] = y_coherence
# score_df['nmf_TC_W2V_k'] = y_k
# display(score_df.tail())

# df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')

In [12]:
import os.path


def get_nmf_topics(window, k, dictionary, topn=20):
    
    topic_list = []
    topic_df = pd.read_pickle('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))

    for c in topic_df.ix[:,1:].columns:
        # filter out any token not in 
        if dictionary is not None:
            topic = [x for x in topic_df[c].tolist() if x in dictionary.token2id.keys()]
        else:
            topic = topic_df[c].tolist() 
        topic_list.append(topic[:topn])
        
#     print(topic_list)
    return topic_list

In [20]:
def execute_coherence_gensim_nmf(topic_model_path, actual_kmin=10, actual_kmax=30):
    
    dictionary_filepath = 'data/eos/dic_bow/bigram_dict_%s.dict' 
    bow_filepath = 'data/eos/dic_bow/bigram_bow_corpus_%s.mm'    
    
    model_list = glob.glob(topic_model_path)
    model_list.sort()

    indices = []
    for fname in model_list:
        indices.append(fname[-28:-21])
        
    indices = list(set(indices))
    indices.sort()
    print(indices)

    coherence_list = {}   
    for window in indices:
        temp = []
        # Load to memory
        corpus = gensim.corpora.MmCorpus(bow_filepath % window)
        dictionary = gensim.corpora.Dictionary.load(dictionary_filepath % window)  
        corpus_text = MyDocuments('dynamic_nmf/data/windowbin/slices/tokenized_window_%s.gz' % window)  
        
        for k in range(actual_kmin, actual_kmax + 2, 2):
            #             get_nmf_topics(window, k, dictionary, 10)
            if(not os.path.isfile('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))):
                print('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))
                coherence_score = np.nan
            else:         
                cm = CoherenceModel(dictionary=dictionary, corpus=corpus, 
                                    texts=corpus_text, topics=get_nmf_topics(window, k, dictionary, 20), 
                                    coherence='c_v')

                coherence_score = cm.get_coherence()                
            log.info("Model coherence window=%s (k=%d) = %.4f" % (window, k, coherence_score))
            temp.append ([k, coherence_score])
            
        coherence_list[window] = temp
    

    return coherence_list, indices

In [16]:
def execute_coherence_w2v_NMF(topic_model_path,actual_kmin=10, actual_kmax=30):
    start_time = time.time()
    
    model_list = glob.glob(topic_model_path)
    model_list.sort()

    model_path = 'data/eos/word2vec_model_all.model'
    log.info("Loading Word2Vec model from %s ..." % model_path)
    model = gensim.models.Word2Vec.load(model_path)
    
    metric = ModelSimilarity(model)
    validation_measure = WithinTopicMeasure(metric)

    indices = []
    for fname in model_list:
        indices.append(fname[-28:-21])
        
    indices = list(set(indices))
    indices.sort()
    print(indices)
    
    coherence_list = {}   
    for window in indices:
        temp = []

        for k in range(actual_kmin, actual_kmax + 2, 2):
            #             get_nmf_topics(window, k, dictionary, 10)
            if(not os.path.isfile('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))):
                print('dynamic_nmf/data/windowbin/result/window.df/window_%s_k%s.pkl' % (window, k))
                coherence_score = np.nan
            else:
                coherence_score = validation_measure.evaluate_rankings(
                    get_nmf_topics(window, k, None, 20))
                
              
            log.info("Model coherence window=%s (k=%d) = %.4f" % (window, k, coherence_score))
            temp.append ([k, coherence_score])
            
        coherence_list[window] = temp
 
        
    elapsed_time = time.time() - start_time
    print('took %s to process %s' % (elapsed_time, topic_model_path) )
    return coherence_list, indices

In [21]:
%%time


model_list = {'NMF' : 'dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl'}

df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')
display(df.tail())

# Calculate NMF modele result -
for name, path in model_list.items():
    print (name, path)    
    coherence_list, indices = execute_coherence_gensim_nmf(path)
    y_coherence, y_k = sort_coherence(coherence_list)
    df['%s_Unify' % name] = y_coherence
    df['%s_Unify_k' % name] = y_k

display(df)    
df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')


window window.1 mallet_TC_W2V mallet_TC_W2V_k lda_TC_W2V lda_TC_W2V_k lsi_TC_W2V lsi_TC_W2V_k NMF_TC_W2V NMF_TC_W2V_k
62 2017_03 2017_03 0.368832 22 0.344859 28 0.336967 24 0.375299 28
63 2017_04 2017_04 0.366006 12 0.341303 20 0.326046 12 0.374008 24
64 2017_05 2017_05 0.360582 22 0.333149 28 0.319035 12 0.357726 30
65 2017_06 2017_06 0.381813 28 0.335946 12 0.328623 22 0.373138 18
66 2017_07 2017_07 0.349201 22 0.339650 28 0.322636 30 0.347636 22
NMF dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k10.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k12.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k14.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k16.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k18.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k20.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k22.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k24.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k26.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k30.pkl
/usr/local/lib/python3.5/dist-packages/numpy/core/fromnumeric.py:2909: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/usr/local/lib/python3.5/dist-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
dynamic_nmf/data/windowbin/result/window.df/window_2015_02_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_02_k30.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k16.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k18.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k20.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k22.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k24.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k26.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k30.pkl
window window.1 mallet_TC_W2V mallet_TC_W2V_k lda_TC_W2V lda_TC_W2V_k lsi_TC_W2V lsi_TC_W2V_k NMF_TC_W2V NMF_TC_W2V_k NMF_Unify NMF_Unify_k
0 2012_01 2012_01 0.373322 22 0.367360 12 0.344060 18 0.371245 22 0.686599 16
1 2012_02 2012_02 0.371982 26 0.356412 14 0.356953 12 0.360304 24 0.658763 12
2 2012_03 2012_03 0.376637 28 0.358276 20 0.354839 10 0.374951 24 0.662382 24
3 2012_04 2012_04 0.372540 24 0.356079 10 0.344663 16 0.369469 26 0.627686 20
4 2012_05 2012_05 0.385459 16 0.369615 10 0.385599 10 0.381602 18 0.615484 30
5 2012_06 2012_06 0.376252 22 0.366449 10 0.365106 10 0.385673 30 0.630452 22
6 2012_07 2012_07 0.376602 20 0.365431 20 0.356745 16 0.383768 18 0.593058 18
7 2012_08 2012_08 0.385489 10 0.372908 30 0.362992 10 0.386008 14 0.603607 28
8 2012_09 2012_09 0.369887 22 0.369904 10 0.338247 10 0.364662 26 0.612656 24
9 2012_10 2012_10 0.385737 20 0.350987 18 0.353095 20 0.369448 22 0.598698 10
10 2012_11 2012_11 0.387742 14 0.369008 20 0.359603 10 0.361450 10 0.569218 30
11 2012_12 2012_12 0.367923 12 0.366609 10 0.367527 10 0.394530 10 0.622516 14
12 2013_01 2013_01 0.383147 20 0.369593 10 0.378486 12 0.392638 12 0.756288 12
13 2013_02 2013_02 0.386742 10 0.382875 18 0.388220 10 0.403955 12 0.698672 30
14 2013_03 2013_03 0.376408 28 0.372112 22 0.369501 10 0.380131 16 0.690384 16
15 2013_04 2013_04 0.379029 24 0.379843 20 0.365484 18 0.395827 24 0.695751 30
16 2013_05 2013_05 0.380623 20 0.383695 20 0.374426 10 0.391170 10 0.732174 10
17 2013_06 2013_06 0.379687 30 0.385105 12 0.359631 10 0.395412 28 0.748097 12
18 2013_07 2013_07 0.377045 12 0.366235 12 0.379858 12 0.393422 16 0.716875 12
19 2013_08 2013_08 0.372550 18 0.380078 12 0.367726 10 0.382074 12 0.730104 10
20 2013_09 2013_09 0.370974 18 0.355240 10 0.341069 10 0.371867 20 0.702392 14
21 2013_10 2013_10 0.418281 14 0.350561 26 0.374440 14 0.385737 24 0.682867 12
22 2013_11 2013_11 0.372005 24 0.340547 20 0.357003 30 0.386085 28 0.764088 12
23 2013_12 2013_12 0.377475 28 0.378226 18 0.367346 10 0.396046 10 0.711426 24
24 2014_01 2014_01 0.378321 28 0.375230 10 0.365671 10 0.404594 26 0.668364 10
25 2014_02 2014_02 0.374724 26 0.380598 18 0.378763 10 0.397362 18 0.641937 28
26 2014_03 2014_03 0.387159 14 0.392522 14 0.380349 10 0.403873 10 0.700665 22
27 2014_04 2014_04 0.378734 30 0.363843 16 0.362907 10 0.391023 12 0.692710 30
28 2014_05 2014_05 0.382013 18 0.354075 12 0.341620 30 0.391467 26 0.673758 30
29 2014_06 2014_06 0.389454 28 0.403460 20 0.397063 10 0.405224 22 0.632361 18
... ... ... ... ... ... ... ... ... ... ... ... ...
37 2015_02 2015_02 0.343132 28 0.374119 18 0.369051 10 0.370054 10 NaN 10
38 2015_03 2015_03 0.358734 14 0.349748 10 0.368912 10 0.351930 24 0.694756 14
39 2015_04 2015_04 0.345661 16 0.362625 26 0.334458 10 0.367630 10 0.553187 10
40 2015_05 2015_05 0.357787 14 0.359777 24 0.344511 18 0.355867 12 0.650168 16
41 2015_06 2015_06 0.358147 10 0.349219 28 0.337925 12 0.358493 14 0.584301 10
42 2015_07 2015_07 0.361747 10 0.385091 10 0.338774 10 0.364008 10 0.558097 18
43 2015_08 2015_08 0.351548 10 0.354375 10 0.322496 10 0.343917 28 0.635225 10
44 2015_09 2015_09 0.358648 10 0.359002 30 0.359048 10 0.358920 10 0.579560 18
45 2015_10 2015_10 0.390945 26 0.356480 22 0.377019 14 0.391810 14 0.684972 12
46 2015_11 2015_11 0.381952 26 0.331720 30 0.355430 10 0.385261 26 0.672617 24
47 2015_12 2015_12 0.377789 30 0.338472 24 0.356513 14 0.385321 30 0.703951 26
48 2016_01 2016_01 0.371399 24 0.318332 24 0.353870 16 0.384108 24 0.756247 28
49 2016_02 2016_02 0.377769 26 0.321813 20 0.364931 18 0.403619 20 0.706451 24
50 2016_03 2016_03 0.372038 28 0.318642 24 0.372687 20 0.381604 22 0.711577 20
51 2016_04 2016_04 0.370497 20 0.313523 20 0.345647 18 0.387416 22 0.698371 30
52 2016_05 2016_05 0.378633 30 0.324715 14 0.386714 10 0.390143 22 0.661451 28
53 2016_06 2016_06 0.369599 24 0.322562 30 0.341940 28 0.379794 22 0.658772 18
54 2016_07 2016_07 0.372861 24 0.316271 30 0.334943 30 0.363780 30 0.707597 18
55 2016_08 2016_08 0.370081 26 0.317142 12 0.341775 20 0.377458 26 0.653474 26
56 2016_09 2016_09 0.381448 26 0.320212 10 0.343436 18 0.375838 30 0.650172 22
57 2016_10 2016_10 0.377324 16 0.373840 20 0.356631 20 0.382469 16 0.631388 22
58 2016_11 2016_11 0.381070 12 0.350985 28 0.372430 14 0.389218 16 0.665048 28
59 2016_12 2016_12 0.365415 30 0.348647 10 0.336426 16 0.371006 30 0.715129 14
60 2017_01 2017_01 0.363971 22 0.339760 18 0.332809 20 0.362297 18 0.716772 26
61 2017_02 2017_02 0.360129 18 0.330765 22 0.324862 14 0.368217 14 0.714027 14
62 2017_03 2017_03 0.368832 22 0.344859 28 0.336967 24 0.375299 28 0.690587 22
63 2017_04 2017_04 0.366006 12 0.341303 20 0.326046 12 0.374008 24 0.650894 20
64 2017_05 2017_05 0.360582 22 0.333149 28 0.319035 12 0.357726 30 0.677018 14
65 2017_06 2017_06 0.381813 28 0.335946 12 0.328623 22 0.373138 18 0.725945 10
66 2017_07 2017_07 0.349201 22 0.339650 28 0.322636 30 0.347636 22 0.694142 16

67 rows × 12 columns

CPU times: user 45min 5s, sys: 1min 21s, total: 46min 27s
Wall time: 6h 10min 55s

In [15]:
%%time


model_list = {'NMF' : 'dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl'}

# TC-W2V NMF models 
for name, path in model_list.items():
    print (name, path)    
    coherence_list, indices = execute_coherence_w2v_NMF(path)
    y_coherence, y_k = sort_coherence(coherence_list)
    df['%s_TC_W2V' % name] = y_coherence
    df['%s_TC_W2V_k' % name] = y_k
#     break
    
display(df)    
df.to_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')


NMF dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl
['2012_01', '2012_02', '2012_03', '2012_04', '2012_05', '2012_06', '2012_07', '2012_08', '2012_09', '2012_10', '2012_11', '2012_12', '2013_01', '2013_02', '2013_03', '2013_04', '2013_05', '2013_06', '2013_07', '2013_08', '2013_09', '2013_10', '2013_11', '2013_12', '2014_01', '2014_02', '2014_03', '2014_04', '2014_05', '2014_06', '2014_07', '2014_08', '2014_09', '2014_10', '2014_11', '2014_12', '2015_01', '2015_02', '2015_03', '2015_04', '2015_05', '2015_06', '2015_07', '2015_08', '2015_09', '2015_10', '2015_11', '2015_12', '2016_01', '2016_02', '2016_03', '2016_04', '2016_05', '2016_06', '2016_07', '2016_08', '2016_09', '2016_10', '2016_11', '2016_12', '2017_01', '2017_02', '2017_03', '2017_04', '2017_05', '2017_06', '2017_07']
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k10.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k12.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k14.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k16.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k18.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k20.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k22.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k24.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k26.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_01_k30.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_02_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_02_k30.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k16.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k18.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k20.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k22.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k24.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k26.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k28.pkl
dynamic_nmf/data/windowbin/result/window.df/window_2015_04_k30.pkl
took 45.993279695510864 to process dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_20*.pkl
window mallet_TC_W2V mallet_TC_W2V_k lda_TC_W2V lda_TC_W2V_k lsi_TC_W2V lsi_TC_W2V_k NMF_TC_W2V NMF_TC_W2V_k
window
2012_01 2012_01 0.373322 22 0.367360 12 0.344060 18 0.371245 22
2012_02 2012_02 0.371982 26 0.356412 14 0.356953 12 0.360304 24
2012_03 2012_03 0.376637 28 0.358276 20 0.354839 10 0.374951 24
2012_04 2012_04 0.372540 24 0.356079 10 0.344663 16 0.369469 26
2012_05 2012_05 0.385459 16 0.369615 10 0.385599 10 0.381602 18
2012_06 2012_06 0.376252 22 0.366449 10 0.365106 10 0.385673 30
2012_07 2012_07 0.376602 20 0.365431 20 0.356745 16 0.383768 18
2012_08 2012_08 0.385489 10 0.372908 30 0.362992 10 0.386008 14
2012_09 2012_09 0.369887 22 0.369904 10 0.338247 10 0.364662 26
2012_10 2012_10 0.385737 20 0.350987 18 0.353095 20 0.369448 22
2012_11 2012_11 0.387742 14 0.369008 20 0.359603 10 0.361450 10
2012_12 2012_12 0.367923 12 0.366609 10 0.367527 10 0.394530 10
2013_01 2013_01 0.383147 20 0.369593 10 0.378486 12 0.392638 12
2013_02 2013_02 0.386742 10 0.382875 18 0.388220 10 0.403955 12
2013_03 2013_03 0.376408 28 0.372112 22 0.369501 10 0.380131 16
2013_04 2013_04 0.379029 24 0.379843 20 0.365484 18 0.395827 24
2013_05 2013_05 0.380623 20 0.383695 20 0.374426 10 0.391170 10
2013_06 2013_06 0.379687 30 0.385105 12 0.359631 10 0.395412 28
2013_07 2013_07 0.377045 12 0.366235 12 0.379858 12 0.393422 16
2013_08 2013_08 0.372550 18 0.380078 12 0.367726 10 0.382074 12
2013_09 2013_09 0.370974 18 0.355240 10 0.341069 10 0.371867 20
2013_10 2013_10 0.418281 14 0.350561 26 0.374440 14 0.385737 24
2013_11 2013_11 0.372005 24 0.340547 20 0.357003 30 0.386085 28
2013_12 2013_12 0.377475 28 0.378226 18 0.367346 10 0.396046 10
2014_01 2014_01 0.378321 28 0.375230 10 0.365671 10 0.404594 26
2014_02 2014_02 0.374724 26 0.380598 18 0.378763 10 0.397362 18
2014_03 2014_03 0.387159 14 0.392522 14 0.380349 10 0.403873 10
2014_04 2014_04 0.378734 30 0.363843 16 0.362907 10 0.391023 12
2014_05 2014_05 0.382013 18 0.354075 12 0.341620 30 0.391467 26
2014_06 2014_06 0.389454 28 0.403460 20 0.397063 10 0.405224 22
... ... ... ... ... ... ... ... ... ...
2015_02 2015_02 0.343132 28 0.374119 18 0.369051 10 0.370054 10
2015_03 2015_03 0.358734 14 0.349748 10 0.368912 10 0.351930 24
2015_04 2015_04 0.345661 16 0.362625 26 0.334458 10 0.367630 10
2015_05 2015_05 0.357787 14 0.359777 24 0.344511 18 0.355867 12
2015_06 2015_06 0.358147 10 0.349219 28 0.337925 12 0.358493 14
2015_07 2015_07 0.361747 10 0.385091 10 0.338774 10 0.364008 10
2015_08 2015_08 0.351548 10 0.354375 10 0.322496 10 0.343917 28
2015_09 2015_09 0.358648 10 0.359002 30 0.359048 10 0.358920 10
2015_10 2015_10 0.390945 26 0.356480 22 0.377019 14 0.391810 14
2015_11 2015_11 0.381952 26 0.331720 30 0.355430 10 0.385261 26
2015_12 2015_12 0.377789 30 0.338472 24 0.356513 14 0.385321 30
2016_01 2016_01 0.371399 24 0.318332 24 0.353870 16 0.384108 24
2016_02 2016_02 0.377769 26 0.321813 20 0.364931 18 0.403619 20
2016_03 2016_03 0.372038 28 0.318642 24 0.372687 20 0.381604 22
2016_04 2016_04 0.370497 20 0.313523 20 0.345647 18 0.387416 22
2016_05 2016_05 0.378633 30 0.324715 14 0.386714 10 0.390143 22
2016_06 2016_06 0.369599 24 0.322562 30 0.341940 28 0.379794 22
2016_07 2016_07 0.372861 24 0.316271 30 0.334943 30 0.363780 30
2016_08 2016_08 0.370081 26 0.317142 12 0.341775 20 0.377458 26
2016_09 2016_09 0.381448 26 0.320212 10 0.343436 18 0.375838 30
2016_10 2016_10 0.377324 16 0.373840 20 0.356631 20 0.382469 16
2016_11 2016_11 0.381070 12 0.350985 28 0.372430 14 0.389218 16
2016_12 2016_12 0.365415 30 0.348647 10 0.336426 16 0.371006 30
2017_01 2017_01 0.363971 22 0.339760 18 0.332809 20 0.362297 18
2017_02 2017_02 0.360129 18 0.330765 22 0.324862 14 0.368217 14
2017_03 2017_03 0.368832 22 0.344859 28 0.336967 24 0.375299 28
2017_04 2017_04 0.366006 12 0.341303 20 0.326046 12 0.374008 24
2017_05 2017_05 0.360582 22 0.333149 28 0.319035 12 0.357726 30
2017_06 2017_06 0.381813 28 0.335946 12 0.328623 22 0.373138 18
2017_07 2017_07 0.349201 22 0.339650 28 0.322636 30 0.347636 22

67 rows × 9 columns

CPU times: user 46.1 s, sys: 156 ms, total: 46.3 s
Wall time: 46.3 s

In [ ]:


In [164]:
def graph_coherence_score(df, columns, title):
    
    ax = df[columns].plot()
    plt.xticks(rotation=90, fontsize=4)
    plt.xlabel('time windows')
    plt.ylabel('score')
    plt.title('%s coherence' % title)
    plt.legend(loc='best')
    plt.show()
    plt.savefig('data/eos/graphs/%s_%s_coherence_plot.png' % (title, 'all'), dpi=800)    

def graph_k(df, columns, title):
    ax = df[columns].plot(kind='bar', stacked=True)
    plt.xticks(rotation=90, fontsize=4)
    plt.xlabel('time windows')
    plt.ylabel('k')
    title= '%s top k' % title
    plt.title(title)
    plt.legend(loc='best')
    plt.show()
    plt.savefig('data/eos/graphs/%s_%s_topK_plot.png' % (title, 'all'), dpi=800)

In [167]:
%%time


df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')
print (df.columns.tolist())
df.set_index(df['window'], inplace=True)
df['NMF_TC_W2V'].fillna(method='bfill', inplace=True)
df['NMF_Unify'].fillna(method='bfill', inplace=True)


df['NMF_TC_W2V'].replace(0, df['NMF_TC_W2V'].mean(), inplace=True)
df['NMF_Unify'].replace(0, df['NMF_Unify'].mean(), inplace=True)

display(df.tail())



print (df.describe())


ax = None
title= 'TC_W2V probabilistic'
graph_coherence_score(df, ['mallet_TC_W2V', 'lda_TC_W2V', 'lsi_TC_W2V'], title)
graph_k(df, ['mallet_TC_W2V_k', 'lda_TC_W2V_k', 'lsi_TC_W2V_k'], title)

title= 'TC_W2V comparison'
graph_coherence_score(df, ['mallet_TC_W2V', 'NMF_TC_W2V'], title)
graph_k(df, ['mallet_TC_W2V_k', 'NMF_TC_W2V_k'], title)

title= 'Unify probabilistic'
graph_coherence_score(df, ['mallet_Unify', 'lda_Unify', 'lsi_Unify'], title)
graph_k(df, ['mallet_Unify_k', 'lda_Unify_k', 'lsi_Unify_k'], title)  

title= 'Unify comparison'
graph_coherence_score(df, ['mallet_Unify',  'NMF_Unify'], title)
graph_k(df, ['mallet_Unify_k', 'NMF_Unify_k'], title)


['Unnamed: 0', 'Unnamed: 0.1', 'window', 'window.1', 'mallet_TC_W2V', 'mallet_TC_W2V_k', 'lda_TC_W2V', 'lda_TC_W2V_k', 'lsi_TC_W2V', 'lsi_TC_W2V_k', 'NMF_TC_W2V', 'NMF_TC_W2V_k', 'NMF_Unify', 'NMF_Unify_k', 'mallet_Unify', 'mallet_Unify_k', 'lda_Unify', 'lda_Unify_k', 'lsi_Unify', 'lsi_Unify_k']
Unnamed: 0 Unnamed: 0.1 window window.1 mallet_TC_W2V mallet_TC_W2V_k lda_TC_W2V lda_TC_W2V_k lsi_TC_W2V lsi_TC_W2V_k NMF_TC_W2V NMF_TC_W2V_k NMF_Unify NMF_Unify_k mallet_Unify mallet_Unify_k lda_Unify lda_Unify_k lsi_Unify lsi_Unify_k
window
2017_03 62 62 2017_03 2017_03 0.368832 22 0.344859 28 0.336967 24 0.375299 28 0.690587 22 0.577780 12 0.369719 26 0.376879 18
2017_04 63 63 2017_04 2017_04 0.366006 12 0.341303 20 0.326046 12 0.374008 24 0.650894 20 0.583312 20 0.468642 14 0.385933 10
2017_05 64 64 2017_05 2017_05 0.360582 22 0.333149 28 0.319035 12 0.357726 30 0.677018 14 0.583450 28 0.320494 26 0.389233 10
2017_06 65 65 2017_06 2017_06 0.381813 28 0.335946 12 0.328623 22 0.373138 18 0.725945 10 0.607344 12 0.301083 24 0.342559 30
2017_07 66 66 2017_07 2017_07 0.349201 22 0.339650 28 0.322636 30 0.347636 22 0.694142 16 0.582325 22 0.335810 28 0.452428 26
       Unnamed: 0  Unnamed: 0.1  mallet_TC_W2V  mallet_TC_W2V_k  lda_TC_W2V  \
count   67.000000     67.000000      67.000000        67.000000   67.000000   
mean    33.000000     33.000000       0.374285        21.223881    0.354714   
std     19.485037     19.485037       0.012844         6.227556    0.020813   
min      0.000000      0.000000       0.342095        10.000000    0.313523   
25%     16.500000     16.500000       0.369215        16.000000    0.340154   
50%     33.000000     33.000000       0.376602        22.000000    0.356079   
75%     49.500000     49.500000       0.381630        26.000000    0.369604   
max     66.000000     66.000000       0.418281        30.000000    0.403460   

       lda_TC_W2V_k  lsi_TC_W2V  lsi_TC_W2V_k  NMF_TC_W2V  NMF_TC_W2V_k  \
count     67.000000   67.000000     67.000000   67.000000     67.000000   
mean      18.955224    0.355349     14.208955    0.381597     20.089552   
std        6.798914    0.026423      5.795864    0.016945      6.745888   
min       10.000000    0.210950     10.000000    0.343917     10.000000   
25%       12.000000    0.341698     10.000000    0.369761     14.000000   
50%       20.000000    0.357003     12.000000    0.382074     22.000000   
75%       24.000000    0.369276     18.000000    0.392224     26.000000   
max       30.000000    0.400399     30.000000    0.422887     30.000000   

       NMF_Unify  NMF_Unify_k  mallet_Unify  mallet_Unify_k  lda_Unify  \
count  67.000000    67.000000     67.000000       67.000000  67.000000   
mean    0.672085    18.865672      0.570267       21.582090   0.346530   
std     0.049464     7.008427      0.046503        5.934361   0.059256   
min     0.553187    10.000000      0.342099       10.000000   0.249121   
25%     0.642056    12.000000      0.563148       18.000000   0.304975   
50%     0.682867    18.000000      0.577800       22.000000   0.337970   
75%     0.707024    24.000000      0.597061       26.000000   0.374178   
max     0.764088    30.000000      0.627599       30.000000   0.592133   

       lda_Unify_k  lsi_Unify  lsi_Unify_k  
count    67.000000  66.000000    67.000000  
mean     21.432836   0.458723    13.492537  
std       6.731257   0.059270     6.162873  
min      10.000000   0.342559    10.000000  
25%      16.000000   0.409362    10.000000  
50%      22.000000   0.461874    10.000000  
75%      27.000000   0.497757    12.000000  
max      30.000000   0.579197    30.000000  
CPU times: user 10.4 s, sys: 1.66 s, total: 12.1 s
Wall time: 10.9 s

In [9]:
def get_nmf_dynamic_topics(k, dictionary, topn=20):
    
    topic_list = []
    topic_df = pd.read_pickle('dynamic_nmf/data/windowbin/result/dynamic.df/dynamic_k%s.pkl' % (k))

    for c in topic_df.ix[:,1:].columns:
        # filter out any token not in 
        if dictionary is not None:
            topic = [x for x in topic_df[c].tolist() if x in dictionary.token2id.keys()]
        else:
            topic = topic_df[c].tolist() 
        topic_list.append(topic[:topn])
        
#     print(topic_list)
    return topic_list

In [85]:
%time


log.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=log.INFO)

actual_kmin=10
actual_kmax=100
df_dynamic = pd.DataFrame(index=list(range(actual_kmin, actual_kmax + 2, 2)))

                          
model_path = 'data/eos/word2vec_model_all.model'
log.info("Loading Word2Vec model from %s ..." % model_path)
model = gensim.models.Word2Vec.load(model_path)

metric = ModelSimilarity(model)
validation_measure = WithinTopicMeasure(metric)


# TC-W2V NMF models 
name = 'NMF_dynamic_tc-w2v'
print (name)    

coherence = []
for k in range(actual_kmin, actual_kmax + 2, 2):
    get_nmf_dynamic_topics(k, None, 20)
    coherence_score = validation_measure.evaluate_rankings(
            get_nmf_dynamic_topics(k, None, 20))

    print("Model coherence dynamic (k=%d) = %.4f" % (k, coherence_score))
    coherence.append(coherence_score)

print(coherence)

df_dynamic['%s_TC_W2V' % name] = coherence

display(df_dynamic)    
df_dynamic.to_csv('dynamic_nmf/data/windowbin/csv/dynamic_coherence_k_performance.csv')


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.58 µs
NMF_dynamic_tc-w2v
Model coherence dynamic (k=10) = 0.4356
Model coherence dynamic (k=12) = 0.4188
Model coherence dynamic (k=14) = 0.4212
Model coherence dynamic (k=16) = 0.4112
Model coherence dynamic (k=18) = 0.4116
Model coherence dynamic (k=20) = 0.4124
Model coherence dynamic (k=22) = 0.4111
Model coherence dynamic (k=24) = 0.4075
Model coherence dynamic (k=26) = 0.4048
Model coherence dynamic (k=28) = 0.4063
Model coherence dynamic (k=30) = 0.4078
Model coherence dynamic (k=32) = 0.4071
Model coherence dynamic (k=34) = 0.4111
Model coherence dynamic (k=36) = 0.4118
Model coherence dynamic (k=38) = 0.4119
Model coherence dynamic (k=40) = 0.4121
Model coherence dynamic (k=42) = 0.4127
Model coherence dynamic (k=44) = 0.4116
Model coherence dynamic (k=46) = 0.4085
Model coherence dynamic (k=48) = 0.4077
Model coherence dynamic (k=50) = 0.4119
Model coherence dynamic (k=52) = 0.4073
Model coherence dynamic (k=54) = 0.4057
Model coherence dynamic (k=56) = 0.4087
Model coherence dynamic (k=58) = 0.4054
Model coherence dynamic (k=60) = 0.4049
Model coherence dynamic (k=62) = 0.4037
Model coherence dynamic (k=64) = 0.4016
Model coherence dynamic (k=66) = 0.3992
Model coherence dynamic (k=68) = 0.4036
Model coherence dynamic (k=70) = 0.4035
Model coherence dynamic (k=72) = 0.4034
Model coherence dynamic (k=74) = 0.4036
Model coherence dynamic (k=76) = 0.4010
Model coherence dynamic (k=78) = 0.4025
Model coherence dynamic (k=80) = 0.3999
Model coherence dynamic (k=82) = 0.3989
Model coherence dynamic (k=84) = 0.3997
Model coherence dynamic (k=86) = 0.4008
Model coherence dynamic (k=88) = 0.4030
Model coherence dynamic (k=90) = 0.4003
Model coherence dynamic (k=92) = 0.3990
Model coherence dynamic (k=94) = 0.3993
Model coherence dynamic (k=96) = 0.4010
Model coherence dynamic (k=98) = 0.4000
Model coherence dynamic (k=100) = 0.3989
[0.43558251710222085, 0.41878171980054857, 0.42124753384467567, 0.41122089970597364, 0.41164816784264041, 0.41240486115331842, 0.41105816954271646, 0.40751686464697973, 0.40481427279611248, 0.40628028383916986, 0.40780652618912072, 0.40707285165923679, 0.41109877112004828, 0.41176407969384399, 0.41193576538593951, 0.41205235363049358, 0.41266146583333646, 0.4116385111469873, 0.40846269162858245, 0.4076680482684214, 0.41185767759162739, 0.40729751288400162, 0.40569684912949844, 0.40866860616530992, 0.40539166023667061, 0.40487143859536995, 0.40373536477744421, 0.40163646080418197, 0.39917533912041958, 0.40362254188464924, 0.40346109657009804, 0.40344426250858789, 0.40358695332550576, 0.40095118387119649, 0.40246291609985824, 0.39994385595474424, 0.39889532370122138, 0.39967259254734849, 0.40075715748832119, 0.4030340950441138, 0.40025185200399926, 0.39898919130985688, 0.39932140870381505, 0.40098986553158994, 0.40000084904370448, 0.39891480233048637]
NMF_dynamic_tc-w2v_TC_W2V
10 0.435583
12 0.418782
14 0.421248
16 0.411221
18 0.411648
20 0.412405
22 0.411058
24 0.407517
26 0.404814
28 0.406280
30 0.407807
32 0.407073
34 0.411099
36 0.411764
38 0.411936
40 0.412052
42 0.412661
44 0.411639
46 0.408463
48 0.407668
50 0.411858
52 0.407298
54 0.405697
56 0.408669
58 0.405392
60 0.404871
62 0.403735
64 0.401636
66 0.399175
68 0.403623
70 0.403461
72 0.403444
74 0.403587
76 0.400951
78 0.402463
80 0.399944
82 0.398895
84 0.399673
86 0.400757
88 0.403034
90 0.400252
92 0.398989
94 0.399321
96 0.400990
98 0.400001
100 0.398915

In [86]:
display(df_dynamic.describe())


NMF_dynamic_tc-w2v_TC_W2V
count 46.000000
mean 0.406725
std 0.006955
min 0.398895
25% 0.401152
50% 0.405544
75% 0.411190
max 0.435583

In [87]:
%%time

## DO NOT RUN AGAIN 2 days...
dictionary_filepath = 'data/eos/dictionary_EOS_all.dict'
bow_filepath = 'data/eos/corpus_EOS_all.mm'  

corpus = gensim.corpora.MmCorpus(bow_filepath)
print('here')
dictionary = gensim.corpora.Dictionary.load_from_text(dictionary_filepath)  
corpus_text = MyDocuments('dynamic_nmf/data/windowbin/slices/tokenized_window_all.gz') 

coherence = []


name = 'NMF_dynamic_unify'
print ('here')

for k in range(actual_kmin, actual_kmax + 2, 2):
    cm = CoherenceModel(dictionary=dictionary, corpus=corpus, 
                        texts=corpus_text, topics=get_nmf_dynamic_topics(k, dictionary, 20), 
                        coherence='c_v')

    coherence_score = cm.get_coherence()                
    print("Model coherence dynamic (k=%d) = %.4f" % (k, coherence_score))
    coherence.append(coherence_score)

print(coherence)

df_dynamic['%s_TC_W2V' % name] = coherence

display(df_dynamic)    
df_dynamic.to_csv('dynamic_nmf/data/windowbin/csv/dynamic_coherence_k_performance.csv')


here
here
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 247, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 397, in _send_bytes
    self._send(header)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
  File "/usr/lib/python3.5/multiprocessing/queues.py", line 247, in _feed
    send_bytes(obj)
Model coherence dynamic (k=10) = 0.5969
Model coherence dynamic (k=12) = 0.5724
Model coherence dynamic (k=14) = 0.5690
Model coherence dynamic (k=16) = 0.5471
Model coherence dynamic (k=18) = 0.5514
Model coherence dynamic (k=20) = 0.5519
Model coherence dynamic (k=22) = 0.5585
Model coherence dynamic (k=24) = 0.5574
Model coherence dynamic (k=26) = 0.5577
Model coherence dynamic (k=28) = 0.5662
Model coherence dynamic (k=30) = 0.5622
Model coherence dynamic (k=32) = 0.5714
Model coherence dynamic (k=34) = 0.5778
Model coherence dynamic (k=36) = 0.5809
Model coherence dynamic (k=38) = 0.5886
Model coherence dynamic (k=40) = 0.6010
Model coherence dynamic (k=42) = 0.6084
Model coherence dynamic (k=44) = 0.6066
Model coherence dynamic (k=46) = 0.5931
Model coherence dynamic (k=48) = 0.5866
Model coherence dynamic (k=50) = 0.6024
Model coherence dynamic (k=52) = 0.5971
Model coherence dynamic (k=54) = 0.5879
Model coherence dynamic (k=56) = 0.5952
Model coherence dynamic (k=58) = 0.5838
Model coherence dynamic (k=60) = 0.5825
Model coherence dynamic (k=62) = 0.5859
Model coherence dynamic (k=64) = 0.5850
Model coherence dynamic (k=66) = 0.5791
Model coherence dynamic (k=68) = 0.5863
Model coherence dynamic (k=70) = 0.5854
Model coherence dynamic (k=72) = 0.5803
Model coherence dynamic (k=74) = 0.5804
Model coherence dynamic (k=76) = 0.5761
Model coherence dynamic (k=78) = 0.5748
Model coherence dynamic (k=80) = 0.5771
Model coherence dynamic (k=82) = 0.5703
Model coherence dynamic (k=84) = 0.5692
Model coherence dynamic (k=86) = 0.5744
Model coherence dynamic (k=88) = 0.5746
Model coherence dynamic (k=90) = 0.5695
Model coherence dynamic (k=92) = 0.5716
Model coherence dynamic (k=94) = 0.5741
Model coherence dynamic (k=96) = 0.5736
Model coherence dynamic (k=98) = 0.5721
Model coherence dynamic (k=100) = 0.5711
[0.59685718083157935, 0.57243815670398845, 0.56895329132197026, 0.54707846683242112, 0.55137140918704242, 0.55186270140030058, 0.55845470837134059, 0.55743752327028473, 0.55767427855501883, 0.56620148424714889, 0.56222318265270921, 0.57137359173250646, 0.5778136078291054, 0.58087685945564449, 0.5885965236172247, 0.6010096272688884, 0.60836754481607058, 0.60656226372342048, 0.59306467556595066, 0.5865661419409226, 0.60237448166429564, 0.59707855499849294, 0.58793466474759892, 0.5952005474862172, 0.58381854164007219, 0.58251507820379977, 0.58593599795534312, 0.58496830541521672, 0.57908672406237216, 0.58634242321268726, 0.58542058245629314, 0.58032406219262689, 0.58036681626225683, 0.5761011045486214, 0.57478888810070772, 0.57706021208912905, 0.57027452178779592, 0.56916148291614554, 0.57439777567311323, 0.57457949881571801, 0.56949653726024396, 0.57157180018176712, 0.57414290114975475, 0.57357983234057686, 0.57209878558009886, 0.5711301616598794]
NMF_dynamic_tc-w2v_TC_W2V NMF_dynamic_unify_TC_W2V
10 0.435583 0.596857
12 0.418782 0.572438
14 0.421248 0.568953
16 0.411221 0.547078
18 0.411648 0.551371
20 0.412405 0.551863
22 0.411058 0.558455
24 0.407517 0.557438
26 0.404814 0.557674
28 0.406280 0.566201
30 0.407807 0.562223
32 0.407073 0.571374
34 0.411099 0.577814
36 0.411764 0.580877
38 0.411936 0.588597
40 0.412052 0.601010
42 0.412661 0.608368
44 0.411639 0.606562
46 0.408463 0.593065
48 0.407668 0.586566
50 0.411858 0.602374
52 0.407298 0.597079
54 0.405697 0.587935
56 0.408669 0.595201
58 0.405392 0.583819
60 0.404871 0.582515
62 0.403735 0.585936
64 0.401636 0.584968
66 0.399175 0.579087
68 0.403623 0.586342
70 0.403461 0.585421
72 0.403444 0.580324
74 0.403587 0.580367
76 0.400951 0.576101
78 0.402463 0.574789
80 0.399944 0.577060
82 0.398895 0.570275
84 0.399673 0.569161
86 0.400757 0.574398
88 0.403034 0.574579
90 0.400252 0.569497
92 0.398989 0.571572
94 0.399321 0.574143
96 0.400990 0.573580
98 0.400001 0.572099
100 0.398915 0.571130
CPU times: user 2h 53min 14s, sys: 1min 46s, total: 2h 55min
Wall time: 2d 12h 24min 5s

In [169]:
# Graph

df_dynamic = pd.read_csv('dynamic_nmf/data/windowbin/csv/dynamic_coherence_k_performance.csv') 



# df_dynamic.describe()
df_dynamic.columns = ['k', 'NMF dynamic TC-W2V', 'NMF dynamic unify']
df_dynamic.set_index('k', inplace=True)

print('Dynamic tc-w2v')
# print(get_nmf_dynamic_topics(10, None, 20))
print(df_dynamic.sort_values(by=['NMF dynamic TC-W2V'],  ascending=False).head())

print('Dynamic unify')
print(get_nmf_dynamic_topics(42, None, 20))  
print(df_dynamic.sort_values(by=['NMF dynamic unify'],  ascending=False).head())
    
ax = df_dynamic[['NMF dynamic unify']].plot(xticks=df_dynamic.index)
plt.xticks(rotation=90, fontsize=4)
plt.xlabel('dynamic')

plt.ylabel('score')
plt.title('dynamic coherence')
plt.legend(loc='best')
plt.show()
plt.savefig('data/eos/graphs/dynamic_coherence_plot.png', dpi=800)   

df_dynamic.head()
# df_dynamic.describe()


Dynamic tc-w2v
    NMF dynamic TC-W2V  NMF dynamic unify
k                                        
10            0.435583           0.596857
14            0.421248           0.568953
12            0.418782           0.572438
42            0.412661           0.608368
20            0.412405           0.551863
Dynamic unify
[['can', 'will', 'one', 'year', 'go', 'like', 'get', 'make', 'people', 'say', 'time', 'good', 'world', 'know', 'see', 'just', 'now', 'work', 'think', 'many'], ['shelling', 'damascus', 'suburbs', 'regime', 'homs', 'neighborhood', 'idlib', 'forces', 'daraa', 'report', 'fierce', 'hama', 'town', 'fsa', 'city', 'martyrs', 'artillery', 'al', 'mortar', 'army'], ['syrian', 'syria', 'assad', 'opposition', 'talk', 'geneva', 'say', 'foreign', 'arab', 'peace', 'al_assad', 'regime', 'damascus', 'government', 'president_bashar', 'meeting', 'political', 'conference', 'support', 'terrorism'], ['israel', 'israeli', 'palestinian', 'jerusalem', 'palestinians', 'hamas', 'gaza', 'netanyahu', 'west_bank', 'jewish', 'israelis', 'peace', 'aqsa', 'abbas', 'palestine', 'arab', 'jews', 'jordan', 'benjamin_netanyahu', 'state'], ['turkey', 'turkish', 'erdogan', 'ankara', 'kurdish', 'istanbul', 'pkk', 'border', 'syria', 'davutoglu', 'kurds', 'tayyip_erdogan', 'syrian', 'nato', 'party', 'ypg', 'armenian', 'recep_tayyip', 'visit', 'coup'], ['iran', 'nuclear', 'iranian', 'tehran', 'sanction', 'talk', 'saudi_arabia', 'program', 'islamic', 'rouhani', 'foreign', 'power', 'israel', 'country', 'us', 'iranians', 'saudi', 'republic', 'zarif', 'united_states'], ['refugee', 'child', 'million', 'jordan', 'unhcr', 'syrian_refugee', '000', 'humanitarian', 'people', 'country', 'aid', 'flee', 'syria', 'un', 'syrians', 'conflict', 'need', 'camp', 'crisis', 'food'], ['iraq', 'iraqi', 'baghdad', 'sunni', 'maliki', 'shi', 'ite', 'government', 'kurdish', 'prime_minister', 'shiite', 'country', 'violence', 'sectarian', 'iraqis', 'kurds', 'sunnis', 'sadr', 'al', 'say'], ['russia', 'russian', 'putin', 'moscow', 'ukraine', 'lavrov', 'president_vladimir', 'kremlin', 'say', 'nato', 'syria', 'military', 'sanction', 'air', 'foreign', 'ukrainian', 'meeting', 'crimea', 'relation', 'russians'], ['oil', 'market', 'price', 'barrel', 'opec', 'production', 'crude', 'oil_price', 'percent', 'output', 'company', 'rise', 'energy', 'export', 'low', 'supply', 'cut', 'high', 'year', 'cent'], ['alert', 'click_here', 'disable', 'remove', 'add', 'daesh', 'afp', 'al', 'file', 'dubai', 'follow', 'united_nations', 'bashar_al', 'anadolu_agency', 'army', 'uae', 'business', 'al_qaeda', 'hamdan', 'dewa'], ['chemical', 'weapons', 'use', 'syria', 'opcw', 'syrian', 'attack', 'say', 'gas', 'destroy', 'prohibition', 'stockpile', 'assad', 'report', 'un', 'damascus', 'destruction', 'sarin', 'arsenal', 'mustard'], ['mosul', 'iraqi', 'forces', 'city', 'iraq', 'islamic_state', 'fallujah', 'troop', 'militant', 'daesh', 'retake', 'civilian', 'say', 'baghdad', 'offensive', 'operation', 'army', 'western', 'control', 'fighters'], ['say', 'police', 'suspect', 'man', 'attack', 'arrested', 'video', 'french', 'charge', 'terrorism', 'paris', 'terrorist', 'terror', 'attacks', 'court', 'group', 'authority', 'year_old', 'france', 'london'], ['un', 'council', 'resolution', 'syria', 'security', 'united_nations', 'say', 'annan', 'observer', 'violence', 'mission', 'secretary_general', 'international', 'humanitarian', 'league', 'peace', 'plan', 'arab', 'call', 'ceasefire'], ['full_story', 'channel_news', 'read', 'asia', '2014', 'syria', 'daily_mail', 'jerusalem', 'sunday', 'saturday', 'cbs_news', 'al_jazeera', 'monday', 'huffington_post', 'independent', 'friday', 'post', 'tuesday', 'radiofreeeurope', 'iraq'], ['quot', 'say', 'the', 'we', 'it', 'add', 'annan', 'tell', 'afp', 'there', 'this', 'reuters', 'assad', 'they', 'apos', 'if', 'but', 'terrorists', 'call', 'that'], ['window_click', 'share', 'new', 'opens', 'iraqinews', 'com', 'click', 'reddit_opens', 'google', 'twitter_opens', '_opens', 'window', 'email', 'add', 'print', 'facebook', 'archival_photo', 'source', 'baghdad', 'friend'], ['trump', 'donald_trump', 'clinton', 'president', 'republican', 'ban', 'say', 'mr', 'campaign', 'hillary_clinton', 'order', 'white_house', 'election', 'policy', 'muslim', 'candidate', 'debate', 'democratic', 'america', 'united_states'], ['lebanon', 'hezbollah', 'lebanese', 'beirut', 'syria', 'syrian', 'border', 'assad', 'tripoli', 'group', 'war', 'sunni', 'say', 'nasrallah', 'lebanese_army', 'rebel', 'shi', 'civil', 'arsal', 'fighter'], ['yemen', 'saudi', 'hadi', 'yemeni', 'saudi_arabia', 'saleh', 'al_qaeda', 'houthis', 'houthi', 'mansour', 'aden', 'sanaa', 'rebels', 'militant', 'riyadh', 'say', 'government', 'ali_abdullah', 'president_abd', 'official'], ['migrant', 'eu', 'refugee', 'europe', 'greece', 'border', 'germany', 'european', 'greek', 'say', 'european_union', 'boat', 'migration', 'turkey', 'asylum_seeker', 'country', 'merkel', 'island', 'italy', 'macedonia'], ['js', 'function', 'replyid', 'var', 'width', 'fjs', 'border', 'id', 'isloggedin', 'rferl', 'comment', 'height', 'ffffff', 'color', 'if', 'background', 'replycomment', 'solid', 'left', 'shwtimer'], ['photo', 'istanbul', 'photos', 'turkish', 'turkey', 'amid', 'curfew', 'police', 'protest', 'destruction', 'day', 'clashes', 'celebrate', 'sur', 'across', 'visitor', 'isil', 'aegean', 'photograph', 'depict'], ['apos', 'it', 'assad', 'amp', 'don', 'com', 'apo', 'au', 'reuters', 'they', 'that', 'we', 'he', 're', '2012', 've', 'ite', 'http', 'australia', 'shi'], ['egypt', 'egyptian', 'cairo', 'morsi', 'president', 'protest', 'muslim_brotherhood', 'brotherhood', 'sisi', 'islamist', 'mubarak', 'government', 'mursi', 'supporter', 'arab', 'political', 'state', 'say', 'interim', 'constitution'], ['isis', 'iraq', 'syria', 'group', 'fighters', 'video', 'terror', 'baghdadi', 'terrorist', 'attacks', 'jihadi', 'claim', 'daily_mail', 'join', 'extremist', 'independent', 'caliphate', 'leader', 'abu_bakr', 'capture'], ['match', 'seed', 'win', 'round', 'play', 'ivanovic', 'open', 'williams', 'game', 'final', 'player', 'set', 'second', 'first', 'beat', 'australian', 'tournament', 'champion', 'third', 'wimbledon'], ['provide', 'project', 'support', 'management', 'development', 'work', 'humanitarian', 'experience', 'program', 'ensure', 'need', 'programme', 'assistance', 'will', 'staff', 'health', 'activity', 'information', 'training', 'food'], ['killed', 'baghdad', 'wound', 'car', 'attack', 'bomb', 'people', 'suicide', 'police', 'attacks', 'least', 'blast', 'killing', 'bombing', 'security', 'say', 'source', 'bomber', 'capital', 'bombings'], ['canada', 'canadian', 'trudeau', 'say', 'canadians', 'ottawa', 'will', 'government', 'mission', 'nato', 'liberal', 'defence', 'minister', 'justin_trudeau', 'harper', 'liberals', 'prime_minister', 'military', 'isil', 'sajjan'], ['al', 'terrorist', 'army', 'unit', 'daraa', 'source', 'destroy', 'countryside', 'nusra', 'jabhat_al', 'group', 'area', 'province', 'terrorist_organization', 'sana', 'baghdadi', 'number', 'terrorism', 'vehicle', 'position'], ['obama', 'white_house', 'president_barack', 'president', 'washington', 'say', 'hagel', 'will', 'congress', 'us', 'american', 'republican', 'united_states', 'policy', 'administration', 'leader', 'secretary', 'bush', 'military', 'romney'], ['iraq', 'registration', 'article', 'news', 'please', 'p_articles', 'iraqupdates', 'monthly_basis', 'inbox', 'rss_feed', 'subscriber', 'relat', 'everyday', 'subscription', 'students', 'newsletter', 'register', 'archive', 'weekly', 'headline'], ['mr', 'labour', 'corbyn', 'vote', 'party', 'britain', 'mp', 'british', 'jeremy_corbyn', 'uk', 'prime_minister', 'election', 'leader', 'london', 'will', 'parliament', 'scotland', 'australia', 'cameron', 'campaign'], ['30', 'reuters', 'getty', '2016', '2017', '16', '13', '15', '14', 'image', '29', '17', '18', '19', '11', '12', '10', '27', '21', '22'], ['islamic_state', 'group', 'kurdish', 'fighters', 'say', 'militant', 'syria', 'raqqa', 'forces', 'strike', 'isil', 'coalition', 'air_strike', 'syrian', 'town', 'sdf', 'fighting', 'airstrike', 'northern', 'ypg'], ['2010', '2011', 'comment', '2009', 'arab', 'arabic', 'post', 'news', '2012', 'august', 'september', 'october', 'december', 'january', 'july', 'november', 'read', 'february', 'story', 'tags'], ['use', 'osc', 'english', 'original', 'dissemination', 'copyrighted_material', 'usage', 'reproduction', 'contain', 'purpose', 'product', 'subject', 'authorize', 'national', 'policy', 'copyright', 'united_states', 'security', 'government', 'lebanon'], ['aleppo', 'rebel', 'syrian', 'rebels', 'say', 'city', 'government', 'area', 'activist', 'killed', 'civilian', 'held', 'assad', 'syria', 'regime', 'opposition', 'syrian_observatory', 'army', 'observatory', 'human_rights'], ['china', 'chinese', 'beijing', 'north_korea', 'japan', 'xi', 'trade', 'global', 'summit', 'economic', 'asia', 'will', 'india', 'country', 'issue', 'international', 'cooperation', 'development', 'chen', 'nuclear'], ['us', 'afghanistan', 'afghan', 'military', 'taliban', 'troop', 'pakistan', 'nato', 'add', 'kabul', 'washington', 'american', 'war', 'iraq', 'united_states', 'forces', 'pentagon', 'karzai', 'force', 'defense']]
    NMF dynamic TC-W2V  NMF dynamic unify
k                                        
42            0.412661           0.608368
44            0.411639           0.606562
50            0.411858           0.602374
40            0.412052           0.601010
52            0.407298           0.597079
Out[169]:
NMF dynamic TC-W2V NMF dynamic unify
k
10 0.435583 0.596857
12 0.418782 0.572438
14 0.421248 0.568953
16 0.411221 0.547078
18 0.411648 0.551371

In [191]:
# Get the top 5 topic words in one window from LDA Mallet and NMF for comparison
nmf_model = 'dynamic_nmf/data/windowbin/result/tf_idf_tokenized_window_2016*.pkl'

mallet = 'data/eos/mallet/Malletmodel_%s_K_%s.pkl'
windows = ['2016_01', '2016_02', '2016_03', '2016_04','2016_05','2016_06'
#            '2016_07', '2016_09','2016_10','2016_11','2016_12'
          ]


df = pd.read_csv('dynamic_nmf/data/windowbin/csv/windows_coherence_k_performance.csv')
df.set_index(df['window'], inplace=True)

df = df[df['window'].isin(windows)]
df.head()


for index, row in df.iterrows():
    print(index, 'NMF', row['NMF_TC_W2V_k'])
    print(get_nmf_topics(row['window'], row['NMF_TC_W2V_k'], None, 5))
    ldamodel = joblib.load(mallet % (row['window'], row['mallet_TC_W2V_k']))
    print(index, 'Mallet', row['mallet_TC_W2V_k'])
    print(get_topics(ldamodel, row['mallet_TC_W2V_k'], 5))
    

# ldamodel = joblib.load(fname)
# topic_num = 
# print(get_topics(ldamodel, topic_num, 5))


2016_01 NMF 24
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['child', 'humanitarian', 'un', 'aid', 'people'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['inmate', 'jail', 'hallock', 'escape', 'orange_county'], ['talk', 'opposition', 'syrian', 'geneva', 'peace'], ['iran', 'rouhani', 'sanction', 'iranian', 'nuclear'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['refugee', 'merkel', 'germany', 'border', 'migrant'], ['match', 'murray', 'australian', 'win', 'round'], ['russian', 'airspace', 'russia', 'turkey', 'turkish'], ['iran', 'saudi_arabia', 'saudi', 'shi', 'riyadh'], ['oil', 'opec', 'price', 'market', 'production'], ['iraq', 'islamic_state', 'strike', 'iraqi', 'coalition'], ['monastery', 'elijah', 'st', 'mosul', 'church'], ['video', 'paris', 'attacks', 'islamic_state', 'jihadi'], ['egypt', 'suspect', 'bomb', 'airport', 'source'], ['china', 'xi', 'chinese', 'egypt', 'president_xi'], ['migrant', 'greece', 'boat', 'turkey', 'europe'], ['turkey', 'kurdish', 'pkk', 'biden', 'ypg'], ['canada', 'canadian', 'canadians', 'defence', 'isis'], ['israel', 'israeli', 'palestinian', 'palestinians', 'jewish'], ['deir_al', 'zor', 'group', 'killed', 'syrian'], ['will', 'mr', 'say', 'can', 'one'], ['mansour', 'pickup_truck', 'car', 'crash', 'firefighter']]
2016_01 Mallet 24
[['russia', 'russian', 'nato', 'moscow', 'turkey'], ['turkey', 'turkish', 'kurdish', 'pkk', 'ankara'], ['british', 'report', 'mr', 'case', 'court'], ['refugee', 'migrant', 'europe', 'germany', 'border'], ['alert', 'click_here', 'add', 'remove', 'disable'], ['china', 'region', 'state', 'political', 'support'], ['oil', 'market', 'price', 'percent', 'low'], ['question', 'mr', 'party', 'trump', 'make'], ['jail', 'escape', 'man', 'charge', 'inmate'], ['police', 'islamic_state', 'video', 'group', '-attacks-'], ['day', 'film', 'time', 'family', 'life'], ['group', 'city', '-killed-', '-forces-', 'area'], ['britain', 'minister', 'prime_minister', 'mr', 'party'], ['israel', 'report', 'israeli', 'palestinian', '-war-'], ['iran', 'saudi_arabia', 'saudi', 'iranian', 'yemen'], ['work', 'school', 'service', 'project', 'student'], ['2016', 'jan', 'people', 'reply_alert', 'good'], ['people', 'child', 'refugee', 'million', 'aid'], ['-iraq-', 'u.s.', 'military', 'islamic_state', 'canada'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['iran', 'sanction', 'deal', 'nuclear', 'visit'], ['open', 'win', 'match', 'australian', 'time'], ['share', 'window_click', 'iraqi', '-iraq-', 'isis'], ['talk', 'syrian', 'opposition', 'peace', 'geneva']]
2016_02 NMF 20
[['photo', 'photos', 'istanbul', 'turkey', 'amid'], ['ceasefire', 'russia', 'syria', 'cessation', 'hostility'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['will', 'can', 'armenia', 'one', 'armenian'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['talk', 'geneva', 'opposition', 'syrian', 'hnc'], ['migrant', 'greece', 'refugee', 'border', 'europe'], ['turkey', 'turkish', 'kurdish', 'ypg', 'pyd'], ['iran', 'saudi_arabia', 'saudi', 'lebanon', 'troop'], ['canada', 'canadian', 'citizenship', 'canadians', 'trudeau'], ['aleppo', 'syrian', 'rebel', 'russian', 'city'], ['plane', 'passenger', 'hole', 'say', 'mogadishu'], ['trump', 'republican', 'rubio', 'bush', 'cruz'], ['baghdad', 'suicide', 'killed', 'people', 'attack'], ['oil', 'barrel', 'production', 'price', 'opec'], ['taliban', 'afghanistan', 'afghan', 'fraser', 'kabul'], ['refugee', 'jordan', 'million', 'provide', 'aid'], ['shakil', 'son', 'woman', 'say', 'judge'], ['islamic_state', 'libya', 'iraq', 'mosul', 'iraqi'], ['israel', 'israeli', 'palestinian', 'palestinians', 'jerusalem']]
2016_02 Mallet 26
[['refugee', 'migrant', 'border', 'europe', 'greece'], ['people', 'refugee', 'million', 'aid', 'jordan'], ['canada', 'canadian', 'plane', 'north', 'north_korea'], ['-killed-', 'people', '-attacks-', 'group', '-attack-'], ['child', 'family', 'woman', 'life', 'time'], ['syrian', 'group', 'talk', 'russia', 'opposition'], ['political', 'people', 'world', 'make', '-war-'], ['police', 'court', 'case', 'charge', 'group'], ['israel', 'israeli', 'afghanistan', 'palestinian', 'taliban'], ['2016', 'february', 'reuters', 'feb', 'report'], ['trump', 'mr', 'republican', 'candidate', 'win'], ['armenian', 'church', 'university', 'community', 'student'], ['turkey', 'russia', 'russian', 'turkish', 'kurdish'], ['international', 'security', 'state', 'minister', 'government'], ['day', 'photo', 'local', 'military', 'continue'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['win', 'move', 'team', 'world', 'make'], ['iran', 'military', 'u.s.', 'saudi_arabia', 'libya'], ['2016', 'feb', 'alert_moderator', 'people', 'government'], ['eu', 'britain', 'deal', 'prime_minister', 'uk'], ['oil', 'market', 'china', 'price', 'million'], ['syrian', 'aleppo', 'government', 'city', 'border'], ['day', 'continue', 'area', 'security', 'military'], ['alert', 'click_here', 'add', 'remove', 'disable'], ['work', 'service', 'include', 'provide', 'project'], ['isis', '-iraq-', 'share', 'islamic_state', 'iraqi']]
2016_03 NMF 22
[['photos', 'photo', 'istanbul', 'turkey', 'turkish'], ['islamic_state', 'say', 'strike', 'group', 'iraq'], ['brussels', 'belgian', 'attacks', 'paris', 'abdeslam'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['greece', 'migrant', 'refugee', 'eu', 'turkey'], ['talk', 'syrian', 'geneva', 'peace', 'say'], ['child', 'year', 'will', 'work', 'country'], ['palmyra', 'syrian', 'city', 'army', 'islamic_state'], ['trump', 'republican', 'rubio', 'donald_trump', 'say'], ['habib', 'girlfriend', 'gatineau', 'charge', 'court'], ['kurdish', 'turkey', 'pkk', 'attack', 'ankara'], ['putin', 'russian', 'russia', 'moscow', 'syria'], ['yemen', 'saudi', 'aden', 'houthis', 'houthi'], ['easter', 'francis', 'st', 'pope', 'jesus'], ['libya', 'tunisia', 'tunisian', 'guerdan', 'militant'], ['canada', 'canadian', 'trudeau', 'sajjan', 'war'], ['israel', 'israeli', 'palestinian', 'palestinians', 'biden'], ['iraqi', 'baghdad', 'mosul', 'iraq', 'dam'], ['north_korea', 'iran', 'sanction', 'nuclear', 'north_korean'], ['hezbollah', 'lebanon', 'saudi_arabia', 'arab', 'iran'], ['provide', 'support', 'food', 'assistance', 'aid']]
2016_03 Mallet 28
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['syrian', 'talk', 'government', 'peace', 'opposition'], ['work', 'support', 'international', 'provide', 'development'], ['government', 'party', 'parliament', 'freedom', 'journalist'], ['israel', 'israeli', 'council', 'security', 'palestinian'], ['group', 'u.s.', 'official', 'document', 'security'], ['yemen', 'saudi_arabia', 'arab', 'saudi', 'lebanon'], ['russia', 'russian', 'moscow', 'putin', 'military'], ['university', 'news', 'work', 'picture', 'book'], ['-attack-', 'turkey', '-killed-', 'turkish', 'kurdish'], ['police', 'charge', 'court', 'man', 'video'], ['armenian', 'armenia', 'world', 'church', 'genocide'], ['brussels', '-attacks-', 'paris', 'belgian', 'police'], ['alert', 'click_here', 'add', 'remove', 'palmyra'], ['oil', 'market', 'million', 'company', 'percent'], ['-iraq-', 'iraqi', 'islamic_state', '-forces-', 'group'], ['eu', 'turkey', 'europe', 'european', 'prime_minister'], ['child', 'family', 'canada', 'home', 'school'], ['islamic_state', 'group', 'libya', '-iraq-', 'military'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['world', '-war-', 'time', 'political', 'question'], ['trump', 'obama', 'u.s.', 'state', 'republican'], ['team', 'play', 'win', 'game', 'world'], ['iran', 'share', 'window_click', '-weapons-', 'nuclear'], ['march', '2016', 'report', 'reuters', 'post'], ['2016', 'mar', 'reply_alert', 'moderator', 'australia'], ['official', 'uae', 'plane', 'bank', 'system'], ['refugee', 'migrant', 'border', 'greece', 'turkey']]
2016_04 NMF 22
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['say', 'will', 'year', 'can', 'people'], ['window_click', 'share', 'new', 'iraqinews', 'opens'], ['aleppo', 'hospital', 'rebel', 'held', 'air_strike'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['obama', 'germany', 'merkel', 'trade', 'president_barack'], ['provide', 'carter', 'iraq', 'support', 'food'], ['sadr', 'green', 'parliament', 'zone', 'protester'], ['abu_sayyaf', 'ridsdel', 'philippine', 'hostage', 'ransom'], ['refugee', 'turkey', 'migrant', 'eu', 'greece'], ['soldiers', 'light', 'training', 'canadian', 'combat'], ['30', '2016', 'april', '21', '23'], ['trudeau', 'quantum_computing', 'reporter', 'canadian', 'abuzz'], ['russian', 'russia', 'putin', 'moscow', 'ukraine'], ['baghdad', 'shi', 'suicide', 'ite', 'wound'], ['chaiyakorn', 'man', 'thai', 'son', 'tourist'], ['talk', 'geneva', 'opposition', 'syrian', 'peace'], ['israel', 'israeli', 'golan_heights', 'netanyahu', 'golan'], ['armenian', 'azerbaijan', 'armenia', 'armenians', 'genocide'], ['trump', 'clinton', 'policy', 'republican', 'speech'], ['yemen', 'iran', 'saudi', 'saudi_arabia', 'kuwait'], ['isis', 'islamic_state', 'iraq', 'forces', 'syria']]
2016_04 Mallet 20
[['2016', 'april', '21', 'reuters', 'world'], ['police', '-attacks-', 'brussels', 'man', 'suspect'], ['isis', '-iraq-', 'islamic_state', 'group', '-forces-'], ['oil', 'million', 'company', 'percent', 'market'], ['child', 'family', 'woman', 'court', 'mother'], ['obama', 'president', 'mr', 'britain', 'eu'], ['world', 'state', 'international', 'member', 'call'], ['time', 'family', 'work', 'day', 'life'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['report', 'turkey', 'turkish', '2016', 'reuters'], ['alert', 'click_here', 'add', 'israel', 'remove'], ['syrian', 'aleppo', 'government', 'city', 'area'], ['share', 'military', 'troop', '-forces-', '-soldiers-'], ['development', 'agreement', 'support', 'national', 'future'], ['iran', 'saudi_arabia', 'saudi', 'obama', 'u.s.'], ['group', 'canadian', 'abu_sayyaf', 'government', 'hostage'], ['russia', 'russian', 'talk', 'peace', 'syrian'], ['2016', 'make', 'time', 'law', 'case'], ['refugee', 'migrant', 'turkey', 'child', 'europe'], ['parliament', 'police', 'man', 'government', 'protester']]
2016_05 NMF 22
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['baghdad', 'suicide', 'wound', 'killed', 'car'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['taliban', 'mansour', 'pakistan', 'afghan', 'afghanistan'], ['humanitarian', 'aid', 'international', 'health', 'world'], ['aleppo', 'syrian', 'syria', 'rebel', 'say'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['30', '2016', 'getty', 'april', 'may'], ['hezbollah', 'badreddine', 'commander', 'lebanese', 'group'], ['fallujah', 'iraqi', 'city', 'forces', 'iraq'], ['protester', 'zone', 'green', 'sadr', 'baghdad'], ['refugee', 'migrant', 'greece', 'europe', 'camp'], ['turkey', 'turkish', 'kurdish', 'pkk', 'erdogan'], ['egyptair', 'plane', 'egyptian', 'flight', 'cairo'], ['say', 'trump', 'will', 'go', 'can'], ['isis', '74', 'iraq', 'fighters', 'raqqa'], ['australia', 'police', 'man', 'boat', 'australian'], ['provide', 'fort_mcmurray', 'support', 'management', 'edmonton'], ['israel', 'israeli', 'jerusalem', 'palestinian', 'palestinians'], ['oil', 'iran', 'opec', 'saudi_arabia', 'barrel'], ['libya', 'government', 'sirte', 'islamic_state', 'say'], ['islamic_state', 'strike', 'kilis', 'syria', 'turkish']]
2016_05 Mallet 30
[['trump', 'reply_alert', 'australia', 'moderator', 'news'], ['share', 'libya', 'window_click', 'source', 'security'], ['humanitarian', 'work', 'provide', 'world', 'support'], ['israel', 'school', 'israeli', 'child', 'student'], ['city', 'area', 'month', 'home', 'local'], ['isis', 'iraqi', '-iraq-', 'city', '-forces-'], ['court', 'government', 'case', 'journalist', 'release'], ['wednesday', 'fire', 'fort_mcmurray', 'canada', 'city'], ['turkey', 'turkish', 'islamic_state', 'border', 'militant'], ['family', 'man', 'child', 'woman', 'picture'], ['party', 'president', 'political', 'government', 'election'], ['oil', 'company', 'million', 'market', 'business'], ['international', 'security', 'peace', 'state', 'council'], ['april', 'image', 'reuters', 'getty', 'london'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['iran', 'hezbollah', 'group', 'iranian', 'saudi_arabia'], ['-baghdad-', '-killed-', 'security', '-attacks-', '-iraq-'], ['alert', 'click_here', 'add', 'remove', 'disable'], ['britain', 'eu', 'uk', 'mr', 'british'], ['world', '-war-', 'make', 'time', 'fact'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['show', 'world', 'watch', 'video', 'film'], ['refugee', 'migrant', 'europe', 'turkey', 'border'], ['syrian', 'aleppo', 'government', 'city', 'civilian'], ['police', 'group', 'man', 'isis', 'terrorist'], ['plane', 'egyptian', 'flight', 'egypt', 'egyptair'], ['taliban', 'leader', '-mansour-', 'afghanistan', 'pakistan'], ['play', 'win', 'team', 'game', 'player'], ['russia', 'russian', 'military', 'nato', '-war-'], ['u.s.', '-iraq-', 'military', '-forces-', 'troop']]
2016_06 NMF 22
[['photo', 'photos', 'istanbul', 'turkey', 'turkish'], ['say', 'ali', 'go', 'can', 'will'], ['fallujah', 'iraqi', 'city', 'forces', 'iraq'], ['alert', 'click_here', 'disable', 'remove', 'add'], ['mateen', 'orlando', 'fbi', 'say', 'pulse'], ['window_click', 'share', 'new', 'opens', 'iraqinews'], ['aleppo', 'rebel', 'syrian', 'syria', 'aid'], ['cox', 'jo', 'mp', 'labour', 'mair'], ['airport', 'turkey', 'istanbul', 'attack', 'suicide'], ['refugee', 'unhcr', 'english', 'emergency', 'million'], ['30', '2016', 'june', 'getty', 'reuters'], ['israel', 'israeli', 'palestinian', 'palestinians', 'jerusalem'], ['isis', 'iraq', 'video', 'fighters', 'jihadi'], ['iran', 'yemen', 'saudi', 'saudi_arabia', 'say'], ['manbij', 'sdf', 'syrian', 'fighters', 'forces'], ['trump', 'clinton', 'obama', 'donald_trump', 'ban'], ['russia', 'russian', 'putin', 'assad', 'moscow'], ['yazidis', 'genocide', 'yazidi', 'say', 'crimes'], ['jordan', 'jordanian', 'border', 'attack', 'amman'], ['falluja', 'iraqi', 'baghdad', 'islamic_state', 'army'], ['project', 'development', 'work', 'program', 'provide'], ['eu', 'brexit', 'vote', 'britain', 'referendum']]
2016_06 Mallet 24
[['iraqi', 'city', '-forces-', 'fallujah', 'isis'], ['alert', 'click_here', 'add', 'remove', 'disable'], ['turkey', 'state', 'government', 'turkish', 'party'], ['-attack-', 'airport', 'turkey', 'istanbul', '-killed-'], ['russia', 'iran', 'russian', 'state', 'moscow'], ['military', 'u.s.', 'group', '-iraq-', 'official'], ['eu', 'ali', 'vote', 'britain', 'leave'], ['service', 'mr', 'community', 'nsw', 'significant'], ['june', 'report', 'news', 'file', 'publish'], ['company', 'market', 'million', 'business', 'percent'], ['trump', 'obama', 'u.s.', 'president', 'united_states'], ['israel', 'israeli', 'palestinian', 'arab', 'air'], ['family', 'child', 'work', 'day', 'school'], ['cox', 'jo', 'mp', 'labour', 'time'], ['syrian', 'area', 'aleppo', 'government', '-forces-'], ['mateen', 'orlando', 'shooting', 'nightclub', 'gay'], ['refugee', 'child', 'english', 'million', 'camp'], ['police', 'man', 'group', 'isis', 'suspect'], ['world', 'make', 'good', 'reply_alert', 'time'], ['share', 'al', 'window_click', 'dubai', 'yemen'], ['migrant', 'europe', 'germany', 'turkey', 'eu'], ['june', 'reuters', 'image', 'getty', 'ramadan'], ['photo', 'photos', 'turkey', 'istanbul', 'turkish'], ['work', 'international', 'include', 'support', 'council']]

In [71]:
%%time


from gensim import corpora
corpus_text = MyDocuments('dynamic_nmf/data/windowbin/slices/tokenized_window_all.gz') 

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(corpus_text)
dictionary.save_as_text('data/eos/dictionary_EOS_all.dict')


CPU times: user 5min 29s, sys: 7.97 s, total: 5min 37s
Wall time: 5min 37s

In [72]:
%%time


# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in corpus_text]
corpora.MmCorpus.serialize('data/eos/corpus_EOS_all.mm', corpus)


CPU times: user 7min 54s, sys: 8.06 s, total: 8min 2s
Wall time: 8min 4s

In [ ]:


In [13]:
# print(get_nmf_dynamic_topics(42, None, 20))  
df = pd.DataFrame(get_nmf_dynamic_topics(42, None, 20))
df.to_csv('experiment/dynamic_nmf_42.csv')

In [ ]: