Kmean

TSNE

Word Mover's distance

TF-IDF



In [1]:

    
import numpy as np

from sklearn.externals import joblib
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import cluster

from scipy import stats
import pickle
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)

import numpy as np  # a conventional alias
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib.pyplot import *
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing

from gensim import models
import gensim
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity
from gensim.models import Word2Vec

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from IPython.display import display
from time import time
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib notebook



In [2]:

    
df = pd.read_csv('data/windowbin/csv/all_dynamic_topic.csv')

# display(df)
index = df.iloc[:,2:3].values
# display(index)
df.drop(df.columns[[0,1,]], axis=1, inplace=True)
display(df.head())
display(df.iloc[:,1:].head())# used for corpus


# df1 = df.apply(lambda row: row.astype(str).str.contains('iran').any(), axis=1)
# display(df[df.apply(lambda row: row.astype(str).str.contains('iran').any(), axis=1)])









    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
    
  
  
    
      0
      tf_idf_tokenized_window_2012_11_01
      js
      fjs
      id
      type
      href
      name
      rferl
      cssclass
      link_button
      ...
      createelement
      parentnode
      insertbefore
      script
      russia
      3a
      georgia
      gulnara_karimova
      document
      hah
    
    
      1
      tf_idf_tokenized_window_2012_11_02
      syrian
      syria
      say
      turkey
      assad
      opposition
      rebels
      turkish
      rebel
      ...
      killed
      coalition
      foreign
      patriot
      president_bashar
      fighting
      month
      army
      activist
      people
    
    
      2
      tf_idf_tokenized_window_2012_11_03
      shelling
      damascus
      suburbs
      regime
      daraa
      fierce
      neighborhood
      martyrs
      artillery
      ...
      clashes
      demonstration
      area
      hama
      youtu
      warplane
      wound
      mortar
      due
      free
    
    
      3
      tf_idf_tokenized_window_2012_11_04
      israel
      gaza
      israeli
      hamas
      palestinian
      palestinians
      arab
      gaza_strip
      say
      ...
      will
      al
      jerusalem
      state
      egyptian
      tel_aviv
      un
      conflict
      netanyahu
      ceasefire
    
    
      4
      tf_idf_tokenized_window_2012_11_05
      news
      com
      http
      www
      in
      killed
      iraq
      html
      world
      ...
      kill
      car
      breaking
      ie
      feeds
      pakistan
      2012
      google
      wound
      facebook_reddit
    
  

5 rows × 31 columns







    






  
    
      
      1
      2
      3
      4
      5
      6
      7
      8
      9
      10
      ...
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
    
  
  
    
      0
      js
      fjs
      id
      type
      href
      name
      rferl
      cssclass
      link_button
      2f
      ...
      createelement
      parentnode
      insertbefore
      script
      russia
      3a
      georgia
      gulnara_karimova
      document
      hah
    
    
      1
      syrian
      syria
      say
      turkey
      assad
      opposition
      rebels
      turkish
      rebel
      border
      ...
      killed
      coalition
      foreign
      patriot
      president_bashar
      fighting
      month
      army
      activist
      people
    
    
      2
      shelling
      damascus
      suburbs
      regime
      daraa
      fierce
      neighborhood
      martyrs
      artillery
      al
      ...
      clashes
      demonstration
      area
      hama
      youtu
      warplane
      wound
      mortar
      due
      free
    
    
      3
      israel
      gaza
      israeli
      hamas
      palestinian
      palestinians
      arab
      gaza_strip
      say
      egypt
      ...
      will
      al
      jerusalem
      state
      egyptian
      tel_aviv
      un
      conflict
      netanyahu
      ceasefire
    
    
      4
      news
      com
      http
      www
      in
      killed
      iraq
      html
      world
      reuters
      ...
      kill
      car
      breaking
      ie
      feeds
      pakistan
      2012
      google
      wound
      facebook_reddit
    
  

5 rows × 30 columns

Word Embedding



In [3]:

    
def load_w2v(word2vec_model_file):
    # load the finished model from disk
    word2vec_model = Word2Vec.load(word2vec_model_file)
    word2vec_model.init_sims(replace=True)
    return word2vec_model



In [4]:

    
word2vec_model_file = '/home/sonic/sonic/eosdb/data/eos/word2vec_model_all.model'
word2vec_model = load_w2v(word2vec_model_file)



In [5]:

    
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.vocab]
    return np.mean(word2vec_model[doc], axis=0)



In [6]:

    
def document_to_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.vocab]
    return np.array(word2vec_model[doc])



In [7]:

    
def kmeans(X, figName='', metric='euclidean'):

    estimators = {
        '4': KMeans(n_clusters=4),
        '6': KMeans(n_clusters=6),
        '10': KMeans(n_clusters=10),
        '15': KMeans(n_clusters=15),
        '20': KMeans(n_clusters=20),
        '30': KMeans(n_clusters=30),
        '40': KMeans(n_clusters=40),
        '50': KMeans(n_clusters=50),
        '70': KMeans(n_clusters=70),  
        '80': KMeans(n_clusters=80), 
        '100': KMeans(n_clusters=100),
        '150': KMeans(n_clusters=150),
        '200': KMeans(n_clusters=200),
    } 

    fignum = 1
    minsil=-1
    bestlabels=[]
    
    

    for name, est in estimators.items():
        fig = plt.figure(fignum, figsize=(8, 8))
        plt.clf()
        ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

        plt.cla()
        est.fit(X)
        
        labels = est.labels_
        centroids = est.cluster_centers_
        
        sil=silhouette_score(X, labels, metric=metric)
        print ("silhouette of " + name+"=" +str(sil))
        if sil>minsil:
            bestlabels=labels
            minsil=sil
            numOfCluster=name

        colormap = plt.cm.gist_ncar  # nipy_spectral, Set1,Paired
        colorst = [colormap(i) for i in np.linspace(0, 0.9, int(name))]
        colors=[]
        for i in labels:
            colors.append(colorst[i])

        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=colors)  #X[:,3] means all rows column=3 ''''''c=labels.astype(np.float)

        ax.w_xaxis.set_ticklabels([])
        ax.w_yaxis.set_ticklabels([])
        ax.w_zaxis.set_ticklabels([])
        ax.set_xlabel(name)
        fignum = fignum + 1
        plt.title("%s k:%s" % (figName, name)) 
        plt.show()

   
    print ("Best silhouette =" + str(minsil))

    return numOfCluster,bestlabels



In [8]:

    
def analyze(X, figName='', metric='euclidean', show=False):
#     range_n_clusters = [2, 3, 4, 5, 6, 8, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200]
    range_n_clusters =  [30, 40, 50, 60, 70]
    
    best_k = 0
    min_score = 0
    bestlabels=[]

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(12, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        
        if silhouette_avg > min_score:
            bestlabels = cluster_labels
            min_score=silhouette_avg
            best_k=n_clusters

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("clusters silhouette plot")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        # plt.cm.gist_ncar or nipy_spectral        
        colors = cm.spectral(cluster_labels.astype(float) / n_clusters)

        ax2.scatter(X[:, 0], X[:, 1], X[:, 2], c=colors)
        
        ax2.set_xlabel(n_clusters)

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')

        ax2.set_title("clustere %s k:%s" % (figName, n_clusters))
#         ax2.set_xlabel("Feature space for the 1st feature")
#         ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on topic space "
                      "with n_clusters = %d" % n_clusters), fontsize=12, fontweight='bold')
    
    
        # Interactive figure
        if (show):
            fig2 = plt.figure(figsize=(8, 8))
            ax3 = Axes3D(fig2, rect=[0, 0, .95, 1], elev=48, azim=134)
            ax3.scatter(X[:, 0], X[:, 1], X[:, 2], c=colors)
            ax3.set_xlabel(n_clusters)
            ax3.w_xaxis.set_ticklabels([])
            ax3.w_yaxis.set_ticklabels([])
            ax3.w_zaxis.set_ticklabels([])
            ax3.set_title("%s cluster k:%s" % (figName, n_clusters))
        
        plt.show()
 
    return best_k, bestlabels



In [9]:

    
def getCorpus():
    corpus = df.iloc[:,3:].values.tolist()
    return corpus

TSNE experiments



In [10]:

    
%%time

corpus = getCorpus()

topic_w2v = np.array([document_vector(word2vec_model, doc) for doc in corpus])

X_embedded = TSNE(n_components=3, init='pca', verbose=2).fit_transform(topic_w2v)
# print(topic_w2v[0])

# n_components =20
# print("number of components: {}".format(n_components))
# X_reduced = TruncatedSVD(n_components=n_components, random_state=42).fit_transform(topic_w2v)
# print(X_reduced.shape)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 1546
[t-SNE] Computed conditional probabilities for sample 1546 / 1546
[t-SNE] Mean sigma: 0.099600
[t-SNE] Iteration 25: error = 1.4189861, gradient norm = 0.0218827
[t-SNE] Iteration 50: error = 1.3233318, gradient norm = 0.0182941
[t-SNE] Iteration 75: error = 1.0233327, gradient norm = 0.0080291
[t-SNE] Iteration 100: error = 0.9868832, gradient norm = 0.0094304
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.986883
[t-SNE] Iteration 125: error = 0.9117515, gradient norm = 0.0050173
[t-SNE] Iteration 150: error = 0.9109656, gradient norm = 0.0068414
[t-SNE] Iteration 175: error = 0.9053628, gradient norm = 0.0075644
[t-SNE] Iteration 200: error = 0.8981646, gradient norm = 0.0078038
[t-SNE] Iteration 225: error = 0.8905883, gradient norm = 0.0084922
[t-SNE] Iteration 250: error = 0.8864372, gradient norm = 0.0092010
[t-SNE] Iteration 275: error = 0.8732015, gradient norm = 0.0088293
[t-SNE] Iteration 300: error = 0.8533086, gradient norm = 0.0089430
[t-SNE] Iteration 325: error = 0.8383408, gradient norm = 0.0090336
[t-SNE] Iteration 350: error = 0.8169093, gradient norm = 0.0086195
[t-SNE] Iteration 375: error = 0.7946079, gradient norm = 0.0082852
[t-SNE] Iteration 400: error = 0.7690800, gradient norm = 0.0077375
[t-SNE] Iteration 425: error = 0.7541503, gradient norm = 0.0076468
[t-SNE] Iteration 450: error = 0.7454948, gradient norm = 0.0077390
[t-SNE] Iteration 475: error = 0.7299900, gradient norm = 0.0076015
[t-SNE] Iteration 500: error = 0.7068597, gradient norm = 0.0073691
[t-SNE] Iteration 525: error = 0.6942506, gradient norm = 0.0073081
[t-SNE] Iteration 550: error = 0.6771215, gradient norm = 0.0070034
[t-SNE] Iteration 575: error = 0.6629848, gradient norm = 0.0066949
[t-SNE] Iteration 600: error = 0.6483309, gradient norm = 0.0067359
[t-SNE] Iteration 625: error = 0.6380474, gradient norm = 0.0062304
[t-SNE] Iteration 650: error = 0.6385837, gradient norm = 0.0066448
[t-SNE] Iteration 675: error = 0.6283131, gradient norm = 0.0063935
[t-SNE] Iteration 700: error = 0.6085238, gradient norm = 0.0062830
[t-SNE] Iteration 725: error = 0.6018202, gradient norm = 0.0060179
[t-SNE] Iteration 750: error = 0.5928153, gradient norm = 0.0058105
[t-SNE] Iteration 775: error = 0.5835083, gradient norm = 0.0054687
[t-SNE] Iteration 800: error = 0.5855131, gradient norm = 0.0055564
[t-SNE] Iteration 825: error = 0.5666971, gradient norm = 0.0050397
[t-SNE] Iteration 850: error = 0.5644496, gradient norm = 0.0053086
[t-SNE] Iteration 875: error = 0.5799002, gradient norm = 0.0058943
[t-SNE] Iteration 900: error = 0.5560875, gradient norm = 0.0052589
[t-SNE] Iteration 925: error = 0.5507349, gradient norm = 0.0051035
[t-SNE] Iteration 950: error = 0.5548517, gradient norm = 0.0050829
[t-SNE] Iteration 975: error = 0.5434975, gradient norm = 0.0048078
[t-SNE] Iteration 1000: error = 0.5361019, gradient norm = 0.0048077
[t-SNE] Error after 1000 iterations: 0.986883
CPU times: user 1min 22s, sys: 1.94 s, total: 1min 24s
Wall time: 1min 23s



In [11]:

    
%%time


# No scaling
# numofClusters, bestlabels = kmeans(X_embedded)
best_k, bestlabels = analyze(X_embedded, figName="W2V")
print('best number of clusters: %s' % best_k)
df['label_w2v_no'] = bestlabels
display(df.head())









    














    











    



For n_clusters = 30 The average silhouette_score is : 0.340386747146






    



/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor






    














    











    



For n_clusters = 40 The average silhouette_score is : 0.355603028038






    














    











    



For n_clusters = 50 The average silhouette_score is : 0.364453787019






    














    











    



For n_clusters = 60 The average silhouette_score is : 0.342293280215






    














    











    



For n_clusters = 70 The average silhouette_score is : 0.341910846014
best number of clusters: 50






    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      22
      23
      24
      25
      26
      27
      28
      29
      30
      label_w2v_no
    
  
  
    
      0
      tf_idf_tokenized_window_2012_11_01
      js
      fjs
      id
      type
      href
      name
      rferl
      cssclass
      link_button
      ...
      parentnode
      insertbefore
      script
      russia
      3a
      georgia
      gulnara_karimova
      document
      hah
      36
    
    
      1
      tf_idf_tokenized_window_2012_11_02
      syrian
      syria
      say
      turkey
      assad
      opposition
      rebels
      turkish
      rebel
      ...
      coalition
      foreign
      patriot
      president_bashar
      fighting
      month
      army
      activist
      people
      20
    
    
      2
      tf_idf_tokenized_window_2012_11_03
      shelling
      damascus
      suburbs
      regime
      daraa
      fierce
      neighborhood
      martyrs
      artillery
      ...
      demonstration
      area
      hama
      youtu
      warplane
      wound
      mortar
      due
      free
      14
    
    
      3
      tf_idf_tokenized_window_2012_11_04
      israel
      gaza
      israeli
      hamas
      palestinian
      palestinians
      arab
      gaza_strip
      say
      ...
      al
      jerusalem
      state
      egyptian
      tel_aviv
      un
      conflict
      netanyahu
      ceasefire
      8
    
    
      4
      tf_idf_tokenized_window_2012_11_05
      news
      com
      http
      www
      in
      killed
      iraq
      html
      world
      ...
      car
      breaking
      ie
      feeds
      pakistan
      2012
      google
      wound
      facebook_reddit
      34
    
  

5 rows × 32 columns







    



CPU times: user 6.25 s, sys: 8.54 s, total: 14.8 s
Wall time: 4.92 s



In [12]:

    
%%time


# Scaled
X_embedded_scaled = preprocessing.scale(X_embedded)
best_k, bestlabels = analyze(X_embedded_scaled, figName="W2V scale")
print('best number of clusters: %s' % best_k)
df['label_w2v_scale'] = bestlabels
display(df.head())









    














    











    



For n_clusters = 30 The average silhouette_score is : 0.346398450368






    



/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor






    














    











    



For n_clusters = 40 The average silhouette_score is : 0.349881376175






    














    











    



For n_clusters = 50 The average silhouette_score is : 0.342778885706






    














    











    



For n_clusters = 60 The average silhouette_score is : 0.339568862722






    














    











    



For n_clusters = 70 The average silhouette_score is : 0.345787204445
best number of clusters: 40






    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      23
      24
      25
      26
      27
      28
      29
      30
      label_w2v_no
      label_w2v_scale
    
  
  
    
      0
      tf_idf_tokenized_window_2012_11_01
      js
      fjs
      id
      type
      href
      name
      rferl
      cssclass
      link_button
      ...
      insertbefore
      script
      russia
      3a
      georgia
      gulnara_karimova
      document
      hah
      36
      6
    
    
      1
      tf_idf_tokenized_window_2012_11_02
      syrian
      syria
      say
      turkey
      assad
      opposition
      rebels
      turkish
      rebel
      ...
      foreign
      patriot
      president_bashar
      fighting
      month
      army
      activist
      people
      20
      3
    
    
      2
      tf_idf_tokenized_window_2012_11_03
      shelling
      damascus
      suburbs
      regime
      daraa
      fierce
      neighborhood
      martyrs
      artillery
      ...
      area
      hama
      youtu
      warplane
      wound
      mortar
      due
      free
      14
      27
    
    
      3
      tf_idf_tokenized_window_2012_11_04
      israel
      gaza
      israeli
      hamas
      palestinian
      palestinians
      arab
      gaza_strip
      say
      ...
      jerusalem
      state
      egyptian
      tel_aviv
      un
      conflict
      netanyahu
      ceasefire
      8
      19
    
    
      4
      tf_idf_tokenized_window_2012_11_05
      news
      com
      http
      www
      in
      killed
      iraq
      html
      world
      ...
      breaking
      ie
      feeds
      pakistan
      2012
      google
      wound
      facebook_reddit
      34
      14
    
  

5 rows × 33 columns







    



CPU times: user 6.7 s, sys: 9 s, total: 15.7 s
Wall time: 5.11 s



In [13]:

    
%%time


# Normalized 
X_normalized = preprocessing.normalize(X_embedded, norm='l2')
best_k, bestlabels = analyze(X_normalized, figName="W2V normalize")
print('best number of clusters: %s' % best_k)
df['label_w2v_normalize'] = bestlabels
display(df.head())









    














    











    



For n_clusters = 30 The average silhouette_score is : 0.425847357464






    



/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor






    














    











    



For n_clusters = 40 The average silhouette_score is : 0.425286207796






    














    











    



For n_clusters = 50 The average silhouette_score is : 0.392950242071






    














    











    



For n_clusters = 60 The average silhouette_score is : 0.382804062511






    














    











    



For n_clusters = 70 The average silhouette_score is : 0.372795617088
best number of clusters: 30






    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      24
      25
      26
      27
      28
      29
      30
      label_w2v_no
      label_w2v_scale
      label_w2v_normalize
    
  
  
    
      0
      tf_idf_tokenized_window_2012_11_01
      js
      fjs
      id
      type
      href
      name
      rferl
      cssclass
      link_button
      ...
      script
      russia
      3a
      georgia
      gulnara_karimova
      document
      hah
      36
      6
      25
    
    
      1
      tf_idf_tokenized_window_2012_11_02
      syrian
      syria
      say
      turkey
      assad
      opposition
      rebels
      turkish
      rebel
      ...
      patriot
      president_bashar
      fighting
      month
      army
      activist
      people
      20
      3
      20
    
    
      2
      tf_idf_tokenized_window_2012_11_03
      shelling
      damascus
      suburbs
      regime
      daraa
      fierce
      neighborhood
      martyrs
      artillery
      ...
      hama
      youtu
      warplane
      wound
      mortar
      due
      free
      14
      27
      21
    
    
      3
      tf_idf_tokenized_window_2012_11_04
      israel
      gaza
      israeli
      hamas
      palestinian
      palestinians
      arab
      gaza_strip
      say
      ...
      state
      egyptian
      tel_aviv
      un
      conflict
      netanyahu
      ceasefire
      8
      19
      7
    
    
      4
      tf_idf_tokenized_window_2012_11_05
      news
      com
      http
      www
      in
      killed
      iraq
      html
      world
      ...
      ie
      feeds
      pakistan
      2012
      google
      wound
      facebook_reddit
      34
      14
      21
    
  

5 rows × 34 columns







    



CPU times: user 6.51 s, sys: 8.96 s, total: 15.5 s
Wall time: 5.07 s

Word Mover's Distance



In [14]:

    
X_wmd_distance_eos = pd.read_pickle('data/df_X_wmd_distance_eos.plk')
# print(X_wmd_distance_eos)

X_wmd_distance_eos = TSNE(n_components=3, init='pca', verbose=2).fit_transform(X_wmd_distance_eos)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 1546
[t-SNE] Computed conditional probabilities for sample 1546 / 1546
[t-SNE] Mean sigma: 1.001407
[t-SNE] Iteration 25: error = 1.4322207, gradient norm = 0.0227097
[t-SNE] Iteration 50: error = 1.2940395, gradient norm = 0.0184377
[t-SNE] Iteration 75: error = 1.0870367, gradient norm = 0.0099298
[t-SNE] Iteration 100: error = 1.0724896, gradient norm = 0.0119170
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.072490
[t-SNE] Iteration 125: error = 1.0104851, gradient norm = 0.0077116
[t-SNE] Iteration 150: error = 1.0190333, gradient norm = 0.0105304
[t-SNE] Iteration 175: error = 1.0037144, gradient norm = 0.0107932
[t-SNE] Iteration 200: error = 0.9737310, gradient norm = 0.0110103
[t-SNE] Iteration 225: error = 0.9447153, gradient norm = 0.0105912
[t-SNE] Iteration 250: error = 0.9243681, gradient norm = 0.0106318
[t-SNE] Iteration 275: error = 0.9146151, gradient norm = 0.0101901
[t-SNE] Iteration 300: error = 0.8847403, gradient norm = 0.0100836
[t-SNE] Iteration 325: error = 0.8630164, gradient norm = 0.0100205
[t-SNE] Iteration 350: error = 0.8355398, gradient norm = 0.0094951
[t-SNE] Iteration 375: error = 0.8184664, gradient norm = 0.0094181
[t-SNE] Iteration 400: error = 0.7997040, gradient norm = 0.0091967
[t-SNE] Iteration 425: error = 0.7847713, gradient norm = 0.0094029
[t-SNE] Iteration 450: error = 0.7702472, gradient norm = 0.0088543
[t-SNE] Iteration 475: error = 0.7543474, gradient norm = 0.0083614
[t-SNE] Iteration 500: error = 0.7352163, gradient norm = 0.0081374
[t-SNE] Iteration 525: error = 0.7285579, gradient norm = 0.0079570
[t-SNE] Iteration 550: error = 0.7338256, gradient norm = 0.0083008
[t-SNE] Iteration 575: error = 0.7094892, gradient norm = 0.0079181
[t-SNE] Iteration 600: error = 0.7088695, gradient norm = 0.0080201
[t-SNE] Iteration 625: error = 0.7071211, gradient norm = 0.0078997
[t-SNE] Iteration 650: error = 0.7068320, gradient norm = 0.0077355
[t-SNE] Iteration 675: error = 0.6964190, gradient norm = 0.0074916
[t-SNE] Iteration 700: error = 0.6924484, gradient norm = 0.0075237
[t-SNE] Iteration 725: error = 0.6896589, gradient norm = 0.0072161
[t-SNE] Iteration 750: error = 0.6862370, gradient norm = 0.0072295
[t-SNE] Iteration 775: error = 0.6779082, gradient norm = 0.0074539
[t-SNE] Iteration 800: error = 0.6771815, gradient norm = 0.0072414
[t-SNE] Iteration 825: error = 0.6755974, gradient norm = 0.0071828
[t-SNE] Iteration 850: error = 0.6778833, gradient norm = 0.0073407
[t-SNE] Iteration 875: error = 0.6716080, gradient norm = 0.0075684
[t-SNE] Iteration 900: error = 0.6623158, gradient norm = 0.0072712
[t-SNE] Iteration 925: error = 0.6603461, gradient norm = 0.0076386
[t-SNE] Iteration 950: error = 0.6507266, gradient norm = 0.0071244
[t-SNE] Iteration 975: error = 0.6522916, gradient norm = 0.0073010
[t-SNE] Iteration 1000: error = 0.6480749, gradient norm = 0.0071423
[t-SNE] Error after 1000 iterations: 1.072490



In [15]:

    
# Pairewise distance
best_k, bestlabels = analyze(X_wmd_distance_eos, figName="WMD distance")
print('best number of clusters: %s' % best_k)
df['label_wmd_distance'] = bestlabels

display(df.head())









    














    











    



For n_clusters = 30 The average silhouette_score is : 0.359614723431






    



/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor






    














    











    



For n_clusters = 40 The average silhouette_score is : 0.369713484049






    














    











    



For n_clusters = 50 The average silhouette_score is : 0.374482685379






    














    











    



For n_clusters = 60 The average silhouette_score is : 0.374255032651






    














    











    



For n_clusters = 70 The average silhouette_score is : 0.368823915341
best number of clusters: 50






    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      25
      26
      27
      28
      29
      30
      label_w2v_no
      label_w2v_scale
      label_w2v_normalize
      label_wmd_distance
    
  
  
    
      0
      tf_idf_tokenized_window_2012_11_01
      js
      fjs
      id
      type
      href
      name
      rferl
      cssclass
      link_button
      ...
      russia
      3a
      georgia
      gulnara_karimova
      document
      hah
      36
      6
      25
      25
    
    
      1
      tf_idf_tokenized_window_2012_11_02
      syrian
      syria
      say
      turkey
      assad
      opposition
      rebels
      turkish
      rebel
      ...
      president_bashar
      fighting
      month
      army
      activist
      people
      20
      3
      20
      35
    
    
      2
      tf_idf_tokenized_window_2012_11_03
      shelling
      damascus
      suburbs
      regime
      daraa
      fierce
      neighborhood
      martyrs
      artillery
      ...
      youtu
      warplane
      wound
      mortar
      due
      free
      14
      27
      21
      26
    
    
      3
      tf_idf_tokenized_window_2012_11_04
      israel
      gaza
      israeli
      hamas
      palestinian
      palestinians
      arab
      gaza_strip
      say
      ...
      egyptian
      tel_aviv
      un
      conflict
      netanyahu
      ceasefire
      8
      19
      7
      11
    
    
      4
      tf_idf_tokenized_window_2012_11_05
      news
      com
      http
      www
      in
      killed
      iraq
      html
      world
      ...
      feeds
      pakistan
      2012
      google
      wound
      facebook_reddit
      34
      14
      21
      2
    
  

5 rows × 35 columns



In [16]:

    
display(df.loc[df['label_wmd_distance'] == 5].head())









    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      25
      26
      27
      28
      29
      30
      label_w2v_no
      label_w2v_scale
      label_w2v_normalize
      label_wmd_distance
    
  
  
    
      274
      tf_idf_tokenized_window_2016_12_17
      putin
      russia
      russian
      say
      military
      moscow
      nato
      nuclear
      shoigu
      ...
      strong
      crimea
      threat
      year
      aggressor
      peace
      3
      17
      13
      5
    
    
      317
      tf_idf_tokenized_window_2012_06_06
      turkish
      turkey
      plane
      syrian
      jet
      syria
      airspace
      say
      nato
      ...
      fly
      warplane
      response
      meeting
      recep_tayyip
      violate
      17
      10
      29
      5
    
    
      322
      tf_idf_tokenized_window_2012_06_11
      russia
      helicopter
      syria
      russian
      ship
      say
      moscow
      lavrov
      arm
      ...
      vessel
      repair
      conflict
      port
      british
      stop
      17
      10
      7
      5
    
    
      375
      tf_idf_tokenized_window_2016_04_14
      russian
      russia
      putin
      moscow
      ukraine
      military
      kremlin
      syria
      aircraft
      ...
      netanyahu
      syrian
      official
      coordination
      washington
      ship
      17
      10
      29
      5
    
    
      413
      tf_idf_tokenized_window_2016_09_30
      russia
      putin
      russian
      moscow
      ukraine
      kremlin
      syria
      assad
      president_vladimir
      ...
      vote
      relation
      united_states
      accuse
      may
      print
      3
      17
      13
      5
    
  

5 rows × 35 columns



In [17]:

    
# Normalized 
X_wmd_normalized = preprocessing.normalize(X_wmd_distance_eos, norm='l2')

pca = PCA(n_components=3)
pca.fit(X_wmd_normalized)
X_wmd_normalized = pca.transform(X_wmd_normalized)

best_k, bestlabels = analyze(X_wmd_normalized,  figName="WMD normalized")
print('best number of clusters: %s' % best_k)
df['label_wmd_normalize'] = bestlabels









    



/usr/local/lib/python3.5/dist-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)






    














    











    



For n_clusters = 30 The average silhouette_score is : 0.426789669373






    



/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor






    














    











    



For n_clusters = 40 The average silhouette_score is : 0.421204624843






    














    











    



For n_clusters = 50 The average silhouette_score is : 0.425255270425






    














    











    



For n_clusters = 60 The average silhouette_score is : 0.409730192013






    














    











    



For n_clusters = 70 The average silhouette_score is : 0.411370124992
best number of clusters: 30



In [18]:

    
# Scaled
X_wmd_scaled = preprocessing.scale(X_wmd_distance_eos)

pca = PCA(n_components=3)
pca.fit(X_wmd_scaled)
X_wmd_scaled = pca.transform(X_wmd_scaled)

best_k, bestlabels = analyze(X_wmd_scaled, figName="WMD scaled")
df['label_wmd_scale'] = best_k
display(df.head())









    



/usr/local/lib/python3.5/dist-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)






    














    











    



For n_clusters = 30 The average silhouette_score is : 0.356698250569






    



/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor






    














    











    



For n_clusters = 40 The average silhouette_score is : 0.373808434184






    














    











    



For n_clusters = 50 The average silhouette_score is : 0.378579693242






    














    











    



For n_clusters = 60 The average silhouette_score is : 0.382404378202






    














    











    



For n_clusters = 70 The average silhouette_score is : 0.377502397647






    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      27
      28
      29
      30
      label_w2v_no
      label_w2v_scale
      label_w2v_normalize
      label_wmd_distance
      label_wmd_normalize
      label_wmd_scale
    
  
  
    
      0
      tf_idf_tokenized_window_2012_11_01
      js
      fjs
      id
      type
      href
      name
      rferl
      cssclass
      link_button
      ...
      georgia
      gulnara_karimova
      document
      hah
      36
      6
      25
      25
      16
      60
    
    
      1
      tf_idf_tokenized_window_2012_11_02
      syrian
      syria
      say
      turkey
      assad
      opposition
      rebels
      turkish
      rebel
      ...
      month
      army
      activist
      people
      20
      3
      20
      35
      6
      60
    
    
      2
      tf_idf_tokenized_window_2012_11_03
      shelling
      damascus
      suburbs
      regime
      daraa
      fierce
      neighborhood
      martyrs
      artillery
      ...
      wound
      mortar
      due
      free
      14
      27
      21
      26
      7
      60
    
    
      3
      tf_idf_tokenized_window_2012_11_04
      israel
      gaza
      israeli
      hamas
      palestinian
      palestinians
      arab
      gaza_strip
      say
      ...
      un
      conflict
      netanyahu
      ceasefire
      8
      19
      7
      11
      2
      60
    
    
      4
      tf_idf_tokenized_window_2012_11_05
      news
      com
      http
      www
      in
      killed
      iraq
      html
      world
      ...
      2012
      google
      wound
      facebook_reddit
      34
      14
      21
      2
      22
      60
    
  

5 rows × 37 columns

TF-IDF



In [19]:

    
%%time 


corpus_all  = []

for corpus_line in getCorpus():
    corpus_all.append(u' '.join(str(e) for e in corpus_line))
    
print(corpus_all[0])
    
vectorizer = CountVectorizer(max_df=0.5, min_df=5)
# vectorizer = CountVectorizer(max_df=0.5)
X_tfidf = vectorizer.fit_transform(corpus_all)  # a sparse matrix

vocab = vectorizer.get_feature_names()  # a list

print(len(vocab))

print(X_tfidf.shape)









    



id type href name rferl cssclass link_button 2f function facebook target var twitter russian true getelementbyid getelementsbytagname src createelement parentnode insertbefore script russia 3a georgia gulnara_karimova document hah 36 6 25 25 16 60
1614
(1546, 1614)
CPU times: user 68 ms, sys: 28 ms, total: 96 ms
Wall time: 68.7 ms

LSA



In [20]:

    
%%time


# Normalized 
X_tfidf_normalized = preprocessing.normalize(X_tfidf, norm='l2')

# LSA
X_tfidf_normalized = decomposition.TruncatedSVD(n_components=50).fit_transform(X_tfidf_normalized)

X_tfidf_normalized = TSNE(n_components=3, init='pca', verbose=2).fit_transform(X_tfidf_normalized)

best_k, bestlabels = analyze(X_tfidf_normalized, figName="TF-IDF normalized")
print('best number of clusters: %s' % best_k)
df['label_tfidf_normalize'] = bestlabels
display(df.head())









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...





    



/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype int64 was converted to float64 by the normalize function.
  warnings.warn(msg, _DataConversionWarning)






    



[t-SNE] Computed conditional probabilities for sample 1000 / 1546
[t-SNE] Computed conditional probabilities for sample 1546 / 1546
[t-SNE] Mean sigma: 0.247049
[t-SNE] Iteration 25: error = 1.3445078, gradient norm = 0.0219273
[t-SNE] Iteration 50: error = 1.2575742, gradient norm = 0.0188620
[t-SNE] Iteration 75: error = 1.0258642, gradient norm = 0.0092552
[t-SNE] Iteration 100: error = 0.9953162, gradient norm = 0.0115447
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.995316
[t-SNE] Iteration 125: error = 0.9432883, gradient norm = 0.0071119
[t-SNE] Iteration 150: error = 0.9443769, gradient norm = 0.0099680
[t-SNE] Iteration 175: error = 0.9219033, gradient norm = 0.0104395
[t-SNE] Iteration 200: error = 0.8945036, gradient norm = 0.0108962
[t-SNE] Iteration 225: error = 0.8704006, gradient norm = 0.0107456
[t-SNE] Iteration 250: error = 0.8362591, gradient norm = 0.0104092
[t-SNE] Iteration 275: error = 0.8115819, gradient norm = 0.0100990
[t-SNE] Iteration 300: error = 0.7805374, gradient norm = 0.0094049
[t-SNE] Iteration 325: error = 0.7466424, gradient norm = 0.0091737
[t-SNE] Iteration 350: error = 0.7220446, gradient norm = 0.0088565
[t-SNE] Iteration 375: error = 0.6973040, gradient norm = 0.0083821
[t-SNE] Iteration 400: error = 0.6857279, gradient norm = 0.0084970
[t-SNE] Iteration 425: error = 0.6632810, gradient norm = 0.0082965
[t-SNE] Iteration 450: error = 0.6479629, gradient norm = 0.0081684
[t-SNE] Iteration 475: error = 0.6296813, gradient norm = 0.0077444
[t-SNE] Iteration 500: error = 0.6153987, gradient norm = 0.0079152
[t-SNE] Iteration 525: error = 0.5953552, gradient norm = 0.0076479
[t-SNE] Iteration 550: error = 0.5807781, gradient norm = 0.0075347
[t-SNE] Iteration 575: error = 0.5590311, gradient norm = 0.0067664
[t-SNE] Iteration 600: error = 0.5544318, gradient norm = 0.0071716
[t-SNE] Iteration 625: error = 0.5300968, gradient norm = 0.0064636
[t-SNE] Iteration 650: error = 0.5264320, gradient norm = 0.0063041
[t-SNE] Iteration 675: error = 0.5223910, gradient norm = 0.0070001
[t-SNE] Iteration 700: error = 0.5004690, gradient norm = 0.0061851
[t-SNE] Iteration 725: error = 0.4935896, gradient norm = 0.0061987
[t-SNE] Iteration 750: error = 0.4904751, gradient norm = 0.0061189
[t-SNE] Iteration 775: error = 0.4826149, gradient norm = 0.0060531
[t-SNE] Iteration 800: error = 0.4671809, gradient norm = 0.0057716
[t-SNE] Iteration 825: error = 0.4606906, gradient norm = 0.0054221
[t-SNE] Iteration 850: error = 0.4699987, gradient norm = 0.0055279
[t-SNE] Iteration 875: error = 0.4491466, gradient norm = 0.0051439
[t-SNE] Iteration 900: error = 0.4520867, gradient norm = 0.0050965
[t-SNE] Iteration 925: error = 0.4482745, gradient norm = 0.0048697
[t-SNE] Iteration 950: error = 0.4463399, gradient norm = 0.0049469
[t-SNE] Iteration 975: error = 0.4524499, gradient norm = 0.0048623
[t-SNE] Iteration 1000: error = 0.4521287, gradient norm = 0.0055128
[t-SNE] Iteration 1000: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 1000 iterations: 0.995316






    



/usr/local/lib/python3.5/dist-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)






    














    











    



For n_clusters = 30 The average silhouette_score is : 0.415405135063






    



/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor






    














    











    



For n_clusters = 40 The average silhouette_score is : 0.425917020388






    














    











    



For n_clusters = 50 The average silhouette_score is : 0.437178245572






    














    











    



For n_clusters = 60 The average silhouette_score is : 0.431400422075






    














    











    



For n_clusters = 70 The average silhouette_score is : 0.408497924998
best number of clusters: 50






    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      28
      29
      30
      label_w2v_no
      label_w2v_scale
      label_w2v_normalize
      label_wmd_distance
      label_wmd_normalize
      label_wmd_scale
      label_tfidf_normalize
    
  
  
    
      0
      tf_idf_tokenized_window_2012_11_01
      js
      fjs
      id
      type
      href
      name
      rferl
      cssclass
      link_button
      ...
      gulnara_karimova
      document
      hah
      36
      6
      25
      25
      16
      60
      45
    
    
      1
      tf_idf_tokenized_window_2012_11_02
      syrian
      syria
      say
      turkey
      assad
      opposition
      rebels
      turkish
      rebel
      ...
      army
      activist
      people
      20
      3
      20
      35
      6
      60
      15
    
    
      2
      tf_idf_tokenized_window_2012_11_03
      shelling
      damascus
      suburbs
      regime
      daraa
      fierce
      neighborhood
      martyrs
      artillery
      ...
      mortar
      due
      free
      14
      27
      21
      26
      7
      60
      43
    
    
      3
      tf_idf_tokenized_window_2012_11_04
      israel
      gaza
      israeli
      hamas
      palestinian
      palestinians
      arab
      gaza_strip
      say
      ...
      conflict
      netanyahu
      ceasefire
      8
      19
      7
      11
      2
      60
      10
    
    
      4
      tf_idf_tokenized_window_2012_11_05
      news
      com
      http
      www
      in
      killed
      iraq
      html
      world
      ...
      google
      wound
      facebook_reddit
      34
      14
      21
      2
      22
      60
      9
    
  

5 rows × 38 columns







    



CPU times: user 1min 34s, sys: 11.6 s, total: 1min 46s
Wall time: 1min 34s



In [21]:

    
display(df.loc[df['label_tfidf_normalize'] == 7])









    






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      28
      29
      30
      label_w2v_no
      label_w2v_scale
      label_w2v_normalize
      label_wmd_distance
      label_wmd_normalize
      label_wmd_scale
      label_tfidf_normalize
    
  
  
    
      43
      tf_idf_tokenized_window_2013_03_34
      afghanistan
      military
      soldiers
      afghan
      violent
      combat
      say
      study
      smith
      ...
      crime
      australian
      man
      24
      33
      17
      0
      27
      60
      7
    
    
      82
      tf_idf_tokenized_window_2012_03_13
      afghan
      afghanistan
      soldier
      us
      bales
      say
      taliban
      base
      kandahar
      ...
      killings
      child
      war
      24
      33
      17
      0
      27
      60
      7
    
    
      206
      tf_idf_tokenized_window_2013_11_09
      karzai
      afghan
      afghanistan
      troop
      jirga
      sign
      pact
      loya
      say
      ...
      end
      delegate
      washington
      24
      33
      17
      0
      27
      60
      7
    
    
      229
      tf_idf_tokenized_window_2013_11_32
      us
      obama
      military
      washington
      american
      war
      policy
      iraq
      president_barack
      ...
      aid
      secretary
      department
      24
      33
      17
      22
      9
      60
      7
    
    
      256
      tf_idf_tokenized_window_2012_04_25
      afghanistan
      afghan
      us
      pakistan
      nato
      kabul
      obama
      taliban
      bounty
      ...
      suspend
      operations
      will
      24
      33
      17
      0
      27
      60
      7
    
    
      285
      tf_idf_tokenized_window_2016_12_28
      mattis
      general
      defense
      secretary
      military
      retire
      kelly
      marine
      command
      ...
      marshall
      job
      trump
      24
      33
      17
      12
      3
      60
      7
    
    
      447
      tf_idf_tokenized_window_2012_01_20
      us
      military
      obama
      afghanistan
      china
      defense
      strategy
      pentagon
      say
      ...
      wars
      pakistan
      president_barack
      24
      33
      17
      0
      27
      60
      7
    
    
      520
      tf_idf_tokenized_window_2017_06_35
      afghanistan
      taliban
      afghan
      tora_bora
      pakistan
      troop
      kabul
      bin_laden
      ghani
      ...
      sajjan
      group
      soldiers
      24
      33
      17
      0
      27
      60
      7
    
    
      592
      tf_idf_tokenized_window_2012_02_23
      us
      afghanistan
      afghan
      pakistan
      taliban
      nato
      military
      kabul
      obama
      ...
      drone
      breaking
      ie
      24
      33
      17
      0
      27
      60
      7
    
    
      595
      tf_idf_tokenized_window_2014_08_02
      obama
      say
      military
      us
      white_house
      iraq
      syria
      united_states
      strategy
      ...
      secretary
      threat
      protect
      24
      33
      17
      0
      27
      60
      7
    
    
      667
      tf_idf_tokenized_window_2012_05_08
      afghanistan
      afghan
      obama
      taliban
      troop
      war
      kabul
      us
      forces
      ...
      american
      say
      withdrawal
      24
      33
      17
      0
      27
      60
      7
    
    
      737
      tf_idf_tokenized_window_2014_06_04
      us
      iraq
      obama
      say
      military
      iraqi
      troop
      kerry
      washington
      ...
      official
      air_strike
      country
      24
      33
      17
      0
      27
      60
      7
    
    
      757
      tf_idf_tokenized_window_2014_09_02
      obama
      will
      say
      islamic_state
      president
      white_house
      congress
      american
      strategy
      ...
      house
      action
      war
      24
      33
      17
      0
      27
      60
      7
    
    
      846
      tf_idf_tokenized_window_2016_07_29
      afghanistan
      afghan
      kabul
      taliban
      hazara
      obama
      troop
      attack
      say
      ...
      ite
      80
      line
      24
      33
      17
      0
      27
      60
      7
    
    
      859
      tf_idf_tokenized_window_2013_05_06
      obama
      yemen
      drone
      drone_strike
      guantanamo
      detainee
      say
      al_qaida
      president
      ...
      americans
      speech
      president_barack
      24
      33
      17
      22
      9
      60
      7
    
    
      889
      tf_idf_tokenized_window_2015_12_14
      taliban
      mansour
      afghan
      mullah
      afghanistan
      pakistan
      akhtar
      kabul
      leader
      ...
      talk
      confirm
      longtime
      24
      33
      17
      0
      27
      60
      7
    
    
      894
      tf_idf_tokenized_window_2015_12_19
      obama
      president_barack
      oval_office
      threat
      san_bernardino
      california
      islamic_state
      americans
      white_house
      ...
      america
      group
      paris
      35
      33
      17
      32
      27
      60
      7
    
    
      990
      tf_idf_tokenized_window_2016_06_23
      taliban
      afghanistan
      afghan
      pakistan
      kabul
      mansour
      drone_strike
      mullah
      us
      ...
      last
      al_qaeda
      pakistani
      24
      33
      17
      0
      27
      60
      7
    
    
      1011
      tf_idf_tokenized_window_2015_11_12
      obama
      troop
      special
      deployment
      us
      ground
      syria
      president_barack
      forces
      ...
      broad
      military
      monday
      24
      33
      17
      0
      27
      60
      7
    
    
      1038
      tf_idf_tokenized_window_2014_05_13
      obama
      us
      afghanistan
      jarba
      syrian
      washington
      coalition
      military
      white_house
      ...
      non_lethal
      will
      security
      24
      33
      17
      0
      27
      60
      7
    
    
      1183
      tf_idf_tokenized_window_2017_03_08
      us
      trump
      mr
      donald_trump
      president
      military
      pentagon
      troop
      plan
      ...
      afghanistan
      deploy
      force
      24
      33
      17
      22
      9
      60
      7
    
    
      1223
      tf_idf_tokenized_window_2014_02_20
      hagel
      budget
      military
      pentagon
      army
      defense
      spending
      plan
      say
      ...
      size
      world
      billion
      44
      33
      17
      0
      27
      60
      7
    
    
      1237
      tf_idf_tokenized_window_2014_02_34
      afghan
      taliban
      afghanistan
      karzai
      pakistan
      release
      bagram
      us
      troop
      ...
      statement
      peace
      jail
      24
      33
      17
      0
      27
      60
      7
    
    
      1311
      tf_idf_tokenized_window_2016_05_04
      taliban
      mansour
      pakistan
      afghan
      afghanistan
      mullah
      leader
      drone_strike
      kabul
      ...
      deputy
      movement
      talk
      24
      33
      17
      0
      27
      60
      7
    
    
      1340
      tf_idf_tokenized_window_2013_10_05
      us
      afghanistan
      afghan
      karzai
      kabul
      pakistan
      troop
      say
      military
      ...
      000
      2014
      will
      24
      33
      17
      0
      27
      60
      7
    
    
      1403
      tf_idf_tokenized_window_2016_02_16
      taliban
      afghanistan
      afghan
      fraser
      kabul
      pakistan
      regime
      say
      troop
      ...
      face
      go
      problem
      24
      33
      17
      0
      27
      60
      7
    
    
      1513
      tf_idf_tokenized_window_2012_09_18
      afghan
      afghanistan
      taliban
      prison
      bagram
      kabul
      transfer
      troop
      detainees
      ...
      facility
      forces
      inmate
      24
      33
      17
      0
      27
      60
      7
    
  

27 rows × 38 columns



In [22]:

    
df.to_csv('data/windowbin/csv/result_all_dynamic_topic.csv')



In [ ]:



In [ ]:

	0	1	2	3	4	5	6	7	8	9	...	21	22	23	24	25	26	27	28	29	30
0	tf_idf_tokenized_window_2012_11_01	js	fjs	id	type	href	name	rferl	cssclass	link_button	...	createelement	parentnode	insertbefore	script	russia	3a	georgia	gulnara_karimova	document	hah
1	tf_idf_tokenized_window_2012_11_02	syrian	syria	say	turkey	assad	opposition	rebels	turkish	rebel	...	killed	coalition	foreign	patriot	president_bashar	fighting	month	army	activist	people
2	tf_idf_tokenized_window_2012_11_03	shelling	damascus	suburbs	regime	daraa	fierce	neighborhood	martyrs	artillery	...	clashes	demonstration	area	hama	youtu	warplane	wound	mortar	due	free
3	tf_idf_tokenized_window_2012_11_04	israel	gaza	israeli	hamas	palestinian	palestinians	arab	gaza_strip	say	...	will	al	jerusalem	state	egyptian	tel_aviv	un	conflict	netanyahu	ceasefire
4	tf_idf_tokenized_window_2012_11_05	news	com	http	www	in	killed	iraq	html	world	...	kill	car	breaking	ie	feeds	pakistan	2012	google	wound	facebook_reddit

	0	1	2	3	4	5	6	7	8	9	...	25	26	27	28	29	30	label_w2v_no	label_w2v_scale	label_w2v_normalize	label_wmd_distance
274	tf_idf_tokenized_window_2016_12_17	putin	russia	russian	say	military	moscow	nato	nuclear	shoigu	...	strong	crimea	threat	year	aggressor	peace	3	17	13	5
317	tf_idf_tokenized_window_2012_06_06	turkish	turkey	plane	syrian	jet	syria	airspace	say	nato	...	fly	warplane	response	meeting	recep_tayyip	violate	17	10	29	5
322	tf_idf_tokenized_window_2012_06_11	russia	helicopter	syria	russian	ship	say	moscow	lavrov	arm	...	vessel	repair	conflict	port	british	stop	17	10	7	5
375	tf_idf_tokenized_window_2016_04_14	russian	russia	putin	moscow	ukraine	military	kremlin	syria	aircraft	...	netanyahu	syrian	official	coordination	washington	ship	17	10	29	5
413	tf_idf_tokenized_window_2016_09_30	russia	putin	russian	moscow	ukraine	kremlin	syria	assad	president_vladimir	...	vote	relation	united_states	accuse	may	print	3	17	13	5

	0	1	2	3	4	5	6	7	8	9	...	28	29	30	label_w2v_no	label_w2v_scale	label_w2v_normalize	label_wmd_distance	label_wmd_normalize	label_wmd_scale	label_tfidf_normalize
43	tf_idf_tokenized_window_2013_03_34	afghanistan	military	soldiers	afghan	violent	combat	say	study	smith	...	crime	australian	man	24	33	17	0	27	60	7
82	tf_idf_tokenized_window_2012_03_13	afghan	afghanistan	soldier	us	bales	say	taliban	base	kandahar	...	killings	child	war	24	33	17	0	27	60	7
206	tf_idf_tokenized_window_2013_11_09	karzai	afghan	afghanistan	troop	jirga	sign	pact	loya	say	...	end	delegate	washington	24	33	17	0	27	60	7
229	tf_idf_tokenized_window_2013_11_32	us	obama	military	washington	american	war	policy	iraq	president_barack	...	aid	secretary	department	24	33	17	22	9	60	7
256	tf_idf_tokenized_window_2012_04_25	afghanistan	afghan	us	pakistan	nato	kabul	obama	taliban	bounty	...	suspend	operations	will	24	33	17	0	27	60	7
285	tf_idf_tokenized_window_2016_12_28	mattis	general	defense	secretary	military	retire	kelly	marine	command	...	marshall	job	trump	24	33	17	12	3	60	7
447	tf_idf_tokenized_window_2012_01_20	us	military	obama	afghanistan	china	defense	strategy	pentagon	say	...	wars	pakistan	president_barack	24	33	17	0	27	60	7
520	tf_idf_tokenized_window_2017_06_35	afghanistan	taliban	afghan	tora_bora	pakistan	troop	kabul	bin_laden	ghani	...	sajjan	group	soldiers	24	33	17	0	27	60	7
592	tf_idf_tokenized_window_2012_02_23	us	afghanistan	afghan	pakistan	taliban	nato	military	kabul	obama	...	drone	breaking	ie	24	33	17	0	27	60	7
595	tf_idf_tokenized_window_2014_08_02	obama	say	military	us	white_house	iraq	syria	united_states	strategy	...	secretary	threat	protect	24	33	17	0	27	60	7
667	tf_idf_tokenized_window_2012_05_08	afghanistan	afghan	obama	taliban	troop	war	kabul	us	forces	...	american	say	withdrawal	24	33	17	0	27	60	7
737	tf_idf_tokenized_window_2014_06_04	us	iraq	obama	say	military	iraqi	troop	kerry	washington	...	official	air_strike	country	24	33	17	0	27	60	7
757	tf_idf_tokenized_window_2014_09_02	obama	will	say	islamic_state	president	white_house	congress	american	strategy	...	house	action	war	24	33	17	0	27	60	7
846	tf_idf_tokenized_window_2016_07_29	afghanistan	afghan	kabul	taliban	hazara	obama	troop	attack	say	...	ite	80	line	24	33	17	0	27	60	7
859	tf_idf_tokenized_window_2013_05_06	obama	yemen	drone	drone_strike	guantanamo	detainee	say	al_qaida	president	...	americans	speech	president_barack	24	33	17	22	9	60	7
889	tf_idf_tokenized_window_2015_12_14	taliban	mansour	afghan	mullah	afghanistan	pakistan	akhtar	kabul	leader	...	talk	confirm	longtime	24	33	17	0	27	60	7
894	tf_idf_tokenized_window_2015_12_19	obama	president_barack	oval_office	threat	san_bernardino	california	islamic_state	americans	white_house	...	america	group	paris	35	33	17	32	27	60	7
990	tf_idf_tokenized_window_2016_06_23	taliban	afghanistan	afghan	pakistan	kabul	mansour	drone_strike	mullah	us	...	last	al_qaeda	pakistani	24	33	17	0	27	60	7
1011	tf_idf_tokenized_window_2015_11_12	obama	troop	special	deployment	us	ground	syria	president_barack	forces	...	broad	military	monday	24	33	17	0	27	60	7
1038	tf_idf_tokenized_window_2014_05_13	obama	us	afghanistan	jarba	syrian	washington	coalition	military	white_house	...	non_lethal	will	security	24	33	17	0	27	60	7
1183	tf_idf_tokenized_window_2017_03_08	us	trump	mr	donald_trump	president	military	pentagon	troop	plan	...	afghanistan	deploy	force	24	33	17	22	9	60	7
1223	tf_idf_tokenized_window_2014_02_20	hagel	budget	military	pentagon	army	defense	spending	plan	say	...	size	world	billion	44	33	17	0	27	60	7
1237	tf_idf_tokenized_window_2014_02_34	afghan	taliban	afghanistan	karzai	pakistan	release	bagram	us	troop	...	statement	peace	jail	24	33	17	0	27	60	7
1311	tf_idf_tokenized_window_2016_05_04	taliban	mansour	pakistan	afghan	afghanistan	mullah	leader	drone_strike	kabul	...	deputy	movement	talk	24	33	17	0	27	60	7
1340	tf_idf_tokenized_window_2013_10_05	us	afghanistan	afghan	karzai	kabul	pakistan	troop	say	military	...	000	2014	will	24	33	17	0	27	60	7
1403	tf_idf_tokenized_window_2016_02_16	taliban	afghanistan	afghan	fraser	kabul	pakistan	regime	say	troop	...	face	go	problem	24	33	17	0	27	60	7
1513	tf_idf_tokenized_window_2012_09_18	afghan	afghanistan	taliban	prison	bagram	kabul	transfer	troop	detainees	...	facility	forces	inmate	24	33	17	0	27	60	7