Kmean

TSNE

Word Mover's distance

TF-IDF


In [1]:
import numpy as np

from sklearn.externals import joblib
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import cluster

from scipy import stats
import pickle
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)

import numpy as np  # a conventional alias
from sklearn.feature_extraction.text import CountVectorizer
from matplotlib.pyplot import *
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing

from gensim import models
import gensim
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity
from gensim.models import Word2Vec

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from IPython.display import display
from time import time
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib notebook

In [2]:
df = pd.read_csv('data/windowbin/csv/all_dynamic_topic.csv')

# display(df)
index = df.iloc[:,2:3].values
# display(index)
df.drop(df.columns[[0,1,]], axis=1, inplace=True)
display(df.head())
display(df.iloc[:,1:].head())# used for corpus


# df1 = df.apply(lambda row: row.astype(str).str.contains('iran').any(), axis=1)
# display(df[df.apply(lambda row: row.astype(str).str.contains('iran').any(), axis=1)])


0 1 2 3 4 5 6 7 8 9 ... 21 22 23 24 25 26 27 28 29 30
0 tf_idf_tokenized_window_2012_11_01 js fjs id type href name rferl cssclass link_button ... createelement parentnode insertbefore script russia 3a georgia gulnara_karimova document hah
1 tf_idf_tokenized_window_2012_11_02 syrian syria say turkey assad opposition rebels turkish rebel ... killed coalition foreign patriot president_bashar fighting month army activist people
2 tf_idf_tokenized_window_2012_11_03 shelling damascus suburbs regime daraa fierce neighborhood martyrs artillery ... clashes demonstration area hama youtu warplane wound mortar due free
3 tf_idf_tokenized_window_2012_11_04 israel gaza israeli hamas palestinian palestinians arab gaza_strip say ... will al jerusalem state egyptian tel_aviv un conflict netanyahu ceasefire
4 tf_idf_tokenized_window_2012_11_05 news com http www in killed iraq html world ... kill car breaking ie feeds pakistan 2012 google wound facebook_reddit

5 rows × 31 columns

1 2 3 4 5 6 7 8 9 10 ... 21 22 23 24 25 26 27 28 29 30
0 js fjs id type href name rferl cssclass link_button 2f ... createelement parentnode insertbefore script russia 3a georgia gulnara_karimova document hah
1 syrian syria say turkey assad opposition rebels turkish rebel border ... killed coalition foreign patriot president_bashar fighting month army activist people
2 shelling damascus suburbs regime daraa fierce neighborhood martyrs artillery al ... clashes demonstration area hama youtu warplane wound mortar due free
3 israel gaza israeli hamas palestinian palestinians arab gaza_strip say egypt ... will al jerusalem state egyptian tel_aviv un conflict netanyahu ceasefire
4 news com http www in killed iraq html world reuters ... kill car breaking ie feeds pakistan 2012 google wound facebook_reddit

5 rows × 30 columns

Word Embedding


In [3]:
def load_w2v(word2vec_model_file):
    # load the finished model from disk
    word2vec_model = Word2Vec.load(word2vec_model_file)
    word2vec_model.init_sims(replace=True)
    return word2vec_model

In [4]:
word2vec_model_file = '/home/sonic/sonic/eosdb/data/eos/word2vec_model_all.model'
word2vec_model = load_w2v(word2vec_model_file)

In [5]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.vocab]
    return np.mean(word2vec_model[doc], axis=0)

In [6]:
def document_to_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.vocab]
    return np.array(word2vec_model[doc])

In [7]:
def kmeans(X, figName='', metric='euclidean'):

    estimators = {
        '4': KMeans(n_clusters=4),
        '6': KMeans(n_clusters=6),
        '10': KMeans(n_clusters=10),
        '15': KMeans(n_clusters=15),
        '20': KMeans(n_clusters=20),
        '30': KMeans(n_clusters=30),
        '40': KMeans(n_clusters=40),
        '50': KMeans(n_clusters=50),
        '70': KMeans(n_clusters=70),  
        '80': KMeans(n_clusters=80), 
        '100': KMeans(n_clusters=100),
        '150': KMeans(n_clusters=150),
        '200': KMeans(n_clusters=200),
    } 

    fignum = 1
    minsil=-1
    bestlabels=[]
    
    

    for name, est in estimators.items():
        fig = plt.figure(fignum, figsize=(8, 8))
        plt.clf()
        ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

        plt.cla()
        est.fit(X)
        
        labels = est.labels_
        centroids = est.cluster_centers_
        
        sil=silhouette_score(X, labels, metric=metric)
        print ("silhouette of " + name+"=" +str(sil))
        if sil>minsil:
            bestlabels=labels
            minsil=sil
            numOfCluster=name

        colormap = plt.cm.gist_ncar  # nipy_spectral, Set1,Paired
        colorst = [colormap(i) for i in np.linspace(0, 0.9, int(name))]
        colors=[]
        for i in labels:
            colors.append(colorst[i])

        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=colors)  #X[:,3] means all rows column=3 ''''''c=labels.astype(np.float)

        ax.w_xaxis.set_ticklabels([])
        ax.w_yaxis.set_ticklabels([])
        ax.w_zaxis.set_ticklabels([])
        ax.set_xlabel(name)
        fignum = fignum + 1
        plt.title("%s k:%s" % (figName, name)) 
        plt.show()

   
    print ("Best silhouette =" + str(minsil))

    return numOfCluster,bestlabels

In [8]:
def analyze(X, figName='', metric='euclidean', show=False):
#     range_n_clusters = [2, 3, 4, 5, 6, 8, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200]
    range_n_clusters =  [30, 40, 50, 60, 70]
    
    best_k = 0
    min_score = 0
    bestlabels=[]

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(12, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
        
        if silhouette_avg > min_score:
            bestlabels = cluster_labels
            min_score=silhouette_avg
            best_k=n_clusters

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("clusters silhouette plot")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        # plt.cm.gist_ncar or nipy_spectral        
        colors = cm.spectral(cluster_labels.astype(float) / n_clusters)

        ax2.scatter(X[:, 0], X[:, 1], X[:, 2], c=colors)
        
        ax2.set_xlabel(n_clusters)

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')

        ax2.set_title("clustere %s k:%s" % (figName, n_clusters))
#         ax2.set_xlabel("Feature space for the 1st feature")
#         ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on topic space "
                      "with n_clusters = %d" % n_clusters), fontsize=12, fontweight='bold')
    
    
        # Interactive figure
        if (show):
            fig2 = plt.figure(figsize=(8, 8))
            ax3 = Axes3D(fig2, rect=[0, 0, .95, 1], elev=48, azim=134)
            ax3.scatter(X[:, 0], X[:, 1], X[:, 2], c=colors)
            ax3.set_xlabel(n_clusters)
            ax3.w_xaxis.set_ticklabels([])
            ax3.w_yaxis.set_ticklabels([])
            ax3.w_zaxis.set_ticklabels([])
            ax3.set_title("%s cluster k:%s" % (figName, n_clusters))
        
        plt.show()
 
    return best_k, bestlabels

In [9]:
def getCorpus():
    corpus = df.iloc[:,3:].values.tolist()
    return corpus

TSNE experiments


In [10]:
%%time

corpus = getCorpus()

topic_w2v = np.array([document_vector(word2vec_model, doc) for doc in corpus])

X_embedded = TSNE(n_components=3, init='pca', verbose=2).fit_transform(topic_w2v)
# print(topic_w2v[0])

# n_components =20
# print("number of components: {}".format(n_components))
# X_reduced = TruncatedSVD(n_components=n_components, random_state=42).fit_transform(topic_w2v)
# print(X_reduced.shape)


[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 1546
[t-SNE] Computed conditional probabilities for sample 1546 / 1546
[t-SNE] Mean sigma: 0.099600
[t-SNE] Iteration 25: error = 1.4189861, gradient norm = 0.0218827
[t-SNE] Iteration 50: error = 1.3233318, gradient norm = 0.0182941
[t-SNE] Iteration 75: error = 1.0233327, gradient norm = 0.0080291
[t-SNE] Iteration 100: error = 0.9868832, gradient norm = 0.0094304
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.986883
[t-SNE] Iteration 125: error = 0.9117515, gradient norm = 0.0050173
[t-SNE] Iteration 150: error = 0.9109656, gradient norm = 0.0068414
[t-SNE] Iteration 175: error = 0.9053628, gradient norm = 0.0075644
[t-SNE] Iteration 200: error = 0.8981646, gradient norm = 0.0078038
[t-SNE] Iteration 225: error = 0.8905883, gradient norm = 0.0084922
[t-SNE] Iteration 250: error = 0.8864372, gradient norm = 0.0092010
[t-SNE] Iteration 275: error = 0.8732015, gradient norm = 0.0088293
[t-SNE] Iteration 300: error = 0.8533086, gradient norm = 0.0089430
[t-SNE] Iteration 325: error = 0.8383408, gradient norm = 0.0090336
[t-SNE] Iteration 350: error = 0.8169093, gradient norm = 0.0086195
[t-SNE] Iteration 375: error = 0.7946079, gradient norm = 0.0082852
[t-SNE] Iteration 400: error = 0.7690800, gradient norm = 0.0077375
[t-SNE] Iteration 425: error = 0.7541503, gradient norm = 0.0076468
[t-SNE] Iteration 450: error = 0.7454948, gradient norm = 0.0077390
[t-SNE] Iteration 475: error = 0.7299900, gradient norm = 0.0076015
[t-SNE] Iteration 500: error = 0.7068597, gradient norm = 0.0073691
[t-SNE] Iteration 525: error = 0.6942506, gradient norm = 0.0073081
[t-SNE] Iteration 550: error = 0.6771215, gradient norm = 0.0070034
[t-SNE] Iteration 575: error = 0.6629848, gradient norm = 0.0066949
[t-SNE] Iteration 600: error = 0.6483309, gradient norm = 0.0067359
[t-SNE] Iteration 625: error = 0.6380474, gradient norm = 0.0062304
[t-SNE] Iteration 650: error = 0.6385837, gradient norm = 0.0066448
[t-SNE] Iteration 675: error = 0.6283131, gradient norm = 0.0063935
[t-SNE] Iteration 700: error = 0.6085238, gradient norm = 0.0062830
[t-SNE] Iteration 725: error = 0.6018202, gradient norm = 0.0060179
[t-SNE] Iteration 750: error = 0.5928153, gradient norm = 0.0058105
[t-SNE] Iteration 775: error = 0.5835083, gradient norm = 0.0054687
[t-SNE] Iteration 800: error = 0.5855131, gradient norm = 0.0055564
[t-SNE] Iteration 825: error = 0.5666971, gradient norm = 0.0050397
[t-SNE] Iteration 850: error = 0.5644496, gradient norm = 0.0053086
[t-SNE] Iteration 875: error = 0.5799002, gradient norm = 0.0058943
[t-SNE] Iteration 900: error = 0.5560875, gradient norm = 0.0052589
[t-SNE] Iteration 925: error = 0.5507349, gradient norm = 0.0051035
[t-SNE] Iteration 950: error = 0.5548517, gradient norm = 0.0050829
[t-SNE] Iteration 975: error = 0.5434975, gradient norm = 0.0048078
[t-SNE] Iteration 1000: error = 0.5361019, gradient norm = 0.0048077
[t-SNE] Error after 1000 iterations: 0.986883
CPU times: user 1min 22s, sys: 1.94 s, total: 1min 24s
Wall time: 1min 23s

In [11]:
%%time


# No scaling
# numofClusters, bestlabels = kmeans(X_embedded)
best_k, bestlabels = analyze(X_embedded, figName="W2V")
print('best number of clusters: %s' % best_k)
df['label_w2v_no'] = bestlabels
display(df.head())


For n_clusters = 30 The average silhouette_score is : 0.340386747146
/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor
For n_clusters = 40 The average silhouette_score is : 0.355603028038
For n_clusters = 50 The average silhouette_score is : 0.364453787019
For n_clusters = 60 The average silhouette_score is : 0.342293280215
For n_clusters = 70 The average silhouette_score is : 0.341910846014
best number of clusters: 50
0 1 2 3 4 5 6 7 8 9 ... 22 23 24 25 26 27 28 29 30 label_w2v_no
0 tf_idf_tokenized_window_2012_11_01 js fjs id type href name rferl cssclass link_button ... parentnode insertbefore script russia 3a georgia gulnara_karimova document hah 36
1 tf_idf_tokenized_window_2012_11_02 syrian syria say turkey assad opposition rebels turkish rebel ... coalition foreign patriot president_bashar fighting month army activist people 20
2 tf_idf_tokenized_window_2012_11_03 shelling damascus suburbs regime daraa fierce neighborhood martyrs artillery ... demonstration area hama youtu warplane wound mortar due free 14
3 tf_idf_tokenized_window_2012_11_04 israel gaza israeli hamas palestinian palestinians arab gaza_strip say ... al jerusalem state egyptian tel_aviv un conflict netanyahu ceasefire 8
4 tf_idf_tokenized_window_2012_11_05 news com http www in killed iraq html world ... car breaking ie feeds pakistan 2012 google wound facebook_reddit 34

5 rows × 32 columns

CPU times: user 6.25 s, sys: 8.54 s, total: 14.8 s
Wall time: 4.92 s

In [12]:
%%time


# Scaled
X_embedded_scaled = preprocessing.scale(X_embedded)
best_k, bestlabels = analyze(X_embedded_scaled, figName="W2V scale")
print('best number of clusters: %s' % best_k)
df['label_w2v_scale'] = bestlabels
display(df.head())


For n_clusters = 30 The average silhouette_score is : 0.346398450368
/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor
For n_clusters = 40 The average silhouette_score is : 0.349881376175
For n_clusters = 50 The average silhouette_score is : 0.342778885706
For n_clusters = 60 The average silhouette_score is : 0.339568862722
For n_clusters = 70 The average silhouette_score is : 0.345787204445
best number of clusters: 40
0 1 2 3 4 5 6 7 8 9 ... 23 24 25 26 27 28 29 30 label_w2v_no label_w2v_scale
0 tf_idf_tokenized_window_2012_11_01 js fjs id type href name rferl cssclass link_button ... insertbefore script russia 3a georgia gulnara_karimova document hah 36 6
1 tf_idf_tokenized_window_2012_11_02 syrian syria say turkey assad opposition rebels turkish rebel ... foreign patriot president_bashar fighting month army activist people 20 3
2 tf_idf_tokenized_window_2012_11_03 shelling damascus suburbs regime daraa fierce neighborhood martyrs artillery ... area hama youtu warplane wound mortar due free 14 27
3 tf_idf_tokenized_window_2012_11_04 israel gaza israeli hamas palestinian palestinians arab gaza_strip say ... jerusalem state egyptian tel_aviv un conflict netanyahu ceasefire 8 19
4 tf_idf_tokenized_window_2012_11_05 news com http www in killed iraq html world ... breaking ie feeds pakistan 2012 google wound facebook_reddit 34 14

5 rows × 33 columns

CPU times: user 6.7 s, sys: 9 s, total: 15.7 s
Wall time: 5.11 s

In [13]:
%%time


# Normalized 
X_normalized = preprocessing.normalize(X_embedded, norm='l2')
best_k, bestlabels = analyze(X_normalized, figName="W2V normalize")
print('best number of clusters: %s' % best_k)
df['label_w2v_normalize'] = bestlabels
display(df.head())


For n_clusters = 30 The average silhouette_score is : 0.425847357464
/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor
For n_clusters = 40 The average silhouette_score is : 0.425286207796
For n_clusters = 50 The average silhouette_score is : 0.392950242071
For n_clusters = 60 The average silhouette_score is : 0.382804062511
For n_clusters = 70 The average silhouette_score is : 0.372795617088
best number of clusters: 30
0 1 2 3 4 5 6 7 8 9 ... 24 25 26 27 28 29 30 label_w2v_no label_w2v_scale label_w2v_normalize
0 tf_idf_tokenized_window_2012_11_01 js fjs id type href name rferl cssclass link_button ... script russia 3a georgia gulnara_karimova document hah 36 6 25
1 tf_idf_tokenized_window_2012_11_02 syrian syria say turkey assad opposition rebels turkish rebel ... patriot president_bashar fighting month army activist people 20 3 20
2 tf_idf_tokenized_window_2012_11_03 shelling damascus suburbs regime daraa fierce neighborhood martyrs artillery ... hama youtu warplane wound mortar due free 14 27 21
3 tf_idf_tokenized_window_2012_11_04 israel gaza israeli hamas palestinian palestinians arab gaza_strip say ... state egyptian tel_aviv un conflict netanyahu ceasefire 8 19 7
4 tf_idf_tokenized_window_2012_11_05 news com http www in killed iraq html world ... ie feeds pakistan 2012 google wound facebook_reddit 34 14 21

5 rows × 34 columns

CPU times: user 6.51 s, sys: 8.96 s, total: 15.5 s
Wall time: 5.07 s

Word Mover's Distance


In [14]:
X_wmd_distance_eos = pd.read_pickle('data/df_X_wmd_distance_eos.plk')
# print(X_wmd_distance_eos)

X_wmd_distance_eos = TSNE(n_components=3, init='pca', verbose=2).fit_transform(X_wmd_distance_eos)


[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 1546
[t-SNE] Computed conditional probabilities for sample 1546 / 1546
[t-SNE] Mean sigma: 1.001407
[t-SNE] Iteration 25: error = 1.4322207, gradient norm = 0.0227097
[t-SNE] Iteration 50: error = 1.2940395, gradient norm = 0.0184377
[t-SNE] Iteration 75: error = 1.0870367, gradient norm = 0.0099298
[t-SNE] Iteration 100: error = 1.0724896, gradient norm = 0.0119170
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.072490
[t-SNE] Iteration 125: error = 1.0104851, gradient norm = 0.0077116
[t-SNE] Iteration 150: error = 1.0190333, gradient norm = 0.0105304
[t-SNE] Iteration 175: error = 1.0037144, gradient norm = 0.0107932
[t-SNE] Iteration 200: error = 0.9737310, gradient norm = 0.0110103
[t-SNE] Iteration 225: error = 0.9447153, gradient norm = 0.0105912
[t-SNE] Iteration 250: error = 0.9243681, gradient norm = 0.0106318
[t-SNE] Iteration 275: error = 0.9146151, gradient norm = 0.0101901
[t-SNE] Iteration 300: error = 0.8847403, gradient norm = 0.0100836
[t-SNE] Iteration 325: error = 0.8630164, gradient norm = 0.0100205
[t-SNE] Iteration 350: error = 0.8355398, gradient norm = 0.0094951
[t-SNE] Iteration 375: error = 0.8184664, gradient norm = 0.0094181
[t-SNE] Iteration 400: error = 0.7997040, gradient norm = 0.0091967
[t-SNE] Iteration 425: error = 0.7847713, gradient norm = 0.0094029
[t-SNE] Iteration 450: error = 0.7702472, gradient norm = 0.0088543
[t-SNE] Iteration 475: error = 0.7543474, gradient norm = 0.0083614
[t-SNE] Iteration 500: error = 0.7352163, gradient norm = 0.0081374
[t-SNE] Iteration 525: error = 0.7285579, gradient norm = 0.0079570
[t-SNE] Iteration 550: error = 0.7338256, gradient norm = 0.0083008
[t-SNE] Iteration 575: error = 0.7094892, gradient norm = 0.0079181
[t-SNE] Iteration 600: error = 0.7088695, gradient norm = 0.0080201
[t-SNE] Iteration 625: error = 0.7071211, gradient norm = 0.0078997
[t-SNE] Iteration 650: error = 0.7068320, gradient norm = 0.0077355
[t-SNE] Iteration 675: error = 0.6964190, gradient norm = 0.0074916
[t-SNE] Iteration 700: error = 0.6924484, gradient norm = 0.0075237
[t-SNE] Iteration 725: error = 0.6896589, gradient norm = 0.0072161
[t-SNE] Iteration 750: error = 0.6862370, gradient norm = 0.0072295
[t-SNE] Iteration 775: error = 0.6779082, gradient norm = 0.0074539
[t-SNE] Iteration 800: error = 0.6771815, gradient norm = 0.0072414
[t-SNE] Iteration 825: error = 0.6755974, gradient norm = 0.0071828
[t-SNE] Iteration 850: error = 0.6778833, gradient norm = 0.0073407
[t-SNE] Iteration 875: error = 0.6716080, gradient norm = 0.0075684
[t-SNE] Iteration 900: error = 0.6623158, gradient norm = 0.0072712
[t-SNE] Iteration 925: error = 0.6603461, gradient norm = 0.0076386
[t-SNE] Iteration 950: error = 0.6507266, gradient norm = 0.0071244
[t-SNE] Iteration 975: error = 0.6522916, gradient norm = 0.0073010
[t-SNE] Iteration 1000: error = 0.6480749, gradient norm = 0.0071423
[t-SNE] Error after 1000 iterations: 1.072490

In [15]:
# Pairewise distance
best_k, bestlabels = analyze(X_wmd_distance_eos, figName="WMD distance")
print('best number of clusters: %s' % best_k)
df['label_wmd_distance'] = bestlabels

display(df.head())


For n_clusters = 30 The average silhouette_score is : 0.359614723431
/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor
For n_clusters = 40 The average silhouette_score is : 0.369713484049
For n_clusters = 50 The average silhouette_score is : 0.374482685379
For n_clusters = 60 The average silhouette_score is : 0.374255032651
For n_clusters = 70 The average silhouette_score is : 0.368823915341
best number of clusters: 50
0 1 2 3 4 5 6 7 8 9 ... 25 26 27 28 29 30 label_w2v_no label_w2v_scale label_w2v_normalize label_wmd_distance
0 tf_idf_tokenized_window_2012_11_01 js fjs id type href name rferl cssclass link_button ... russia 3a georgia gulnara_karimova document hah 36 6 25 25
1 tf_idf_tokenized_window_2012_11_02 syrian syria say turkey assad opposition rebels turkish rebel ... president_bashar fighting month army activist people 20 3 20 35
2 tf_idf_tokenized_window_2012_11_03 shelling damascus suburbs regime daraa fierce neighborhood martyrs artillery ... youtu warplane wound mortar due free 14 27 21 26
3 tf_idf_tokenized_window_2012_11_04 israel gaza israeli hamas palestinian palestinians arab gaza_strip say ... egyptian tel_aviv un conflict netanyahu ceasefire 8 19 7 11
4 tf_idf_tokenized_window_2012_11_05 news com http www in killed iraq html world ... feeds pakistan 2012 google wound facebook_reddit 34 14 21 2

5 rows × 35 columns


In [16]:
display(df.loc[df['label_wmd_distance'] == 5].head())


0 1 2 3 4 5 6 7 8 9 ... 25 26 27 28 29 30 label_w2v_no label_w2v_scale label_w2v_normalize label_wmd_distance
274 tf_idf_tokenized_window_2016_12_17 putin russia russian say military moscow nato nuclear shoigu ... strong crimea threat year aggressor peace 3 17 13 5
317 tf_idf_tokenized_window_2012_06_06 turkish turkey plane syrian jet syria airspace say nato ... fly warplane response meeting recep_tayyip violate 17 10 29 5
322 tf_idf_tokenized_window_2012_06_11 russia helicopter syria russian ship say moscow lavrov arm ... vessel repair conflict port british stop 17 10 7 5
375 tf_idf_tokenized_window_2016_04_14 russian russia putin moscow ukraine military kremlin syria aircraft ... netanyahu syrian official coordination washington ship 17 10 29 5
413 tf_idf_tokenized_window_2016_09_30 russia putin russian moscow ukraine kremlin syria assad president_vladimir ... vote relation united_states accuse may print 3 17 13 5

5 rows × 35 columns


In [17]:
# Normalized 
X_wmd_normalized = preprocessing.normalize(X_wmd_distance_eos, norm='l2')

pca = PCA(n_components=3)
pca.fit(X_wmd_normalized)
X_wmd_normalized = pca.transform(X_wmd_normalized)

best_k, bestlabels = analyze(X_wmd_normalized,  figName="WMD normalized")
print('best number of clusters: %s' % best_k)
df['label_wmd_normalize'] = bestlabels


/usr/local/lib/python3.5/dist-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
For n_clusters = 30 The average silhouette_score is : 0.426789669373
/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor
For n_clusters = 40 The average silhouette_score is : 0.421204624843
For n_clusters = 50 The average silhouette_score is : 0.425255270425
For n_clusters = 60 The average silhouette_score is : 0.409730192013
For n_clusters = 70 The average silhouette_score is : 0.411370124992
best number of clusters: 30

In [18]:
# Scaled
X_wmd_scaled = preprocessing.scale(X_wmd_distance_eos)

pca = PCA(n_components=3)
pca.fit(X_wmd_scaled)
X_wmd_scaled = pca.transform(X_wmd_scaled)

best_k, bestlabels = analyze(X_wmd_scaled, figName="WMD scaled")
df['label_wmd_scale'] = best_k
display(df.head())


/usr/local/lib/python3.5/dist-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
For n_clusters = 30 The average silhouette_score is : 0.356698250569
/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor
For n_clusters = 40 The average silhouette_score is : 0.373808434184
For n_clusters = 50 The average silhouette_score is : 0.378579693242
For n_clusters = 60 The average silhouette_score is : 0.382404378202
For n_clusters = 70 The average silhouette_score is : 0.377502397647
0 1 2 3 4 5 6 7 8 9 ... 27 28 29 30 label_w2v_no label_w2v_scale label_w2v_normalize label_wmd_distance label_wmd_normalize label_wmd_scale
0 tf_idf_tokenized_window_2012_11_01 js fjs id type href name rferl cssclass link_button ... georgia gulnara_karimova document hah 36 6 25 25 16 60
1 tf_idf_tokenized_window_2012_11_02 syrian syria say turkey assad opposition rebels turkish rebel ... month army activist people 20 3 20 35 6 60
2 tf_idf_tokenized_window_2012_11_03 shelling damascus suburbs regime daraa fierce neighborhood martyrs artillery ... wound mortar due free 14 27 21 26 7 60
3 tf_idf_tokenized_window_2012_11_04 israel gaza israeli hamas palestinian palestinians arab gaza_strip say ... un conflict netanyahu ceasefire 8 19 7 11 2 60
4 tf_idf_tokenized_window_2012_11_05 news com http www in killed iraq html world ... 2012 google wound facebook_reddit 34 14 21 2 22 60

5 rows × 37 columns

TF-IDF


In [19]:
%%time 


corpus_all  = []

for corpus_line in getCorpus():
    corpus_all.append(u' '.join(str(e) for e in corpus_line))
    
print(corpus_all[0])
    
vectorizer = CountVectorizer(max_df=0.5, min_df=5)
# vectorizer = CountVectorizer(max_df=0.5)
X_tfidf = vectorizer.fit_transform(corpus_all)  # a sparse matrix

vocab = vectorizer.get_feature_names()  # a list

print(len(vocab))

print(X_tfidf.shape)


id type href name rferl cssclass link_button 2f function facebook target var twitter russian true getelementbyid getelementsbytagname src createelement parentnode insertbefore script russia 3a georgia gulnara_karimova document hah 36 6 25 25 16 60
1614
(1546, 1614)
CPU times: user 68 ms, sys: 28 ms, total: 96 ms
Wall time: 68.7 ms

LSA


In [20]:
%%time


# Normalized 
X_tfidf_normalized = preprocessing.normalize(X_tfidf, norm='l2')

# LSA
X_tfidf_normalized = decomposition.TruncatedSVD(n_components=50).fit_transform(X_tfidf_normalized)

X_tfidf_normalized = TSNE(n_components=3, init='pca', verbose=2).fit_transform(X_tfidf_normalized)

best_k, bestlabels = analyze(X_tfidf_normalized, figName="TF-IDF normalized")
print('best number of clusters: %s' % best_k)
df['label_tfidf_normalize'] = bestlabels
display(df.head())


[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype int64 was converted to float64 by the normalize function.
  warnings.warn(msg, _DataConversionWarning)
[t-SNE] Computed conditional probabilities for sample 1000 / 1546
[t-SNE] Computed conditional probabilities for sample 1546 / 1546
[t-SNE] Mean sigma: 0.247049
[t-SNE] Iteration 25: error = 1.3445078, gradient norm = 0.0219273
[t-SNE] Iteration 50: error = 1.2575742, gradient norm = 0.0188620
[t-SNE] Iteration 75: error = 1.0258642, gradient norm = 0.0092552
[t-SNE] Iteration 100: error = 0.9953162, gradient norm = 0.0115447
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.995316
[t-SNE] Iteration 125: error = 0.9432883, gradient norm = 0.0071119
[t-SNE] Iteration 150: error = 0.9443769, gradient norm = 0.0099680
[t-SNE] Iteration 175: error = 0.9219033, gradient norm = 0.0104395
[t-SNE] Iteration 200: error = 0.8945036, gradient norm = 0.0108962
[t-SNE] Iteration 225: error = 0.8704006, gradient norm = 0.0107456
[t-SNE] Iteration 250: error = 0.8362591, gradient norm = 0.0104092
[t-SNE] Iteration 275: error = 0.8115819, gradient norm = 0.0100990
[t-SNE] Iteration 300: error = 0.7805374, gradient norm = 0.0094049
[t-SNE] Iteration 325: error = 0.7466424, gradient norm = 0.0091737
[t-SNE] Iteration 350: error = 0.7220446, gradient norm = 0.0088565
[t-SNE] Iteration 375: error = 0.6973040, gradient norm = 0.0083821
[t-SNE] Iteration 400: error = 0.6857279, gradient norm = 0.0084970
[t-SNE] Iteration 425: error = 0.6632810, gradient norm = 0.0082965
[t-SNE] Iteration 450: error = 0.6479629, gradient norm = 0.0081684
[t-SNE] Iteration 475: error = 0.6296813, gradient norm = 0.0077444
[t-SNE] Iteration 500: error = 0.6153987, gradient norm = 0.0079152
[t-SNE] Iteration 525: error = 0.5953552, gradient norm = 0.0076479
[t-SNE] Iteration 550: error = 0.5807781, gradient norm = 0.0075347
[t-SNE] Iteration 575: error = 0.5590311, gradient norm = 0.0067664
[t-SNE] Iteration 600: error = 0.5544318, gradient norm = 0.0071716
[t-SNE] Iteration 625: error = 0.5300968, gradient norm = 0.0064636
[t-SNE] Iteration 650: error = 0.5264320, gradient norm = 0.0063041
[t-SNE] Iteration 675: error = 0.5223910, gradient norm = 0.0070001
[t-SNE] Iteration 700: error = 0.5004690, gradient norm = 0.0061851
[t-SNE] Iteration 725: error = 0.4935896, gradient norm = 0.0061987
[t-SNE] Iteration 750: error = 0.4904751, gradient norm = 0.0061189
[t-SNE] Iteration 775: error = 0.4826149, gradient norm = 0.0060531
[t-SNE] Iteration 800: error = 0.4671809, gradient norm = 0.0057716
[t-SNE] Iteration 825: error = 0.4606906, gradient norm = 0.0054221
[t-SNE] Iteration 850: error = 0.4699987, gradient norm = 0.0055279
[t-SNE] Iteration 875: error = 0.4491466, gradient norm = 0.0051439
[t-SNE] Iteration 900: error = 0.4520867, gradient norm = 0.0050965
[t-SNE] Iteration 925: error = 0.4482745, gradient norm = 0.0048697
[t-SNE] Iteration 950: error = 0.4463399, gradient norm = 0.0049469
[t-SNE] Iteration 975: error = 0.4524499, gradient norm = 0.0048623
[t-SNE] Iteration 1000: error = 0.4521287, gradient norm = 0.0055128
[t-SNE] Iteration 1000: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 1000 iterations: 0.995316
/usr/local/lib/python3.5/dist-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
For n_clusters = 30 The average silhouette_score is : 0.415405135063
/usr/local/lib/python3.5/dist-packages/matplotlib/collections.py:865: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor
For n_clusters = 40 The average silhouette_score is : 0.425917020388
For n_clusters = 50 The average silhouette_score is : 0.437178245572
For n_clusters = 60 The average silhouette_score is : 0.431400422075
For n_clusters = 70 The average silhouette_score is : 0.408497924998
best number of clusters: 50
0 1 2 3 4 5 6 7 8 9 ... 28 29 30 label_w2v_no label_w2v_scale label_w2v_normalize label_wmd_distance label_wmd_normalize label_wmd_scale label_tfidf_normalize
0 tf_idf_tokenized_window_2012_11_01 js fjs id type href name rferl cssclass link_button ... gulnara_karimova document hah 36 6 25 25 16 60 45
1 tf_idf_tokenized_window_2012_11_02 syrian syria say turkey assad opposition rebels turkish rebel ... army activist people 20 3 20 35 6 60 15
2 tf_idf_tokenized_window_2012_11_03 shelling damascus suburbs regime daraa fierce neighborhood martyrs artillery ... mortar due free 14 27 21 26 7 60 43
3 tf_idf_tokenized_window_2012_11_04 israel gaza israeli hamas palestinian palestinians arab gaza_strip say ... conflict netanyahu ceasefire 8 19 7 11 2 60 10
4 tf_idf_tokenized_window_2012_11_05 news com http www in killed iraq html world ... google wound facebook_reddit 34 14 21 2 22 60 9

5 rows × 38 columns

CPU times: user 1min 34s, sys: 11.6 s, total: 1min 46s
Wall time: 1min 34s

In [21]:
display(df.loc[df['label_tfidf_normalize'] == 7])


0 1 2 3 4 5 6 7 8 9 ... 28 29 30 label_w2v_no label_w2v_scale label_w2v_normalize label_wmd_distance label_wmd_normalize label_wmd_scale label_tfidf_normalize
43 tf_idf_tokenized_window_2013_03_34 afghanistan military soldiers afghan violent combat say study smith ... crime australian man 24 33 17 0 27 60 7
82 tf_idf_tokenized_window_2012_03_13 afghan afghanistan soldier us bales say taliban base kandahar ... killings child war 24 33 17 0 27 60 7
206 tf_idf_tokenized_window_2013_11_09 karzai afghan afghanistan troop jirga sign pact loya say ... end delegate washington 24 33 17 0 27 60 7
229 tf_idf_tokenized_window_2013_11_32 us obama military washington american war policy iraq president_barack ... aid secretary department 24 33 17 22 9 60 7
256 tf_idf_tokenized_window_2012_04_25 afghanistan afghan us pakistan nato kabul obama taliban bounty ... suspend operations will 24 33 17 0 27 60 7
285 tf_idf_tokenized_window_2016_12_28 mattis general defense secretary military retire kelly marine command ... marshall job trump 24 33 17 12 3 60 7
447 tf_idf_tokenized_window_2012_01_20 us military obama afghanistan china defense strategy pentagon say ... wars pakistan president_barack 24 33 17 0 27 60 7
520 tf_idf_tokenized_window_2017_06_35 afghanistan taliban afghan tora_bora pakistan troop kabul bin_laden ghani ... sajjan group soldiers 24 33 17 0 27 60 7
592 tf_idf_tokenized_window_2012_02_23 us afghanistan afghan pakistan taliban nato military kabul obama ... drone breaking ie 24 33 17 0 27 60 7
595 tf_idf_tokenized_window_2014_08_02 obama say military us white_house iraq syria united_states strategy ... secretary threat protect 24 33 17 0 27 60 7
667 tf_idf_tokenized_window_2012_05_08 afghanistan afghan obama taliban troop war kabul us forces ... american say withdrawal 24 33 17 0 27 60 7
737 tf_idf_tokenized_window_2014_06_04 us iraq obama say military iraqi troop kerry washington ... official air_strike country 24 33 17 0 27 60 7
757 tf_idf_tokenized_window_2014_09_02 obama will say islamic_state president white_house congress american strategy ... house action war 24 33 17 0 27 60 7
846 tf_idf_tokenized_window_2016_07_29 afghanistan afghan kabul taliban hazara obama troop attack say ... ite 80 line 24 33 17 0 27 60 7
859 tf_idf_tokenized_window_2013_05_06 obama yemen drone drone_strike guantanamo detainee say al_qaida president ... americans speech president_barack 24 33 17 22 9 60 7
889 tf_idf_tokenized_window_2015_12_14 taliban mansour afghan mullah afghanistan pakistan akhtar kabul leader ... talk confirm longtime 24 33 17 0 27 60 7
894 tf_idf_tokenized_window_2015_12_19 obama president_barack oval_office threat san_bernardino california islamic_state americans white_house ... america group paris 35 33 17 32 27 60 7
990 tf_idf_tokenized_window_2016_06_23 taliban afghanistan afghan pakistan kabul mansour drone_strike mullah us ... last al_qaeda pakistani 24 33 17 0 27 60 7
1011 tf_idf_tokenized_window_2015_11_12 obama troop special deployment us ground syria president_barack forces ... broad military monday 24 33 17 0 27 60 7
1038 tf_idf_tokenized_window_2014_05_13 obama us afghanistan jarba syrian washington coalition military white_house ... non_lethal will security 24 33 17 0 27 60 7
1183 tf_idf_tokenized_window_2017_03_08 us trump mr donald_trump president military pentagon troop plan ... afghanistan deploy force 24 33 17 22 9 60 7
1223 tf_idf_tokenized_window_2014_02_20 hagel budget military pentagon army defense spending plan say ... size world billion 44 33 17 0 27 60 7
1237 tf_idf_tokenized_window_2014_02_34 afghan taliban afghanistan karzai pakistan release bagram us troop ... statement peace jail 24 33 17 0 27 60 7
1311 tf_idf_tokenized_window_2016_05_04 taliban mansour pakistan afghan afghanistan mullah leader drone_strike kabul ... deputy movement talk 24 33 17 0 27 60 7
1340 tf_idf_tokenized_window_2013_10_05 us afghanistan afghan karzai kabul pakistan troop say military ... 000 2014 will 24 33 17 0 27 60 7
1403 tf_idf_tokenized_window_2016_02_16 taliban afghanistan afghan fraser kabul pakistan regime say troop ... face go problem 24 33 17 0 27 60 7
1513 tf_idf_tokenized_window_2012_09_18 afghan afghanistan taliban prison bagram kabul transfer troop detainees ... facility forces inmate 24 33 17 0 27 60 7

27 rows × 38 columns


In [22]:
df.to_csv('data/windowbin/csv/result_all_dynamic_topic.csv')

In [ ]:


In [ ]: