In [1]:
import numpy as np
from gensim import corpora, models
import gensim
import csv
from sklearn.externals import joblib
import bz2
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import stats
import pickle
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%pylab inline
# %matplotlib notebook
In [2]:
%%time
# Which preprocessed corpus to analyse.
# dataset = 'karbala'
dataset = 'all'

# Paths to the bigram dictionary and bag-of-words Matrix Market corpus
# produced by the preprocessing pipeline.
corpus_dictionary_file = 'data/eos/ngram/bigram_dict_'+dataset+'.dict'
corpus_doc2bow_file = 'data/eos/ngram/bigram_bow_corpus_'+dataset+'.mm'

# Bring both artefacts into memory (the loads are independent of each other).
dictionary = gensim.corpora.Dictionary.load(corpus_dictionary_file)
corpus = gensim.corpora.MmCorpus(corpus_doc2bow_file)
# ldamodel = joblib.load('data/eos/lda/16_LDAmodel_EOS.pkl')

# Show summary reprs of what was loaded.
print(corpus)
print(dictionary)
# print(ldamodel)
In [14]:
%%time
def get_colors(ldamodel, docs=None, out_path='data/eos/pca_color.p'):
    """Pick each document's single most probable topic id and pickle the list.

    Parameters
    ----------
    ldamodel : trained gensim LDA model; ``ldamodel[bow]`` must yield a list
        of ``(topic_id, probability)`` pairs.
    docs : iterable of bag-of-words documents; defaults to the module-level
        ``corpus`` (the original hard-coded behavior).
    out_path : destination file for the pickled list of topic ids.

    Returns
    -------
    list of int-like topic ids, one per document (also written to disk).
    """
    if docs is None:
        docs = corpus  # preserve original reliance on the global corpus
    logging.info("starting colors")
    color = []
    for corpus_line in docs:
        # max() returns the first maximum, matching the original stable
        # reverse-sort-then-take-first tie-breaking.
        top_topic = max(ldamodel[corpus_line], key=lambda x: x[1])
        color.append(top_topic[0])
    with open(out_path, 'wb') as fp:
        pickle.dump(color, fp)
    return color
In [15]:
%%time
def get_lda_out(ldamodel, docs=None, out_path='data/eos/pca_lda_output.p'):
    """Collect the full per-document topic distribution and pickle it.

    Parameters
    ----------
    ldamodel : trained gensim LDA model; ``ldamodel[bow]`` must yield a list
        of ``(topic_id, probability)`` pairs.
    docs : iterable of bag-of-words documents; defaults to the module-level
        ``corpus`` (the original hard-coded behavior).
    out_path : destination file for the pickled list of distributions.

    Returns
    -------
    list with one topic-distribution entry per document (also written to disk).
    """
    if docs is None:
        docs = corpus  # preserve original reliance on the global corpus
    lda_output = [ldamodel[line] for line in docs]
    with open(out_path, 'wb') as fp:
        pickle.dump(lda_output, fp)
    return lda_output
In [3]:
%%time
# Load a previously trained LDA model (20 topics, per the filename) with
# joblib and display its summary repr.
topic_model = 'data/eos/lda/20_LDAmodel_EOS.pkl'
ldamodel = joblib.load(topic_model)
print(ldamodel)
In [21]:
%%time
# Compute and cache to disk (under data/eos/) the per-document dominant-topic
# ids ("colors") and the full per-document topic distributions.
get_colors(ldamodel)
get_lda_out(ldamodel)
In [4]:
%%time
# Reload the cached dominant-topic colors and per-document topic
# distributions.  Context managers close the files (the original
# pickle.load(open(...)) pattern leaked the handles).
with open('data/eos/pca_color.p', 'rb') as fp:
    color = pickle.load(fp)
with open('data/eos/pca_lda_output.p', 'rb') as fp:
    lda_output = pickle.load(fp)
print(len(lda_output))

topic_num = 20
# Dense documents-by-topics matrix; each lda_output entry is a list of
# (topic_id, weight) pairs, and topics absent from a document stay 0.
topics_data = np.zeros(shape=(len(corpus), topic_num))
for i, line in enumerate(lda_output):
    for topic_id, weight in line:
        topics_data[i][topic_id] = weight
In [5]:
%%time
# 2-D PCA projection of the raw documents-by-topics matrix, one point per
# document, coloured by its dominant topic.  `figure` and `scatter` are bare
# names star-imported by the `%pylab inline` magic in the first cell.
X_pca = PCA().fit_transform(topics_data)
figure(num=None, figsize=(18, 11), dpi=80, facecolor='w', edgecolor='k')
scatter(X_pca[:, 0], X_pca[:, 1], c=color)
In [ ]:
%%time
# Same PCA projection, but with each topic column z-scored first so no single
# high-variance topic dominates the principal components.
topics_data_norm = stats.zscore(topics_data, axis=0)
X_pca = PCA().fit_transform(topics_data_norm )
# NOTE(review): dpi=800 here vs dpi=80 in the neighbouring plots produces a
# very large rendered figure — possible typo, confirm.
figure(num=None, figsize=(18, 11), dpi=800, facecolor='w', edgecolor='k')
scatter(X_pca[:, 0], X_pca[:, 1], c=color)
In [27]:
%%time
# Rebuild the documents-by-topics matrix, recolour each document by its
# dominant topic *after* z-scoring, and plot the PCA projection.
# NOTE(review): the matrix is allocated with twice as many rows as documents
# and only 10 topic columns (20 elsewhere in this notebook); the extra
# all-zero rows feed into the z-scores — confirm this padding is intentional.
topics_data = np.zeros(shape=(len(corpus) * 2, 10))
print(topics_data.shape)
print(len(lda_output))
for i, line in enumerate(lda_output):
    for topic_id, weight in line:
        topics_data[i][topic_id] = weight
topics_data_norm = stats.zscore(topics_data, axis=0)
print(topics_data_norm.shape)

# Dominant (highest z-score) topic per row.  np.argmax returns the index of
# the first maximum, matching the original first-match list-comprehension
# scan, but in a single O(k) pass instead of O(k^2) max() re-evaluations.
color = [int(np.argmax(row)) for row in topics_data_norm]

X_pca = PCA().fit_transform(topics_data_norm)
figure(num=None, figsize=(20, 12), dpi=80, facecolor='w', edgecolor='k')
scatter(X_pca[:, 0], X_pca[:, 1], c=color)
In [ ]:
%%time
# Density-based clustering of the z-scored topic matrix.
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=0.3, min_samples=10).fit(topics_data_norm)
labels = db.labels_

# Boolean mask marking DBSCAN core samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Noise points carry label -1 and do not count as a cluster.
n_clusters_ = len(set(labels) - {-1})
print('Estimated number of clusters: %d' % n_clusters_)

# The quality metrics below require ground-truth labels (labels_true), which
# this notebook does not have.
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"
#       % metrics.adjusted_rand_score(labels_true, labels))
# print("Adjusted Mutual Information: %0.3f"
#       % metrics.adjusted_mutual_info_score(labels_true, labels))
# print("Silhouette Coefficient: %0.3f"
#       % metrics.silhouette_score(topics_data_norm, labels))
In [ ]:
%%time
# K-means on the PCA-reduced topic matrix, visualised with a Voronoi-style
# decision-boundary mesh (adapted from the scikit-learn digits example).
from time import time
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

n_digits = 8  # number of k-means clusters (name kept from the sklearn example)

# Reduce to 2-D first so the cluster boundaries can be drawn as an image.
reduced_data = PCA(n_components=2).fit_transform(topics_data_norm)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)
print("finished kmeans clustering")

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary: assign a colour to every point of a fine mesh
# covering the data (with a margin of 1 on each side).
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)

# Corrected title: this notebook clusters LDA topic distributions, not the
# digits dataset the example code came from.
plt.title('K-means clustering on LDA topic distributions (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
In [ ]:
%%time
# Plot the DBSCAN clusters on the first two dimensions of the fitted data.
# Black is reserved for noise points.
# NOTE(review): the original indexed `X`, a name never defined in this
# notebook (NameError on a fresh run); DBSCAN was fit on topics_data_norm,
# so that array is used here — confirm this was the intent.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples of this cluster: large markers.
    xy = topics_data_norm[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Border (non-core) samples of this cluster: small markers.
    xy = topics_data_norm[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [ ]:
from sklearn.manifold import TSNE
# Sketch of a t-SNE projection of the topic distributions (not executed):
# - an angle value close to 1 sacrifices accuracy for speed (Barnes-Hut)
# - PCA initialization usually leads to better results
# tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
# doc_topic_distr = ldamodel.fit_transform()
# 20-D -> 2-D
# tsne_lda = tsne_model.fit_transform(doc_topic_distr)
In [ ]: