In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
Here, we load the serialized DataFrame saved in step 1.
In [2]:
# Load the sampled DataFrame that the previous step serialized to disk.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files produced by this project.
raw_input = pd.read_pickle('sample.pkl')
In [3]:
# Preview the first rows as a sanity check on the loaded frame.
raw_input.head()
Out[3]:
In [4]:
# Print the column dtypes and non-null counts of the loaded frame.
raw_input.info()
Now we convert each document into a vector of tf-idf features. Then we apply Non-negative Matrix Factorization (NMF) to transform the high-dimensional sparse feature space into a lower-dimensional dense feature space. Finally, we normalize the low-dimensional vectors by their L2-norm, so that the squared Euclidean distance between two feature vectors is a decreasing linear function of their cosine similarity: $\lVert u - v \rVert^2 = 2\,(1 - \cos(u, v))$.
In [5]:
import nltk
# Fetch the NLTK stop-word corpus so that `stopwords.words` can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words, kept as a set; `vsm_representation` uses them to filter tokens.
en_stopwords = set(stopwords.words('english'))
In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline
def vsm_representation(n_components, texts, return_error=False):
    """Return the vector space model (VSM) representation of `texts`.

    The documents are turned into tf-idf features, reduced to
    `n_components` dimensions with NMF and, unless `return_error` is set,
    L2-normalized row by row.

    Parameters
    ----------
    n_components : int
        Number of NMF components (dimensionality of the dense representation).
    texts : iterable of str
        The raw documents.
    return_error : bool, default False
        If True, only fit the tf-idf + NMF steps and return the fitted NMF's
        `reconstruction_err_` instead of the transformed features.

    Returns
    -------
    float or array of shape (n_documents, n_components)
        The NMF reconstruction error when `return_error` is True, otherwise
        the L2-normalized NMF features.
    """
    tfidf = TfidfVectorizer(
        lowercase=True, ngram_range=(1, 1), max_df=0.4, min_df=25,
        # scikit-learn only accepts 'english', a list or None here; recent
        # releases reject a set during parameter validation, so pass a list.
        stop_words=sorted(en_stopwords))
    nmf = NMF(n_components=n_components, init=None, solver='cd', random_state=1234, shuffle=True)
    steps = [('tfidf', tfidf), ('nmf', nmf)]

    if return_error:
        # No normalization step is needed when only the fit quality is wanted.
        pipeline = Pipeline(steps).fit(texts)
        return pipeline.named_steps['nmf'].reconstruction_err_

    steps.append(('norm', Normalizer(norm='l2')))
    return Pipeline(steps).fit_transform(texts)
Next we plot the NMF reconstruction error against the number of components to choose a target component size.
In [7]:
%%time
def plot_errors(texts):
    """Bar-plot the NMF reconstruction error for several candidate component counts.

    Returns the errors as a Series indexed by the number of components.
    """
    error_by_size = {}
    for size in range(5, 35, 5):
        error_by_size[size] = vsm_representation(size, texts, return_error=True)
    error_series = pd.Series(error_by_size, name='NMF reconstruction errors').sort_index()
    error_series.plot(kind='bar', title='NMF reconstruction errors vs. Component Size')
    return error_series


# Run the sweep on the sample and keep the numbers next to the notebook.
errors = plot_errors(raw_input.text)
errors.to_csv('nmf_rec_errors.csv')
Based on the plot above, I chose the component size as 20.
In [8]:
%%time
# Build the 20-component VSM of the sample (20 is the size picked from the reconstruction-error plot).
vsm_mat = vsm_representation(20, raw_input.text)
# Display the shape of the resulting matrix.
vsm_mat.shape
Next we visualize our VSM representation by projecting `vsm_mat` onto two dimensions with the t-SNE method.
In [9]:
%%time
from sklearn.manifold import TSNE
def plot_2d_representation(tf_mat, levels):
    """Use the t-SNE method to produce a 2D visualization of the vector space model.

    Parameters
    ----------
    tf_mat : array-like of shape (n_documents, n_features)
        The document representation to embed.
    levels : pandas.Series or array-like of length n_documents
        Ground-truth label of each document; used to colour the points.
    """
    visualizer = TSNE(n_components=2, perplexity=30.0, random_state=1024)
    tf_vis = visualizer.fit_transform(tf_mat)
    # Take the row index from `levels` itself rather than from the notebook-level
    # `raw_input`, so the function also works for other data sets and the labels
    # stay aligned with the embedded rows.
    row_index = levels.index if isinstance(levels, pd.Series) else None
    vis_df = pd.DataFrame(data=tf_vis, index=row_index, columns=['x1', 'x2'])
    # attach the ground truth labels so that we can assign a different colour to each label.
    vis_df = vis_df.assign(level=levels)
    # `height` is the current name of lmplot's former `size` argument.
    _ = sns.lmplot(x='x1', y='x2', hue='level', data=vis_df, fit_reg=False, height=10, aspect=1)


plot_2d_representation(vsm_mat, raw_input.level)
Next we build another helper function to fit a KMeans clustering on the vector space representation of our text documents.
In [10]:
from sklearn.cluster import MiniBatchKMeans
def cluster_vsm(n_clusters, vsm, batch_size=5000, return_labels=True):
    """Fit a ``MiniBatchKMeans`` model with `n_clusters` clusters on `vsm`.

    With `return_labels` set (the default) a copy of the per-row cluster
    assignment is returned; otherwise the fitted model's `inertia_`
    (sum of squared distances of the samples to their cluster centre)
    is returned.
    """
    kmeans = MiniBatchKMeans(
        n_clusters=n_clusters,
        batch_size=batch_size,
        compute_labels=True,
        random_state=1234,
        tol=1E-3,
    )
    kmeans.fit(vsm)
    if not return_labels:
        return kmeans.inertia_
    return kmeans.labels_.copy()
Next, we calculate and plot the inertia (the within-cluster sum of squared errors)
of the clustering for a range of candidate values for the number of clusters.
In [11]:
%%time
def calc_cluster_sse(vsm):
    """Compute and bar-plot the KMeans SSE for 2 to 30 clusters.

    Returns the SSE values as a Series indexed by the number of clusters.
    """
    sse_by_k = {}
    for k in range(2, 31):
        sse_by_k[k] = cluster_vsm(k, vsm, return_labels=False)
    sse = pd.Series(sse_by_k, name='elbow_plot').sort_index()
    _, axis = plt.subplots(1, 1, figsize=(10, 4))
    sse.plot(kind='bar', title='Cluster SSE vs Cluster Size (KMeans)', ax=axis)
    return sse


# Run the sweep on the sample VSM and keep the numbers next to the notebook.
cluster_sse = calc_cluster_sse(vsm_mat)
cluster_sse.to_csv('kmeans_sse.csv')
Based on the above, I chose 20 as the number of clusters.
In [12]:
# Extract the cluster assignments assuming 20 clusters
cluster_labels = cluster_vsm(20, vsm_mat)
# Convert to a Series that shares the row index of the input frame
cluster_labels = pd.Series(index=raw_input.index, data=cluster_labels, name='cluster_labels')
# Count the documents per cluster and persist the counts
label_counts = cluster_labels.value_counts()
label_counts.to_csv('label_counts_small.csv')
# `rot` is the tick-label rotation in degrees; it must be a number, because
# newer matplotlib releases raise on a non-numeric string such as '0'.
ax = label_counts.plot(kind='bar', rot=0, title='Number of documents per Cluster')
In [13]:
# Contingency table of ground-truth levels (rows) against cluster ids (columns).
ct = pd.crosstab(index=cluster_labels, columns=raw_input.level)
ct = ct.T
ct.to_csv('cluster_assignment_vs_labels_small.csv')
# Annotated heatmap of the same table.
fig, ax = plt.subplots(figsize=(14, 8))
ax = sns.heatmap(data=ct, annot=True, fmt='d', ax=ax)
Next we calculate the Adjusted Rand Score between the cluster assignments and the ground truth labels, which is a measure of how similar these two label assignments are.
In [14]:
from sklearn.metrics import adjusted_rand_score
# Agreement between the sample's cluster assignments and its ground-truth levels.
adjusted_rand_score(cluster_labels, raw_input.level)
Out[14]:
The adjusted rand score is close to 0.0 for random labeling independently of the number of clusters and samples and exactly 1.0 if they are identical. So this clustering doesn't seem to correspond to the ground truth labels.
In [15]:
%%time
# Repeat the VSM + clustering steps on the full data set with 20 components and 20 clusters.
n_components = 20
n_clusters = 20
raw_input_full = pd.read_pickle('input.pkl')
vsm_mat_full = vsm_representation(n_components=n_components, texts=raw_input_full.text)
cluster_labels_full = cluster_vsm(n_clusters=n_clusters, vsm=vsm_mat_full)
And then the cross-tabulation plot against ground-truth labels:
In [16]:
# Wrap the cluster assignments in a Series that shares the row index of the full frame
cluster_labels_full = pd.Series(data=cluster_labels_full, index=raw_input_full.index, name='cluster_labels')
# Count the documents per cluster and persist the counts
label_counts_full = cluster_labels_full.value_counts()
label_counts_full.to_csv('label_counts_all.csv')
# `rot` is the tick-label rotation in degrees; it must be a number, because
# newer matplotlib releases raise on a non-numeric string such as '0'.
ax = label_counts_full.plot(kind='bar', rot=0, title='Number of documents per Cluster')
# Cross-tabulate ground-truth levels (rows) against cluster ids (columns)
ct_full = pd.crosstab(cluster_labels_full, raw_input_full.level).T
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
ax = sns.heatmap(ct_full, annot=True, fmt='d', ax=ax)
ct_full.to_csv('cluster_assignment_vs_labels_all.csv')
In [17]:
# Agreement between the full data set's cluster assignments and its ground-truth levels.
adjusted_rand_score(cluster_labels_full, raw_input_full.level)
Out[17]: