We now need to compute a TF-IDF vector for each document. Since any single document contains only a small fraction of the full vocabulary, these vectors will be sparse.
Our first job is to collect the paths of all the text files in the corpus.
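For reference, the weighting scheme used below (scikit-learn's TfidfVectorizer with default settings) assigns term $t$ in document $d$ the weight $$ \mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \left( \ln\frac{1 + N}{1 + \mathrm{df}(t)} + 1 \right), $$ where $\mathrm{tf}(t, d)$ is the raw count of $t$ in $d$, $N$ is the number of documents, and $\mathrm{df}(t)$ is the number of documents containing $t$; each document vector is then normalized to unit Euclidean length, which keeps long and short documents comparable.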
In [1]:
import os
from glob import glob
result = [y for x in os.walk("../raw/processed") for y in glob(os.path.join(x[0], '*.txt'))]
print(result[0])
In [2]:
# remove all files called "logFechamento.txt"
list_of_files = [item for item in result if not item.endswith("logFechamento.txt")]
print(list_of_files[0])
In [3]:
# The text files are recorded with arbitrary encoding, so we need to cater for that.
import io
def read_hostile_text(path_txt):
    encodings = [
        'utf-8',
        'latin_1',
        'utf_16',
        'cp1250',
    ]
    for encoding in encodings:
        file = io.open(path_txt, "r", encoding=encoding)
        try:
            text = file.read()
            file.close()
            return text
        except UnicodeDecodeError:
            file.close()
    print('Could not decode', path_txt)
    return None
In [4]:
text = read_hostile_text(list_of_files[0])
print(text)
In [5]:
# For each file in that list, extract the document and put it in the documents list.
documents = []
for filename in list_of_files:
    text = read_hostile_text(filename)
    if text is not None:  # skip files that could not be decoded with any known encoding
        documents.append(text)
The documents are quite polluted with numbers, which would end up being interpreted as tokens even though they add no real semantic value. So we'll filter those out.
In [65]:
def token_is_valid(token):
    return (not token.isdigit()) and ('__' not in token) and ('((' not in token) and ('\n' not in token)
In [66]:
# This assumes that the separator is a space (' '), which will not always be the case...
# TODO deal with colons, semicolons, \x96, brackets
# TODO deal with the special case where a word is spelled with spaces between each letter (e.g. 'D E C R E T A')
documents = [' '.join(token for token in document.split(' ') if token_is_valid(token)) for document in documents]
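One possible way to address the first TODO is sketched below: splitting on whitespace and common punctuation (including the \x96 character mentioned above) instead of a single space. This is only an assumption about what works for this corpus; tokenize_rough and documents_alt are hypothetical names, not part of the original pipeline, and the 'D E C R E T A' case would still need separate handling.
In [ ]:
# Hypothetical alternative tokenization: split on whitespace and punctuation rather than a single space.
import re
def tokenize_rough(document):
    return [token for token in re.split(r'[\s:;,\.\(\)\[\]\x96]+', document) if token]
documents_alt = [' '.join(token for token in tokenize_rough(document) if token_is_valid(token))
                 for document in documents]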
In [67]:
documents[0]
Out[67]:
In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
tfidf = vectorizer.fit_transform(documents)
In [69]:
vectorizer.vocabulary_
Out[69]:
Now we define the building blocks of the k-means algorithm.
In [70]:
import numpy as np
def get_initial_centroids(data, k, seed=None):
    '''Randomly choose k data points as initial centroids'''
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    n = data.shape[0]  # number of data points
    # Pick K indices from range [0, N).
    rand_indices = np.random.randint(0, n, k)
    # Keep centroids as dense format, as many entries will be nonzero due to averaging.
    # As long as at least one document in a cluster contains a word,
    # it will carry a nonzero weight in the TF-IDF vector of the centroid.
    centroids = data[rand_indices, :].toarray()
    return centroids
After initialization, the k-means algorithm alternates between the following two steps until the assignments stabilize: (1) assign each data point to its nearest centroid, $z_i = \arg\min_j \|\mathbf{x}_i - \mu_j\|^2$; (2) recompute each centroid as the mean of the data points assigned to it, $\mu_j = \frac{1}{|\{i : z_i = j\}|} \sum_{i : z_i = j} \mathbf{x}_i$.
In [71]:
from sklearn.metrics import pairwise_distances
def assign_clusters(data, centroids):
    # Compute distances between each data point and the set of centroids:
    distances_from_centroids = pairwise_distances(data, centroids, metric='euclidean')
    # Compute cluster assignments for each data point:
    cluster_assignment = np.argmin(distances_from_centroids, axis=1)
    return cluster_assignment
In [72]:
def revise_centroids(data, k, cluster_assignment):
    new_centroids = []
    for i in range(k):
        # Select all data points that belong to cluster i
        member_data_points = data[cluster_assignment==i, :]
        # Compute the mean of those data points
        centroid = member_data_points.mean(axis=0)
        # Convert numpy.matrix type to numpy.ndarray type
        centroid = centroid.A1
        new_centroids.append(centroid)
    new_centroids = np.array(new_centroids)
    return new_centroids
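As a quick sanity check of these two steps (a hypothetical toy example, unrelated to the corpus), three 2-D points split into two clusters as expected:
In [ ]:
# Hypothetical sanity check on three toy points in 2-D
from scipy.sparse import csr_matrix
toy_data = csr_matrix([[0.0, 0.0], [0.2, 0.0], [5.0, 5.0]])
toy_centroids = np.array([[0.0, 0.0], [5.0, 5.0]])
toy_assignment = assign_clusters(toy_data, toy_centroids)   # expected: array([0, 0, 1])
revise_centroids(toy_data, 2, toy_assignment)                # expected: array([[0.1, 0. ], [5. , 5. ]])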
We need to assess whether the k-means algorithm is converging. We can look at the cluster assignments and see whether they stabilize over time; in fact, we'll be running the algorithm until the cluster assignments stop changing at all. To be extra safe, and to assess the clustering quality, we'll also track an additional criterion: the sum of all squared distances between data points and their assigned centroids. This is defined as $$ J(\mathcal{Z},\mu) = \sum_{j=1}^k \sum_{i:z_i = j} \|\mathbf{x}_i - \mu_j\|^2. $$ The smaller these distances, the more homogeneous the clusters. In other words, we'd like to have "tight" clusters.
In [73]:
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    heterogeneity = 0.0
    for i in range(k):
        # Select all data points that belong to cluster i
        member_data_points = data[cluster_assignment==i, :]
        if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
            # Compute distances from the centroid to its data points
            distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
            squared_distances = distances**2
            heterogeneity += np.sum(squared_distances)
    return heterogeneity
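Continuing the hypothetical toy example from above, the two points in cluster 0 each lie 0.1 from their revised centroid and the third point coincides with its centroid, so the heterogeneity should be $0.1^2 + 0.1^2 + 0 = 0.02$:
In [ ]:
toy_revised = revise_centroids(toy_data, 2, toy_assignment)
compute_heterogeneity(toy_data, 2, toy_revised, toy_assignment)   # expected: 0.02 (up to floating point)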
In [74]:
def kmeans(data, k, initial_centroids, maxiter, record_heterogeneity=None, verbose=False):
    '''This function runs k-means on given data and initial set of centroids.
       maxiter: maximum number of iterations to run.
       record_heterogeneity: (optional) a list, to store the history of heterogeneity as function of iterations
                             if None, do not store the history.
       verbose: if True, print how many data points changed their cluster labels in each iteration'''
    centroids = initial_centroids[:]
    prev_cluster_assignment = None
    for itr in range(maxiter):
        if verbose:
            print(itr)
        # 1. Make cluster assignments using nearest centroids
        cluster_assignment = assign_clusters(data, centroids)
        # 2. Compute a new centroid for each of the k clusters, averaging all data points assigned to that cluster.
        centroids = revise_centroids(data, k, cluster_assignment)
        # Check for convergence: if none of the assignments changed, stop
        if prev_cluster_assignment is not None and \
           (prev_cluster_assignment==cluster_assignment).all():
            break
        # Print number of new assignments
        if prev_cluster_assignment is not None:
            num_changed = np.sum(prev_cluster_assignment!=cluster_assignment)
            if verbose:
                print(' {0:5d} elements changed their cluster assignment.'.format(num_changed))
        # Record heterogeneity convergence metric
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)
        prev_cluster_assignment = cluster_assignment[:]
    return centroids, cluster_assignment
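Since the outcome depends on the random initialization, a common refinement (sketched here with hypothetical names; the run below uses a single seed) is to try several seeds and keep the clustering with the lowest heterogeneity:
In [ ]:
# Hypothetical multi-restart wrapper: the choice of seeds is arbitrary.
def kmeans_multiple_runs(data, k, maxiter, seeds):
    best = None
    for seed in seeds:
        initial = get_initial_centroids(data, k, seed=seed)
        centroids, assignment = kmeans(data, k, initial, maxiter)
        score = compute_heterogeneity(data, k, centroids, assignment)
        if best is None or score < best[0]:
            best = (score, centroids, assignment)
    return best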
In [75]:
import matplotlib.pyplot as plt
def plot_heterogeneity(heterogeneity, k):
    plt.figure(figsize=(7,4))
    plt.plot(heterogeneity, linewidth=4)
    plt.xlabel('# Iterations')
    plt.ylabel('Heterogeneity')
    plt.title('Heterogeneity of clustering over time, K={0:d}'.format(k))
    plt.rcParams.update({'font.size': 16})
    plt.tight_layout()
Let's do a first run at clustering, with k = 5.
In [76]:
k = 5
heterogeneity = []
initial_centroids = get_initial_centroids(tfidf, k, seed=0)
centroids, cluster_assignment = kmeans(tfidf, k, initial_centroids, maxiter=400,
                                       record_heterogeneity=heterogeneity, verbose=True)
In [77]:
%matplotlib inline
plot_heterogeneity(heterogeneity, k)
In [78]:
centroids
Out[78]:
In [79]:
cluster_assignment
Out[79]:
In [59]:
documents[2]
Out[59]: