Cluster article data and compute inertia as a function of cluster number
In [1]:
from sklearn import cluster
import pandas as pd
import numpy as np
import pickle
In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [6]:
num_topics = 20
# Per-document LDA topic probabilities plus the source and trust columns
with open('pub_probabs_topic' + str(num_topics) + '.pkl', 'rb') as f:
    doc_data = pickle.load(f)
lda_topics = ['topic' + str(i) for i in range(num_topics)]
cluster_dims = ['source', 'trust'] + lda_topics
cluster_data = doc_data[cluster_dims].values
In [7]:
# Inertia (the within-cluster sum of squares criterion) measures how
# internally coherent the clusters are: lower means tighter clusters
MAX_K = 10
ks = range(1, MAX_K + 1)
inertias = np.zeros(MAX_K)
for k in ks:
    kmeans = cluster.KMeans(n_clusters=k).fit(cluster_data)
    inertias[k - 1] = kmeans.inertia_
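As a sanity check on what inertia measures, the value scikit-learn reports can be reproduced by hand: it is the sum of squared distances from each point to its assigned cluster centre. A minimal sketch (the choice of k = 3 here is arbitrary):
In [ ]:
# Reproduce kmeans.inertia_ by hand: sum of squared distances from each
# point to its assigned centroid
kmeans = cluster.KMeans(n_clusters=3, random_state=42).fit(cluster_data)
diffs = cluster_data - kmeans.cluster_centers_[kmeans.labels_]
assert np.isclose(np.sum(diffs ** 2), kmeans.inertia_)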
In [9]:
with sns.axes_style("whitegrid"):
    plt.plot(ks, inertias)
    plt.ylabel("Inertia")
    plt.xlabel("Number of clusters")
    plt.show()
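The usual way to read this plot is the elbow heuristic: pick the k where the curve bends most sharply. A rough programmatic stand-in (a sketch, not a substitute for eyeballing the plot) is the largest second difference of the inertia values:
In [ ]:
# Rough elbow estimate: the k with the largest second difference of inertia
# (second differences cover k = 2 .. MAX_K - 1)
elbow_k = ks[np.argmax(np.diff(inertias, 2)) + 1]
print(elbow_k)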
Cluster article data and compute the silhouette score as a function of cluster number
In [10]:
from sklearn.metrics import silhouette_score
In [11]:
num_topics = 20
with open('pub_probabs_topic' + str(num_topics) + '.pkl', 'rb') as f:
    doc_data = pickle.load(f)
lda_topics = ['topic' + str(i) for i in range(num_topics)]
cluster_dims = ['source', 'trust'] + lda_topics
cluster_data = doc_data[cluster_dims].values
In [12]:
# The silhouette score measures the density and separation of the formed
# clusters; it is undefined for a single cluster, so start at k = 2
seed = 42
MAX_K = 10
ks = range(1, MAX_K + 1)
silhouette_avg = []
for k in ks[1:]:
    kmeans = cluster.KMeans(n_clusters=k, random_state=seed).fit(cluster_data)
    kmeans_clusters = kmeans.predict(cluster_data)
    silhouette_avg.append(silhouette_score(cluster_data, kmeans_clusters))
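The average hides the spread: silhouette_samples, from the same sklearn.metrics module, returns one score per document, which helps spot clusters that only look good on average. A minimal sketch for a single k (k = 4 here is arbitrary):
In [ ]:
# Per-sample silhouette values for one choice of k
from sklearn.metrics import silhouette_samples
kmeans = cluster.KMeans(n_clusters=4, random_state=seed).fit(cluster_data)
sample_scores = silhouette_samples(cluster_data, kmeans.labels_)
print(sample_scores.min(), sample_scores.mean(), sample_scores.max())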
In [14]:
with sns.axes_style("whitegrid"):
    plt.plot(ks[1:], silhouette_avg)
    plt.ylabel("Average silhouette score")
    plt.xlabel("Number of clusters")
    plt.ylim([0.0, 1.0])
    plt.show()
Split data set for cross-validation
In [96]:
num_topics = 20
with open('pub_probabs_topic' + str(num_topics) + '.pkl', 'rb') as f:
    doc_data = pickle.load(f)
lda_topics = ['topic' + str(i) for i in range(num_topics)]
cluster_dims = ['source', 'trust'] + lda_topics
cluster_data = doc_data[cluster_dims].values
In [97]:
num_folds = 5
seed = 42
np.random.seed(seed)
np.random.shuffle(cluster_data)  # Shuffles in place
# Drop the last row so the row count is divisible by num_folds
cluster_data = np.split(cluster_data[0:-1, :], num_folds)
In [98]:
train_data, test_data = [], []
for hold in range(num_folds):
    keep = [i for i in range(num_folds) if i != hold]
    train = [cluster_data[i] for i in keep]
    test = cluster_data[hold]
    train_data.append(np.vstack(train))
    test_data.append(test)
full = [cluster_data[i] for i in range(num_folds)]
full_data = np.vstack(full)
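For reference, the manual split above is essentially what scikit-learn's KFold does with shuffling enabled. A sketch of the equivalent (not used below; note that KFold does not require the row count to be divisible by num_folds):
In [ ]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
for train_idx, test_idx in kf.split(full_data):
    train_fold, test_fold = full_data[train_idx], full_data[test_idx]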
Clustering consistency between the full and partial data sets
In [112]:
MAX_K = 10
ks = range(1, MAX_K + 1)
kmeans_accuracy = []
for k in ks:
    full_kmeans = cluster.KMeans(n_clusters=k, random_state=seed).fit(full_data)
    accuracy = []
    for fold in range(num_folds):
        train_kmeans = cluster.KMeans(n_clusters=k, random_state=seed).fit(train_data[fold])
        test_labels = train_kmeans.predict(test_data[fold])
        # full_data stacks the folds in order, so splitting its labels the
        # same way recovers the full-fit labels for this fold's held-out rows
        full_labels = np.split(full_kmeans.labels_, num_folds)[fold]
        accuracy.append(np.mean(np.equal(full_labels, test_labels)))
    kmeans_accuracy.append(np.mean(accuracy))
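One caveat: two independent KMeans fits can assign different integer labels to the same underlying cluster, so raw label agreement can understate consistency (the fixed random_state makes matching labels likely but does not guarantee it). A permutation-invariant alternative is the adjusted Rand index; a sketch comparing the two labelings from the last fold evaluated above:
In [ ]:
# Compare two labelings without assuming matching label integers
# (1.0 means identical partitions, ~0.0 means chance-level agreement)
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(full_labels, test_labels))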
In [122]:
with sns.axes_style("whitegrid"):
    plt.plot(ks, kmeans_accuracy)
    plt.ylabel("Average accuracy")
    plt.xlabel("Number of clusters")
    plt.ylim([0.0, 1.0])
    plt.show()
In [ ]:
num_clusters = 4
In [ ]:
kmeans = cluster.KMeans(n_clusters=num_clusters, random_state=seed).fit(full_data)
kmeans_labels = kmeans.labels_
kmeans_centroids = kmeans.cluster_centers_
# 0 = mostly liberal, 1 = mostly conservative, 2 = mixed liberal, 3 = mixed conservative
kmeans_distances = kmeans.transform(full_data)  # Distance from each point to each centroid
with open('pub_kmeans_clean_cluster' + str(num_clusters) + '.pkl', 'wb') as f:
    pickle.dump([kmeans, kmeans_labels, kmeans_centroids, kmeans_distances], f)
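To reload the saved results later, a sketch mirroring the dump above (same list order):
In [ ]:
with open('pub_kmeans_clean_cluster' + str(num_clusters) + '.pkl', 'rb') as f:
    kmeans, kmeans_labels, kmeans_centroids, kmeans_distances = pickle.load(f)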