In [1]:
%matplotlib inline
from __future__ import division
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans, vq
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lmtzr = WordNetLemmatizer()
import mpld3
from mpld3 import plugins
from pprint import pprint as pp
from dateutil.parser import parse
import random
def elbow_method(X):
    K = range(1, 10)
    KM = [kmeans(X, k) for k in K]  # run scipy k-means for k = 1..9
    centroids = [cent for (cent, var) in KM]
    D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
    cIdx = [np.argmin(D, axis=1) for D in D_k]
    dist = [np.min(D, axis=1) for D in D_k]
    avgWithinSS = [sum(d) / X.shape[0] for d in dist]
    kIdx = 2  # index of k=3 in K; highlight it as the candidate elbow

    # plot elbow curve
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(K, avgWithinSS, 'b*-')
    ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12,
            markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    tt = plt.title('Elbow for K-Means clustering')
In [2]:
# load all Listing records and pull out their message text
listings = np.array([listing for listing in Listing.objects.all()])
raw_text = np.array([listing.message for listing in listings])
# strip lines that begin with a URL before vectorizing
text = np.array([re.sub(r'^https?://.*[\r\n]*', '', message, flags=re.MULTILINE) for message in raw_text])
In [3]:
# lemmatize each token: word_tokenize splits the (lowercased) text and
# WordNetLemmatizer maps every token to its lemma
tokenize = lambda doc: [lmtzr.lemmatize(token) for token in word_tokenize(doc)]
# min_df=0.01 drops terms that appear in fewer than 1% of documents
vectorizer = TfidfVectorizer(min_df=0.01, ngram_range=(1, 2), stop_words='english', tokenizer=tokenize)
X = vectorizer.fit_transform(text).toarray()
In [4]:
elbow_method(X)
In [5]:
N_CLUSTERS = 3
k_means = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=100, n_init=1, verbose=True)
# svd = TruncatedSVD(2)
# lsa = make_pipeline(svd, Normalizer(copy=False))
k_means.fit(X)
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
terms = vectorizer.get_feature_names()  # map column indices back to vocabulary terms
In [6]:
order_centroids = k_means_cluster_centers.argsort()[:, ::-1]  # term indices by descending centroid weight
for i in range(N_CLUSTERS):
    # print [(terms[ind], k_means_cluster_centers[i][ind]) for ind in order_centroids[i, :100]]
    print [terms[ind] for ind in order_centroids[i, :100]]  # top 100 terms for cluster i
    print ""
With k=3, the clustering consistently settles into "unsure", "buying", and "selling" groups. Let's look at just the datapoints labeled "selling" to see how they cluster on their own, as sketched below.
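One way to do that is to mask the rows of X by cluster label and repeat the diagnostics on the subset. A minimal sketch (not an executed cell; SELLING_LABEL is a placeholder, since the index of the "selling" cluster varies between runs and has to be read off the top-terms printout above):

SELLING_LABEL = 2  # placeholder: set to whichever label's top terms look like "selling"
selling_mask = k_means_labels == SELLING_LABEL
selling_text = text[selling_mask]  # raw messages assigned to the "selling" cluster
selling_X = X[selling_mask]        # their TF-IDF rows
elbow_method(selling_X)            # re-check the elbow before picking k for the sub-clustering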
In [7]: