kmeans



In [1]:
%matplotlib inline

from __future__ import division 

import re
import numpy as np
import matplotlib.pyplot as plt

from scipy.cluster.vq import kmeans,vq
from scipy.spatial.distance import cdist

from sklearn.cluster import KMeans 
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics 

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
lmtzr = WordNetLemmatizer()

import mpld3
from mpld3 import plugins 

from pprint import pprint as pp
from dateutil.parser import parse 
import random 

def elbow_method(X):
    K = range(1,10)
    KM = [kmeans(X,k) for k in K] # apply kmeans 1 to 10
    centroids = [cent for (cent,var) in KM] 
    D_k = [cdist(X, cent, 'euclidean') for cent in centroids]

    cIdx = [np.argmin(D,axis=1) for D in D_k]
    dist = [np.min(D,axis=1) for D in D_k]
    avgWithinSS = [sum(d)/X.shape[0] for d in dist]  
    kIdx = 2
    # plot elbow curve
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(K, avgWithinSS, 'b*-')
    ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12, 
          markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within-cluster sum of squares')
    tt = plt.title('Elbow for K-Means clustering')

In [2]:
listings = np.array([listing for listing in Listing.objects.all()])
raw_text = np.array([listing.message for listing in listings])
text = np.array([re.sub(r'^https?:\/\/.*[\r\n]*', '', message, flags=re.MULTILINE) for message in raw_text])

In [3]:
original_preprocessor = TfidfVectorizer().build_preprocessor()
preprocessor = lambda x: lmtzr.lemmatize(original_preprocessor(x))
vectorizer = TfidfVectorizer(min_df=0.01, ngram_range=(1,2), stop_words='english', preprocessor=preprocessor)
X = vectorizer.fit_transform(text).toarray()

In [4]:
elbow_method(X)



In [5]:
N_CLUSTERS = 3
k_means = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=100, n_init=1, verbose=True)
# svd = TruncatedSVD(2) 
# lsa = make_pipeline(svd, Normalizer(copy=False)) 
k_means.fit(X)
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
terms = vectorizer.get_feature_names()


Initialization complete
Iteration  0, inertia 10948.562
Iteration  1, inertia 5700.159
Iteration  2, inertia 5673.495
Iteration  3, inertia 5654.825
Iteration  4, inertia 5617.334
Iteration  5, inertia 5588.009
Iteration  6, inertia 5579.881
Iteration  7, inertia 5576.749
Iteration  8, inertia 5575.539
Iteration  9, inertia 5575.169
Iteration 10, inertia 5575.001
Iteration 11, inertia 5574.949
Iteration 12, inertia 5574.941
Iteration 13, inertia 5574.939
Iteration 14, inertia 5574.938
Converged at iteration 14

In [6]:
order_centroids = k_means_cluster_centers.argsort()[:, ::-1]
for i in range(N_CLUSTERS):
    # print [(terms[ind], k_means_cluster_centers[i][ind]) for ind in order_centroids[i, :100]]
    print [terms[ind] for ind in order_centroids[i, :100]]
    print ""


[u'obo', u'message', u'buying', u'10', u'interested', u'sale', u'selling', u'20', u'free', u'new', u'size', u'15', u'need', u'desk', u'table', u'bed', u'mattress', u'message interested', u'30', u'pick', u'price', u'50', u'offer', u'want', u'chair', u'lamp', u'used', u'condition', u'best', u'sell', u'40', u'25', u'black', u'ikea', u'best offer', u'box', u'twin', u'negotiable', u'available', u'tv', u'small', u'fridge', u'frame', u'just', u'like', u'great', u'brand', u'room', u'brand new', u'know', u'white', u'details', u'mini', u'bike', u'good', u'com', u'does', u'set', u'couch', u'book', u'sublet', u'edition', u'60', u'spring', u'summer', u'100', u'mini fridge', u'books', u'll', u'http', u'evanston', u'tomorrow', u'today', u'got', u'tickets', u'large', u'box spring', u'coffee', u'ticket', u'year', u'message details', u'really', u'old', u'35', u'queen', u'bed frame', u'come', u'sized', u'picked', u'chairs', u'dresser', u'let', u'plastic', u'msg', u'12', u'printer', u'don', u'buy', u'www', u'bought']

[u'looking', u'looking buy', u'buy', u'message', u'bike', u'know', u'desk', u'sell', u'selling', u'dresser', u'thanks', u'bed', u'let know', u'let', u'table', u'chair', u'mattress', u'small', u'pick', u'couch', u'frame', u'textbook', u'size', u'book', u'bed frame', u'ticket', u'sublet', u'fridge', u'cheap', u'floor', u'lamp', u'fall', u'futon', u'msg', u'unit', u'chairs', u'tv', u'rid', u'sized', u'tickets', u'room', u'queen', u'desk chair', u'hey', u'microwave', u'pay', u'books', u'mini fridge', u'bookshelf', u'mini', u'dining', u'place', u'size bed', u'twin', u'pm', u'large', u'stand', u'printer', u'nice', u'good', u'like', u'ac', u'study', u'chem', u'edition', u'saturday', u'summer', u'furniture', u'interested', u'august', u'month', u'price', u'july', u'drawers', u'got', u'office', u'need', u'anybody', u'willing', u'apartment', u'available', u'coffee table', u'coffee', u'box', u'sale', u'just', u'comment', u'june', u'math', u'day', u'old', u'clicker', u'rent', u'does', u'101', u'game', u'kitchen', u'econ', u'extra', u'll']

[u'selling', u'message', u'edition', u'book', u'interested', u'ticket', u'message interested', u'bike', u'books', u'math', u'textbook', u'selling bike', u'econ', u'intro', u'tickets', u'10', u'manual', u'used', u'anybody', u'psych', u'fridge', u'dresser', u'buying', u'chem', u'50', u'230', u'obo', u'201', u'30', u'15', u'20', u'clicker', u'40', u'220', u'principles', u'mini', u'new', u'math 230', u'mini fridge', u'details', u'3rd', u'microwave', u'25', u'desk', u'100', u'chemistry', u'message details', u'101', u'pm', u'condition', u'chair', u'know', u'sunday', u'saturday', u'tv', u'cheap', u'let', u'black', u'study', u'unit', u'price', u'let know', u'day', u'nu', u'game', u'size', u'white', u'futon', u'small', u'60', u'mattress', u'buy', u'table', u'35', u'printer', u'northwestern', u'comment', u'couch', u'comes', u'msg', u'bed', u'great', u'negotiable', u'twin', u'good', u'80', u'office', u'box', u'furniture', u'offer', u'thanks', u'away', u'ikea', u'need', u'set', u'want', u'ac', u'prices', u'lamp', u'like']

It seems like when clustering with k=3, it consistently clusters into "unsure", "buying", and "selling". Let's try looking at just the datapoints that are clustered as "selling" to see how they cluster.


In [6]: