notebook.community

Edit and run



In [1]:

    
from scipy import sparse
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.externals import joblib
import pandas as pd
import psycopg2
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.cluster import KMeans



In [2]:

    
#conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-52-27-114-159.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
#data = pd.read_sql_query("SELECT * FROM nlp_dim ORDER BY id DESC LIMIT 300", conn)



In [3]:

    
x = sparse.load_npz('model/tf_idf.npz')



In [4]:

    
x









    Out[4]:





<164541x250 sparse matrix of type '<class 'numpy.float64'>'
	with 9477063 stored elements in Compressed Sparse Row format>



In [5]:

    
# First we are going to PCA this vector data 
reduced_data = PCA(n_components=2).fit_transform(x.todense())

km = KMeans(init='k-means++', n_clusters=15, n_init=10)
km.fit(reduced_data)









    Out[5]:





KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=15, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)



In [6]:

    
reduced_data









    Out[6]:





array([[ 0.02422442,  0.10892516],
       [-0.13735349, -0.03146955],
       [-0.04872723,  0.07693549],
       ..., 
       [-0.04816027,  0.13473284],
       [ 0.40243839, -0.08095205],
       [-0.13313446, -0.13162525]])



In [7]:

    
# step size of mesh
h = 0.05

x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max() 
y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()



In [8]:

    
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))



In [9]:

    
xx.shape









    Out[9]:





(25, 22)



In [10]:

    
yy.shape









    Out[10]:





(25, 22)



In [11]:

    
test_data = np.c_[xx.ravel(), yy.ravel()]
# test_data.shape
Z = km.predict(test_data)
Z.shape









    Out[11]:





(550,)



In [12]:

    
Z = Z.reshape(xx.shape)
Z.shape









    Out[12]:





(25, 22)



In [24]:

    
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
            cmap=plt.cm.Paired,
           aspect='auto', origin='lower')
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=.05)

# Plot the centroids as an *
centroids = km.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
           marker='*', s=169, linewidths=2,
           color='b', zorder=10)

plt.show()



In [14]:

    
reduced_data









    Out[14]:





array([[ 0.02422442,  0.10892516],
       [-0.13735349, -0.03146955],
       [-0.04872723,  0.07693549],
       ..., 
       [-0.04816027,  0.13473284],
       [ 0.40243839, -0.08095205],
       [-0.13313446, -0.13162525]])



In [ ]: