In [1]:
# Third-party imports, grouped alphabetically.
# NOTE: `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
# removed in 0.23 — import the standalone `joblib` package instead.
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
from scipy import sparse
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
In [2]:
# Alternative: pull the corpus straight from Postgres instead of the cached
# .npz file loaded below.  Keep credentials out of source control — read the
# password from the environment rather than hardcoding it:
#   conn = psycopg2.connect(dbname='cap', user='postgres',
#                           host='ec2-52-27-114-159.us-west-2.compute.amazonaws.com',
#                           port=9000, password=os.environ['CAP_DB_PASSWORD'])
#   data = pd.read_sql_query("SELECT * FROM nlp_dim ORDER BY id DESC LIMIT 300", conn)
In [3]:
x = sparse.load_npz('model/tf_idf.npz')
In [4]:
x
Out[4]:
In [5]:
# Project the TF-IDF vectors down to 2-D with PCA so the clustering can be
# visualised, then fit k-means on the reduced points.
# Use .toarray() rather than .todense(): PCA wants an ndarray, and the
# np.matrix type returned by .todense() is deprecated.
reduced_data = PCA(n_components=2).fit_transform(x.toarray())
# random_state pins the stochastic centroid initialisation so a
# Restart & Run All reproduces the same clustering every time.
km = KMeans(init='k-means++', n_clusters=15, n_init=10, random_state=42)
km.fit(reduced_data)
Out[5]:
In [6]:
reduced_data
Out[6]:
In [7]:
# Mesh resolution: spacing between neighbouring grid nodes on both axes.
h = 0.05
# Bounding box of the projected documents; the decision-region mesh below
# covers exactly this range.
x_min = reduced_data[:, 0].min()
x_max = reduced_data[:, 0].max()
y_min = reduced_data[:, 1].min()
y_max = reduced_data[:, 1].max()
In [8]:
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
In [9]:
xx.shape
Out[9]:
In [10]:
yy.shape
Out[10]:
In [11]:
# Flatten the grid into an (n_points, 2) coordinate array and label every
# node with its nearest k-means cluster.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = km.predict(grid_points)
Z.shape
Out[11]:
In [12]:
# Fold the flat label vector back onto the 2-D grid so it can be rendered as
# an image by imshow below.
Z = Z.reshape(xx.shape)
Z.shape
Out[12]:
In [24]:
# Paint the k-means decision regions, overlay the projected documents, and
# mark each cluster centroid with a star.
# num=1 + clear=True preserves the original figure(1)/clf() behaviour while
# using the explicit Axes interface.
fig, ax = plt.subplots(num=1, clear=True)
ax.imshow(Z, interpolation='nearest',
          extent=(xx.min(), xx.max(), yy.min(), yy.max()),
          cmap=plt.cm.Paired,
          aspect='auto', origin='lower')
# markersize=.05 keeps the dense document cloud from hiding the coloured
# regions; bump it up if the points are invisible at your figure size.
ax.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=.05)
# Centroids live in the same 2-D PCA space as the documents.
centroids = km.cluster_centers_
ax.scatter(centroids[:, 0], centroids[:, 1],
           marker='*', s=169, linewidths=2,
           color='b', zorder=10)
# A figure must stand alone when the notebook is skimmed: title + axis labels.
ax.set(title='K-means clusters of TF-IDF documents (PCA-reduced)',
       xlabel='PCA component 1', ylabel='PCA component 2')
plt.show()
In [14]:
reduced_data
Out[14]:
In [ ]: