In [12]:
# Load pandas, NumPy, scikit-learn and Bokeh, then initialize Bokeh's notebook output
import numpy as np
import pandas as pd
import pylab as pl
from bokeh.charts import Scatter, show
from bokeh.io import output_notebook,vplot
from sklearn import cluster
from sklearn import decomposition
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
# Initialize Bokeh's notebook output (presumably so that show() renders inline
# in the notebook -- confirm against the installed Bokeh version).
output_notebook()
In [13]:
# Read the e-mail table. header=None tells pandas the file has no header row,
# so the first line is treated as data and columns receive integer labels.
df = pd.read_csv(filepath_or_buffer='email.csv', header=None)
# Preview the first three rows.
df.head(n=3)
Out[13]:
In [14]:
# Fit a 9-cluster K-Means model on every column currently in `df`, with a fixed
# random_state so repeated runs start from the same seed.
# KMeans is referenced through its own import rather than as `cluster.KMeans`
# because a later cell rebinds the name `cluster` to a NumPy array; after that,
# re-running this cell via `cluster.KMeans` would raise AttributeError.
k_means = KMeans(n_clusters=9, random_state=0)
k_means.fit(df)
Out[14]:
In [4]:
# Fit a two-component PCA on `df` and project `df` onto those components.
# fit() returns the estimator itself, so construction and fitting are chained.
pca = decomposition.PCA(n_components=2).fit(df)
X = pca.transform(df)
In [5]:
# Scatter the rows of `X` (the two-component PCA projection): first component
# on the horizontal axis, second on the vertical.
# `pl` (pylab) is imported in the notebook's top import cell rather than here,
# so every dependency is declared in one place.
pl.scatter(X[:, 0], X[:, 1])
Out[5]:
In [6]:
# Explicitly display any open pylab figure (the PCA scatter is created in the
# previous cell).
pl.show()
In [17]:
# Print NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

# Embed `df` in two dimensions with t-SNE; random_state fixes the seed.
model = TSNE(random_state=0, n_components=2)
x = model.fit_transform(df)
In [18]:
# Draw the t-SNE embedding: first column on the horizontal axis, second on the
# vertical, then display the figure.
pl.scatter(x=x[:, 0], y=x[:, 1])
pl.show()
In [19]:
# Assign every row of `df` to a K-Means cluster in one vectorised call instead
# of predicting row by row through `df.ix` (removed in pandas 1.0).
# Only the first 128 columns are passed, selected by position, matching the
# original `[0:128]` slice (presumably the feature columns -- confirm the width
# of email.csv).
# The result is stored as an (n_rows, 1) float array, the same shape and dtype
# as the np.zeros buffer the loop used to fill, so later column assignments
# receive the same values.
# NOTE(review): the name `cluster` rebinds the `sklearn.cluster` module imported
# at the top of the notebook; it is kept because later cells read `cluster`.
cluster = k_means.predict(df.iloc[:, 0:128].values).astype(float).reshape(-1, 1)
In [20]:
# Attach each row's K-Means label to the feature table and persist the result.
df['cluster'] = cluster
# to_csv is called with its defaults, so the row index is written as well.
df.to_csv('clustered_email.csv')
# The preview goes last: a notebook cell only displays its final expression, so
# the original mid-cell df.head() was computed and then discarded.
df.head()
In [21]:
# Build the plotting frame: the t-SNE coordinates held in `x` plus a 'cluster'
# column carrying each row's label.
df2 = pd.DataFrame(data=x).assign(cluster=cluster)
df2.head()
Out[21]:
In [22]:
# Give the two embedding columns readable names; the label column keeps its name.
df2.columns = pd.Index(['X', 'Y', 'cluster'])
df2.head()
Out[22]:
In [23]:
# Plot the 2-D embedding from `df2`, coloured by the 'cluster' column.
# The title previously read 'KNN Classifier', but the labels in `cluster` come
# from k_means.predict and the X/Y coordinates from the TSNE transform, so the
# title now names the methods actually used.
fig = Scatter(df2, x='X', y='Y',
              color='cluster',
              legend='top_left',
              title='K-Means clusters on t-SNE embedding')
show(fig)
In [ ]: