In [12]:
# Load pandas, NumPy, scikit-learn and Bokeh, then enable Bokeh output in the notebook
import numpy as np
import pandas as pd
import pylab as pl
from bokeh.charts import Scatter, show
from bokeh.io import output_notebook, vplot
from sklearn import cluster
from sklearn import decomposition
from sklearn.manifold import TSNE
# Configure Bokeh to render into the notebook; this is the call that prints
# the "Loading BokehJS ..." banner below the cell.
output_notebook()


Loading BokehJS ...

In [13]:
# Read the feature table from email.csv. header=None tells pandas the file has
# no header row, so the columns are labelled with integers starting at 0.
df = pd.read_csv(filepath_or_buffer='email.csv', header=None)
# Last expression of the cell: preview the first three rows.
df.head(n=3)


Out[13]:
0 1 2 3 4 5 6 7 8 9 ... 118 119 120 121 122 123 124 125 126 127
0 0.037764 0.014336 0.000178 -0.037297 -0.031119 0.026128 -0.070287 0.003502 -0.059951 0.112171 ... -0.035336 0.022702 -0.014517 0.114289 -0.072962 -0.022144 0.013885 -0.030889 0.032176 -0.079394
1 0.046096 0.032011 -0.025486 0.009135 -0.045694 0.019960 -0.088211 -0.000344 -0.060330 0.105208 ... -0.000726 0.008629 -0.015821 0.075698 -0.095679 -0.044916 0.032496 0.010954 0.020992 -0.074257
2 0.042579 0.028244 -0.047193 -0.025968 -0.006204 0.044185 -0.035565 -0.013477 -0.085571 0.107871 ... -0.010689 0.015235 -0.012684 0.110166 -0.070666 -0.017062 0.009641 -0.010771 0.008265 -0.071480

3 rows × 128 columns


In [14]:
# Build a 9-cluster k-means estimator with a fixed random_state so the
# clustering is repeatable between runs, then fit it on every column of df.
k_means = cluster.KMeans(
    random_state=0,
    n_clusters=9,
)
# Kept as the cell's last expression so the fitted estimator's repr is shown.
k_means.fit(X=df)


Out[14]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=9, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=0, tol=0.0001,
    verbose=0)

In [4]:
# Reduce df to two dimensions with PCA for plotting.
pca = decomposition.PCA(n_components=2)
# fit() returns the estimator itself, so fitting and projecting can be chained;
# X holds one row per row of df and one column per principal component.
X = pca.fit(df).transform(df)

In [5]:
# Scatter the rows of df on their first two principal components.
# `pl` (pylab) is imported in the notebook's top import cell rather than here,
# so all dependencies are declared in one place.
# Title and axis labels are set first so this figure can be told apart from the
# t-SNE scatter drawn later, and so the scatter call stays the cell's last
# expression (its return value is what the cell displays).
pl.title('PCA projection')
pl.xlabel('Principal component 1')
pl.ylabel('Principal component 2')
pl.scatter(X[:, 0], X[:, 1])


Out[5]:
<matplotlib.collections.PathCollection at 0xca7ddd8>

In [6]:
# Display any open matplotlib figure (the PCA scatter was drawn in the cell
# above). NOTE(review): with an inline backend the figure is presumably already
# rendered by the previous cell, making this call a no-op — confirm.
pl.show()



In [17]:
# Make NumPy print small floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)
# Embed the rows of df in two dimensions with t-SNE; random_state=0 fixes the
# seed passed to the estimator.
model = TSNE(random_state=0, n_components=2)
# Lower-case `x` is the t-SNE embedding; upper-case `X` is the PCA projection.
x = model.fit_transform(X=df)

In [18]:
# Scatter the two t-SNE coordinates of every row.
# The title and axis labels identify the figure as the t-SNE embedding, so it
# is not mistaken for the PCA scatter plotted earlier in the notebook.
pl.title('t-SNE projection')
pl.xlabel('t-SNE dimension 1')
pl.ylabel('t-SNE dimension 2')
pl.scatter(x[:, 0], x[:, 1])
pl.show()



In [19]:
# Assign every row of df to its nearest k-means centroid.
#
# The 128 feature columns are selected by position with .iloc (the .ix indexer
# has been removed from pandas), which also keeps this cell re-runnable after a
# 'cluster' column has been appended to df. Passing the whole matrix to
# predict() in one call replaces the per-row Python loop.
feature_matrix = df.iloc[:, 0:128].values
# Keep the result as a float column vector of shape (n_rows, 1), the same
# layout as the zero-initialised buffer the loop used to fill.
# NOTE(review): the name `cluster` rebinds the `cluster` module imported from
# sklearn, so re-running the KMeans cell after this one fails until the import
# cell is run again. Renaming it means also updating the later cells that read
# `cluster`.
cluster = k_means.predict(feature_matrix).astype(float).reshape(-1, 1)

In [20]:
# Attach the k-means assignment to the feature table as a 'cluster' column.
# NOTE: this mutates df in place; cells that pass the whole of df to an
# estimator will see the extra column if they are re-run after this one.
df['cluster'] = cluster
# Persist features plus cluster id; the row index is written as the first
# column (to_csv default).
# The former mid-cell df.head() was dropped: only a cell's last expression is
# displayed, so its result was silently discarded.
df.to_csv('clustered_email.csv')

In [21]:
# Wrap the 2-D t-SNE coordinates in a DataFrame (columns 0 and 1) and attach
# each row's cluster id so the points can be coloured by cluster.
df2 = pd.DataFrame(data=x)
df2['cluster'] = cluster
# Last expression of the cell: preview the first five rows.
df2.head(n=5)


Out[21]:
0 1 cluster
0 -10.594789 6.893975 5.0
1 -11.398771 5.936529 5.0
2 -10.061698 4.257610 5.0
3 -10.803627 4.997542 5.0
4 -10.624414 5.713126 5.0

In [22]:
# Replace the integer labels 0 and 1 of the embedding columns with the names
# 'X' and 'Y', which the plotting cell refers to; 'cluster' keeps its name.
df2.columns = ['X', 'Y', 'cluster']
# Last expression of the cell: preview the renamed frame.
df2.head(n=5)


Out[22]:
X Y cluster
0 -10.594789 6.893975 5.0
1 -11.398771 5.936529 5.0
2 -10.061698 4.257610 5.0
3 -10.803627 4.997542 5.0
4 -10.624414 5.713126 5.0

In [23]:
# Interactive Bokeh scatter of the t-SNE embedding, one colour per k-means
# cluster id. The title used to read 'KNN Classifier', but the labels come from
# KMeans clustering (unsupervised), not from a k-nearest-neighbours classifier.
fig = Scatter(df2, x='X', y='Y',
              color='cluster',
              legend='top_left',
              title='K-Means Clusters (t-SNE projection)')
show(fig)



In [ ]: