notebook.community

Edit and run



In [12]:

    
# Load pandas, numpy, scikit-learn (clustering, PCA, t-SNE) and Bokeh,
# then initialise Bokeh's notebook output.
# NOTE(review): `vplot` is not used by any cell visible in this notebook.
import pandas as pd
from bokeh.charts import Scatter, show
from bokeh.io import output_notebook,vplot
from sklearn import cluster
from sklearn import decomposition
import numpy as np
from sklearn.manifold import TSNE
# Load BokehJS into the notebook so later show() calls can render here.
output_notebook()









    





    
        
        Loading BokehJS ...



In [13]:

    
# Read the CSV without treating the first row as a header, so pandas labels
# the columns with integers starting at 0.
df = pd.read_csv(
    'email.csv',
    header=None,
)
# Show the first three rows as a quick sanity check.
df.head(n=3)









    Out[13]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      118
      119
      120
      121
      122
      123
      124
      125
      126
      127
    
  
  
    
      0
      0.037764
      0.014336
      0.000178
      -0.037297
      -0.031119
      0.026128
      -0.070287
      0.003502
      -0.059951
      0.112171
      ...
      -0.035336
      0.022702
      -0.014517
      0.114289
      -0.072962
      -0.022144
      0.013885
      -0.030889
      0.032176
      -0.079394
    
    
      1
      0.046096
      0.032011
      -0.025486
      0.009135
      -0.045694
      0.019960
      -0.088211
      -0.000344
      -0.060330
      0.105208
      ...
      -0.000726
      0.008629
      -0.015821
      0.075698
      -0.095679
      -0.044916
      0.032496
      0.010954
      0.020992
      -0.074257
    
    
      2
      0.042579
      0.028244
      -0.047193
      -0.025968
      -0.006204
      0.044185
      -0.035565
      -0.013477
      -0.085571
      0.107871
      ...
      -0.010689
      0.015235
      -0.012684
      0.110166
      -0.070666
      -0.017062
      0.009641
      -0.010771
      0.008265
      -0.071480
    
  

3 rows × 128 columns



In [14]:

    
# Import KMeans by name instead of going through the `cluster` module alias:
# a later cell rebinds `cluster` to the array of predicted labels, so running
# this cell again afterwards via `cluster.KMeans` would raise AttributeError.
from sklearn.cluster import KMeans

# Fit 9 clusters on the loaded rows; a fixed random_state keeps the centroid
# initialisation (and therefore the labels) reproducible between runs.
k_means = KMeans(n_clusters=9, random_state=0)
# fit() returns the estimator, so the cell still displays its repr.
k_means.fit(df)









    Out[14]:





KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=9, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=0, tol=0.0001,
    verbose=0)



In [4]:

    
# Fit a two-component PCA on the data (fit() returns the estimator itself) ...
pca = decomposition.PCA(n_components=2).fit(df)
# ... and project the same rows onto those two components.
X = pca.transform(df)



In [5]:

    
import pylab as pl

# Scatter the two PCA components against each other:
# first column on the x-axis, second column on the y-axis.
pl.scatter(x=X[:, 0], y=X[:, 1])









    Out[5]:





<matplotlib.collections.PathCollection at 0xca7ddd8>



In [6]:

    
# Render the current figure (the PCA scatter built by the preceding cell).
pl.show()



In [17]:

    
# Print floats in fixed-point rather than scientific notation from here on.
np.set_printoptions(suppress=True)

# Two-dimensional t-SNE embedding of the rows; the fixed seed makes the
# layout repeatable.
model = TSNE(
    n_components=2,
    random_state=0,
)
x = model.fit_transform(df)



In [18]:

    
# Scatter the two t-SNE coordinates and render the figure in the same cell.
pl.scatter(x=x[:, 0], y=x[:, 1])
pl.show()



In [19]:

    
# Assign every row to its nearest fitted centroid in one vectorised call
# instead of one predict() call per row.
# Only the first 128 columns are used as features, which also keeps this cell
# re-runnable if a 'cluster' column has since been appended to df.
# `.iloc` replaces `.ix`, which has been removed from pandas.
features = df.iloc[:, 0:128]

# NOTE: the name `cluster` shadows the `sklearn.cluster` module imported at the
# top of the notebook; it is kept because later cells read the labels from it.
# The (n_rows, 1) float layout matches what the original loop produced.
cluster = k_means.predict(features).astype(float).reshape(-1, 1)



In [20]:

    
# Attach the labels to a new frame rather than mutating df in place, so df
# keeps only feature columns and the fit / PCA / t-SNE cells can be re-run
# without picking up the label as an extra feature.
df_clustered = df.assign(cluster=cluster)

# Persist features + label (the index is written as the first column).
df_clustered.to_csv('clustered_email.csv')

# Preview goes last so the notebook actually displays it.
df_clustered.head()



In [21]:

    
# Wrap the t-SNE coordinates in a frame (columns 0 and 1) and add the cluster
# label as a third column in one expression.
df2 = pd.DataFrame(x).assign(cluster=cluster)
df2.head()



In [22]:

    
# Give the two embedding axes readable names; the label column keeps its name.
embedding_columns = ['X', 'Y', 'cluster']
df2.columns = embedding_columns
df2.head()



In [23]:

    
# Interactive scatter of the t-SNE embedding, one colour per K-Means cluster.
# The title previously said 'KNN Classifier', but the labels come from K-Means
# clustering, not a k-nearest-neighbours classifier.
fig = Scatter(df2, x='X', y='Y',
              color='cluster',
              legend='top_left',
              title='K-Means Clusters (t-SNE embedding)')
show(fig)



In [ ]:

	0	1	cluster
0	-10.594789	6.893975	5.0
1	-11.398771	5.936529	5.0
2	-10.061698	4.257610	5.0
3	-10.803627	4.997542	5.0
4	-10.624414	5.713126	5.0

	X	Y	cluster
0	-10.594789	6.893975	5.0
1	-11.398771	5.936529	5.0
2	-10.061698	4.257610	5.0
3	-10.803627	4.997542	5.0
4	-10.624414	5.713126	5.0

	0	1	2	3	4	5	6	7	8	9	...	118	119	120	121	122	123	124	125	126	127
0	0.037764	0.014336	0.000178	-0.037297	-0.031119	0.026128	-0.070287	0.003502	-0.059951	0.112171	...	-0.035336	0.022702	-0.014517	0.114289	-0.072962	-0.022144	0.013885	-0.030889	0.032176	-0.079394
1	0.046096	0.032011	-0.025486	0.009135	-0.045694	0.019960	-0.088211	-0.000344	-0.060330	0.105208	...	-0.000726	0.008629	-0.015821	0.075698	-0.095679	-0.044916	0.032496	0.010954	0.020992	-0.074257
2	0.042579	0.028244	-0.047193	-0.025968	-0.006204	0.044185	-0.035565	-0.013477	-0.085571	0.107871	...	-0.010689	0.015235	-0.012684	0.110166	-0.070666	-0.017062	0.009641	-0.010771	0.008265	-0.071480