In [4]:
# All imports and library setup here
import time

import numpy as np
import pandas as pd
from scipy import stats, integrate

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import cufflinks as cf
import plotly
import plotly.offline as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

import seaborn as sns
sns.set(color_codes=True)
#sns.set_context('poster')
plot_kwds = {'alpha': 0.25, 's': 80, 'linewidths': 0}

import sklearn.cluster as cluster

print(pd.__version__)
In [5]:
df = pd.read_csv('iris.csv')  # sep=',' and header='infer' are already the defaults
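In [ ]:
# A minimal fallback sketch (not part of the original flow), assuming iris.csv
# might be missing: build an equivalent DataFrame, with the same lowercase
# column names used below, from scikit-learn's bundled copy of the iris data.
from sklearn.datasets import load_iris
iris = load_iris()
fallback = pd.DataFrame(iris.data, columns=['sepallength', 'sepalwidth',
                                            'petallength', 'petalwidth'])
fallback['species'] = [iris.target_names[t] for t in iris.target]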
In [6]:
df.head(5)
Out[6]: (first five rows of df)
In [7]:
df.tail(5)
Out[7]: (last five rows of df)
In [8]:
df.describe()
Out[8]: (summary statistics for the four numeric columns)
In [9]:
df.dtypes
Out[9]: (dtypes: the four measurement columns are float64, species is object)
In [10]:
df.columns
Out[10]: (Index listing the five column names)
In [11]:
# pull each column out as a NumPy array
sl = df.sepallength.values
sw = df.sepalwidth.values
sp = df.species.values
pl = df.petallength.values
pw = df.petalwidth.values
In [12]:
plt.plot(sl,sw,"bo")
plt.xlabel("Sepal Length")
plt.ylabel("Sepal Width")
plt.show()
In [13]:
plt.plot(pl,pw,"go")
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()
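In [ ]:
# A small extra sketch: the same petal scatter, colored by the known species
# labels. The three groups visible here are what the clustering algorithms
# below should recover without ever seeing the labels.
for name, grp in df.groupby('species'):
    plt.plot(grp.petallength, grp.petalwidth, 'o', label=name)
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.legend()
plt.show()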
In [14]:
# helper: run a clustering algorithm on the data, time it, and plot the result
def plot_clusters(data, algorithm, args, kwds):
    start_time = time.time()
    labels = algorithm(*args, **kwds).fit_predict(data)
    end_time = time.time()
    # one palette color per cluster; noise points (label -1) are drawn in black
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
    # only the first two feature columns are plotted; clustering uses all four
    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(algorithm.__name__), fontsize=24)
    # axes coordinates keep the timing text inside the plot regardless of data range
    plt.text(0.02, 0.92, 'Clustering took {:.2f} s'.format(end_time - start_time),
             fontsize=14, transform=frame.transAxes)
In [15]:
# let us massage our data into shape: drop the label column, keep a plain array
cdf = df.drop(columns='species')   # df.drop('species', 1) is deprecated
cdf.head(5)
cdf = cdf.to_numpy()               # .as_matrix() was removed from pandas
In [16]:
# centroid-based clustering
# k-means (Lloyd's algorithm)
#  assign each data point to the nearest centroid under a distance metric
#  recompute the centroid of each cluster; repeat the assignment step
#  the algorithm has converged when no data point changes cluster affiliation
plot_clusters(cdf, cluster.KMeans, (), {'n_clusters': 3})
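In [ ]:
# A bare-bones sketch of Lloyd's algorithm itself, to make the two steps in
# the comments above concrete. This is illustrative only (not sklearn's
# implementation) and assumes no cluster ever goes empty during the loop.
def lloyd(X, k, n_iter=100, seed=0):
    rng = np.random.RandomState(seed)
    centroids = X[rng.choice(len(X), k, replace=False)]  # random initial centroids
    for _ in range(n_iter):
        # step 1: assign each point to the nearest centroid (squared Euclidean)
        labels = np.argmin(((X[:, None] - centroids) ** 2).sum(axis=2), axis=1)
        # step 2: recompute each centroid as the mean of its assigned points
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centroids, centroids):  # converged: assignments stable
            break
        centroids = new_centroids
    return labels, centroids

lloyd_labels, _ = lloyd(cdf, 3)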
In [27]:
# density-based clustering
# DBSCAN
#  groups together points that have many close neighbors
#  no need to specify the number of clusters up front; finds arbitrarily shaped clusters; robust to outliers
#  depends on the distance metric, the neighborhood radius eps, and the min_samples threshold
plot_clusters(cdf, cluster.DBSCAN, (), {'eps': 0.5})
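In [ ]:
# DBSCAN labels noise points as -1, so the number of clusters is not simply
# the number of distinct labels. A quick sketch of how many clusters and
# noise points eps=0.5 produces (min_samples is left at its default of 5):
db_labels = cluster.DBSCAN(eps=0.5).fit_predict(cdf)
n_clusters = len(set(db_labels)) - (1 if -1 in db_labels else 0)
print('clusters: {}, noise points: {}'.format(n_clusters, (db_labels == -1).sum()))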
In [32]:
# graph-based clustering
# affinity propagation
#  exchanges messages that quantify the similarity between pairs of points; similar points collect around a shared exemplar in the same cluster
#  can be expensive on large or complex networks
plot_clusters(cdf, cluster.AffinityPropagation, (), {'preference': -9.0, 'damping': 0.95})
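In [ ]:
# Affinity propagation chooses the number of clusters itself; the 'preference'
# parameter steers that choice (lower values -> fewer exemplars). A quick
# sketch of that dependence, with -50.0 as an arbitrary second value:
for pref in (-9.0, -50.0):
    ap = cluster.AffinityPropagation(preference=pref, damping=0.95).fit(cdf)
    print('preference={}: {} clusters'.format(pref, len(ap.cluster_centers_indices_)))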
In [37]:
# kernel-based clustering
# mean shift
#  compute the mean shift vector, which points from each candidate centroid toward the densest nearby region of points
#  move ("shift") the centroid toward that higher-density region and repeat until it settles
#  depends on a kernel (a weighting function) and its bandwidth rather than a plain distance metric
plot_clusters(cdf, cluster.MeanShift, (0.2,), {'cluster_all': False})
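In [ ]:
# The 0.2 above is a hand-picked bandwidth for the kernel window; sklearn can
# also estimate a bandwidth from the data. A sketch of that route (quantile=0.2
# is an arbitrary choice here):
bw = cluster.estimate_bandwidth(cdf, quantile=0.2)
print('estimated bandwidth: {:.3f}'.format(bw))
plot_clusters(cdf, cluster.MeanShift, (bw,), {'cluster_all': False})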
In [39]:
# spectral clustering: a graph-partitioning approach
#  normalized-cuts (Shi-Malik) algorithm
#  partitions points into two or more sets based on the spectrum (eigenvalues) of a similarity matrix of the data
#  depends on the initial choice of clusters (you must tell the algorithm how many cuts to make) and on the choice of similarity matrix
plot_clusters(cdf, cluster.SpectralClustering, (), {'n_clusters': 3})
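In [ ]:
# Since the true species labels are known for iris, a closing sketch that
# scores a few of the clusterings above with the adjusted Rand index
# (1.0 means perfect agreement with the species labels):
from sklearn.metrics import adjusted_rand_score
for algo, kwds in [(cluster.KMeans, {'n_clusters': 3}),
                   (cluster.DBSCAN, {'eps': 0.5}),
                   (cluster.SpectralClustering, {'n_clusters': 3})]:
    pred = algo(**kwds).fit_predict(cdf)
    print('{}: ARI = {:.2f}'.format(algo.__name__, adjusted_rand_score(sp, pred)))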