In [4]:
#all imports, library setups here
import pandas as pd
import numpy as np
import matplotlib

from scipy import stats, integrate

import matplotlib.pyplot as plt
%matplotlib inline
import cufflinks as cf

import plotly
plotly.offline.init_notebook_mode()
import plotly.offline as py
import plotly.graph_objs as go
from plotly.graph_objs import *
print(pd.__version__)

import seaborn as sns
# Switch on seaborn's styling for the matplotlib figures drawn below
sns.set(color_codes=True)
#sns.set_context('poster')
# Marker styling (transparency, size, edge width) that plot_clusters
# forwards to plt.scatter as keyword arguments
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

import sklearn.cluster as cluster
import time


0.18.1

In [5]:
# Load the iris measurements from a comma-separated file in the working
# directory; column names are inferred from the file itself
df = pd.read_csv(
    'iris.csv',
    sep=",",
    header='infer',
)

In [6]:
# Peek at the first five rows (five is head()'s default row count)
df.head()


Out[6]:
sepallength sepalwidth petallength petalwidth species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

In [7]:
# Peek at the last five rows (five is tail()'s default row count)
df.tail()


Out[7]:
sepallength sepalwidth petallength petalwidth species
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# four numeric measurement columns
df.describe()


Out[8]:
sepallength sepalwidth petallength petalwidth
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333
std 0.828066 0.435866 1.765298 0.762238
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000

In [9]:
# Check the storage type of each column: the measurements are floats,
# species is a generic object column
df.dtypes


Out[9]:
sepallength    float64
sepalwidth     float64
petallength    float64
petalwidth     float64
species         object
dtype: object

In [10]:
# List the column labels used to address the data below
df.columns


Out[10]:
Index(['sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species'], dtype='object')

In [11]:
# Pull each measurement column out as a NumPy array for plotting
sl = df['sepallength'].values
sw = df['sepalwidth'].values
pl = df['petallength'].values
pw = df['petalwidth'].values

# Species labels, kept alongside the measurements
sp = df['species'].values

In [12]:
# Sepal length against sepal width, one blue circle per row
fig, ax = plt.subplots()
ax.plot(sl, sw, "bo")
ax.set_xlabel("Sepal Length")
ax.set_ylabel("Sepal Width")
plt.show()



In [13]:
# Petal length against petal width, one green circle per row
fig, ax = plt.subplots()
ax.plot(pl, pw, "go")
ax.set_xlabel("Petal Length")
ax.set_ylabel("Petal Width")
plt.show()



In [14]:
#write cluster plot function to help us
def plot_clusters(data, algorithm, args, kwds):
    """Cluster ``data`` with ``algorithm`` and scatter-plot the labelled points.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Observations to cluster. It is handed to the estimator unchanged;
        for plotting it is converted with ``np.asarray``, so a DataFrame
        or nested list works as well as an ndarray. Only the first two
        columns are drawn.
    algorithm : class
        Estimator class whose instances provide ``fit_predict``.
    args : tuple
        Positional arguments used to construct ``algorithm``.
    kwds : dict
        Keyword arguments used to construct ``algorithm``.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib axes.

    Notes
    -----
    Points with a negative label are drawn in black; every other label
    gets its own entry of seaborn's 'deep' palette. Marker styling comes
    from the module-level ``plot_kwds`` dictionary.
    """
    start_time = time.time()
    labels = np.asarray(algorithm(*args, **kwds).fit_predict(data))
    elapsed = time.time() - start_time

    # One palette entry per non-negative label (labels start at 0).
    palette = sns.color_palette('deep', int(labels.max()) + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

    # Positional column access: ``data.T[0]`` is a label lookup on a
    # DataFrame and only worked for plain arrays.
    points = np.asarray(data)
    ax = plt.gca()
    ax.scatter(points[:, 0], points[:, 1], c=colors, **plot_kwds)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    ax.set_title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)

    # Anchor the timing note to the axes frame (fractions of its width and
    # height) rather than to fixed data coordinates, so it stays inside
    # the plot whatever value range the data covers.
    ax.text(0.02, 0.98, 'Clustering took {:.2f} s'.format(elapsed),
            fontsize=14, transform=ax.transAxes, ha='left', va='top')

In [15]:
#let us massage our data into shape
# Drop the species label so only the four numeric measurement columns are
# left, and hand them over as a plain NumPy array.
# - axis is named explicitly: newer pandas no longer accepts it positionally
# - .values stands in for DataFrame.as_matrix(), which newer pandas removed
cdf = df.drop('species', axis=1).values

In [16]:
#centroid based clustering
#k-means (Lloyd's algorithm)
# assign data points to the nearest cluster based on a distance metric
# calculate new centroids of the clusters; repeat the first step
# the algorithm has converged when data points no longer change cluster affiliation
# random_state pins the random centroid initialisation so re-running the
# notebook reproduces the same clusters and colours
plot_clusters(cdf, cluster.KMeans, (), {'n_clusters': 3, 'random_state': 0})



In [27]:
# Density-based clustering: DBSCAN
# - groups together points that have many near neighbours
# - needs no initial cluster specification, finds arbitrarily shaped
#   clusters and is robust to outliers
# - depends on the distance metric and the minimum distance threshold
dbscan_params = {'eps': 0.5}
plot_clusters(cdf, cluster.DBSCAN, (), dbscan_params)



In [32]:
# Graph-based clustering: affinity propagation
# - quantifies the similarity between node pairs; similar node pairs end
#   up in the same cluster
# - complex networks can be challenging
affinity_params = {'preference': -9.0, 'damping': 0.95}
plot_clusters(cdf, cluster.AffinityPropagation, (), affinity_params)



In [37]:
# kernel based
#meanshift
# determine the mean shift vector [with lots of similar points along this vector space]
# move ("shift") the mean towards the higher density region
# dependence on a kernel instead of a distance metric [kernel = weighting function]
# bandwidth is passed by keyword: current scikit-learn declares MeanShift's
# constructor arguments keyword-only, so the positional form raises TypeError
plot_clusters(cdf, cluster.MeanShift, (), {'bandwidth': 0.2, 'cluster_all': False})



In [39]:
#spectral clustering; kind of hierarchical clustering
# normalized-cuts or Shi-Malik algorithm
# partitions points into two or more sets depending on differences in the spectrum (eigenvalues) of the similarity matrix of the data
# dependent on the initial choice of clusters: you have to tell the algorithm how to cut, and on the choice of similarity matrix
# random_state pins the randomised label-assignment step so re-running the
# notebook reproduces the same clusters and colours
plot_clusters(cdf, cluster.SpectralClustering, (), {'n_clusters': 3, 'random_state': 0})