In [62]:
#all imports, library setups here
import pandas as pd
import numpy as np
import matplotlib

from scipy import stats, integrate

import matplotlib.pyplot as plt
%matplotlib inline
import cufflinks as cf

import plotly
plotly.offline.init_notebook_mode()
import plotly.offline as py
import plotly.graph_objs as go
from plotly.graph_objs import *
print(pd.__version__)

import seaborn as sns
sns.set(color_codes=True)
#sns.set_context('poster')
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

import sklearn.cluster as cluster
import time


0.18.1

In [5]:
#loading iris.csv
# The file is a plain comma-separated table whose first row holds the column
# names, which is what read_csv assumes when no separator/header is given.
df = pd.read_csv('iris.csv')

In [6]:
# Peek at the first five rows (five is head()'s default).
df.head()


Out[6]:
sepallength sepalwidth petallength petalwidth species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

In [7]:
# Peek at the last five rows (five is tail()'s default).
df.tail()


Out[7]:
sepallength sepalwidth petallength petalwidth species
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Out[12]:
sepallength sepalwidth petallength petalwidth
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333
std 0.828066 0.435866 1.765298 0.762238
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000

In [13]:
# Storage type of each column: the four measurements are floats, species is a string label.
df.dtypes


Out[13]:
sepallength    float64
sepalwidth     float64
petallength    float64
petalwidth     float64
species         object
dtype: object

In [14]:
# Column labels of the loaded frame.
df.columns


Out[14]:
Index(['sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species'], dtype='object')

In [15]:
# Pull every column out as a NumPy array under a short name:
# sl/sw = sepal length/width, sp = species label, pl/pw = petal length/width.
sl, sw, sp, pl, pw = (
    df[column].values
    for column in ('sepallength', 'sepalwidth', 'species', 'petallength', 'petalwidth')
)

In [16]:
# Sepal length against sepal width, one blue dot per flower.
# Work on the current Axes directly; this is the object the pyplot helpers act on.
sepal_ax = plt.gca()
sepal_ax.plot(sl, sw, "bo")
sepal_ax.set_xlabel("Sepal Length")
sepal_ax.set_ylabel("Sepal Width")
plt.show()



In [17]:
# Petal length against petal width, one green dot per flower.
# Work on the current Axes directly; this is the object the pyplot helpers act on.
petal_ax = plt.gca()
petal_ax.plot(pl, pw, "go")
petal_ax.set_xlabel("Petal Length")
petal_ax.set_ylabel("Petal Width")
plt.show()



In [18]:
# One row per distinct sepal width, one column per species; each cell holds the
# aggregated sepal length (pivot_table's default aggregation) for that pair.
pivoted = df.pivot_table(index='sepalwidth', columns='species', values='sepallength')
pivoted.plot()


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x119d9cac8>

In [19]:
# One row per distinct petal width, one column per species; each cell holds the
# aggregated petal length (pivot_table's default aggregation) for that pair.
pivoted = df.pivot_table(index='petalwidth', columns='species', values='petallength')
pivoted.plot()


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a1a54a8>

In [20]:
# Sepal length of every flower, one strip per species; jitter spreads
# overlapping points apart within each strip.
sns.stripplot(data=df, x="sepallength", y="species", jitter=1)


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a1e1898>

In [21]:
# Distribution of sepal length per species as horizontal box plots.
sns.boxplot(data=df, x="sepallength", y="species")


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a8e6278>

In [22]:
#write cluster plot function to help us
def plot_clusters(data, algorithm, args, kwds):
    """Cluster ``data`` with ``algorithm`` and scatter-plot the result.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Observations to cluster. It is handed unchanged to ``fit_predict``;
        only its first two columns are drawn.
    algorithm : class
        Clusterer class (not an instance). It is instantiated as
        ``algorithm(*args, **kwds)`` and must provide ``fit_predict``.
    args : sequence
        Positional arguments for the clusterer's constructor.
    kwds : dict
        Keyword arguments for the clusterer's constructor.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib axes, styled with the
        module-level ``plot_kwds``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    labels = np.asarray(algorithm(*args, **kwds).fit_predict(data))
    elapsed = time.perf_counter() - start_time

    # One palette entry per non-negative label. Negative labels (e.g. the -1
    # that DBSCAN assigns to noise points) are drawn in black instead.
    n_colors = int(labels.max()) + 1 if labels.size else 0
    palette = sns.color_palette('deep', n_colors)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

    # Coerce only for plotting, so DataFrames and nested lists work as well as
    # ndarrays; the clusterer above still received the caller's original object.
    points = np.asarray(data)
    plt.scatter(points[:, 0], points[:, 1], c=colors, **plot_kwds)

    frame = plt.gca()
    frame.get_xaxis().set_visible(False)
    frame.get_yaxis().set_visible(False)
    frame.set_title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    # Anchor the timing label in axes-fraction coordinates (top-left corner) so
    # it stays inside the plot whatever the value range of the data is.
    frame.text(0.02, 0.98, 'Clustering took {:.2f} s'.format(elapsed),
               fontsize=14, transform=frame.transAxes,
               horizontalalignment='left', verticalalignment='top')

In [45]:
#let us massage our data into shape
# Keep only the four numeric measurements as a plain NumPy array: the
# clusterers work on numbers, and the species label is the grouping we want
# them to discover on their own.
# `axis=1` is passed by keyword and `.values` is used instead of `as_matrix()`
# so the cell runs on both old and current pandas releases.
cdf = df.drop('species', axis=1).values

In [60]:
#introduction to clustering
#k-means: centroid based clustering
#step 1: specify the number of clusters k
#step 2: pick k initial centroids
#step 3: assign every point to its nearest centroid
#step 4: recompute each centroid as the mean of the points assigned to it
#step 5: repeat steps 3-4 until the assignments stop changing; every pass
#        lowers the within-cluster sum of squares (WCSS)
# what are the advantages and disadvantages?

# random_state fixes the random centroid initialisation, so re-running the
# cell reproduces the same clustering.
plot_clusters(cdf, cluster.KMeans, (), {'n_clusters':3, 'random_state':0})



In [58]:
#DBSCAN (density based)
#eps = maximum distance between two points for them to count as neighbours
dbscan_options = dict(eps=0.5)
plot_clusters(cdf, cluster.DBSCAN, (), dbscan_options)



In [70]:
#graph based: affinity propagation
# Constructor settings are collected first, then handed to the shared plotting helper.
affinity_options = dict(preference=-9.0, damping=0.80)
plot_clusters(cdf, cluster.AffinityPropagation, (), affinity_options)