notebook.community

Edit and run



In [62]:

    
#all imports, library setups here
import pandas as pd
import numpy as np
import matplotlib

from scipy import stats, integrate

import matplotlib.pyplot as plt
%matplotlib inline
import cufflinks as cf

import plotly
plotly.offline.init_notebook_mode()
import plotly.offline as py
import plotly.graph_objs as go
from plotly.graph_objs import *
print(pd.__version__)

import seaborn as sns
# Switch on seaborn's default theme; color_codes=True asks seaborn to remap the
# matplotlib shorthand colour codes ("b", "g", ...) to its own palette.
sns.set(color_codes=True)
#sns.set_context('poster')
# Shared scatter styling: plot_clusters unpacks this dict into plt.scatter.
plot_kwds = {'alpha' : 0.25, 's' : 80, 'linewidths':0}

import sklearn.cluster as cluster
import time



In [5]:

    
# Read the iris table from a CSV in the working directory.
# Comma separator and header inference are read_csv defaults, so they are not spelled out.
df = pd.read_csv("iris.csv")



In [6]:

    
# Preview the first five rows (five is head()'s default).
df.head()









    Out[6]:






  
    
      
      sepallength
      sepalwidth
      petallength
      petalwidth
      species
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      Iris-setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      Iris-setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      Iris-setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      Iris-setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      Iris-setosa



In [7]:

    
# Preview the last five rows (five is tail()'s default).
df.tail()









    Out[7]:






  
    
      
      sepallength
      sepalwidth
      petallength
      petalwidth
      species
    
  
  
    
      145
      6.7
      3.0
      5.2
      2.3
      Iris-virginica
    
    
      146
      6.3
      2.5
      5.0
      1.9
      Iris-virginica
    
    
      147
      6.5
      3.0
      5.2
      2.0
      Iris-virginica
    
    
      148
      6.2
      3.4
      5.4
      2.3
      Iris-virginica
    
    
      149
      5.9
      3.0
      5.1
      1.8
      Iris-virginica



In [12]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()









    Out[12]:






  
    
      
      sepallength
      sepalwidth
      petallength
      petalwidth
    
  
  
    
      count
      150.000000
      150.000000
      150.000000
      150.000000
    
    
      mean
      5.843333
      3.057333
      3.758000
      1.199333
    
    
      std
      0.828066
      0.435866
      1.765298
      0.762238
    
    
      min
      4.300000
      2.000000
      1.000000
      0.100000
    
    
      25%
      5.100000
      2.800000
      1.600000
      0.300000
    
    
      50%
      5.800000
      3.000000
      4.350000
      1.300000
    
    
      75%
      6.400000
      3.300000
      5.100000
      1.800000
    
    
      max
      7.900000
      4.400000
      6.900000
      2.500000



In [13]:

    
# Check the dtype pandas assigned to each column after loading.
df.dtypes









    Out[13]:





sepallength    float64
sepalwidth     float64
petallength    float64
petalwidth     float64
species         object
dtype: object



In [14]:

    
# List the column labels exactly as they appear in the frame.
df.columns









    Out[14]:





Index(['sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species'], dtype='object')



In [15]:

    
# Pull each column out of the frame as a plain NumPy array.
sl = df['sepallength'].values
sw = df['sepalwidth'].values
pl = df['petallength'].values
pw = df['petalwidth'].values

sp = df['species'].values



In [16]:

    
# Sepal length against sepal width, drawn as blue circle markers.
fig, ax = plt.subplots()
ax.plot(sl, sw, "bo")
ax.set_xlabel("Sepal Length")
ax.set_ylabel("Sepal Width")
plt.show()



In [17]:

    
# Petal length against petal width, drawn as green circle markers.
fig, ax = plt.subplots()
ax.plot(pl, pw, "go")
ax.set_xlabel("Petal Length")
ax.set_ylabel("Petal Width")
plt.show()



In [18]:

    
# Rows: sepal width, columns: species, cells: sepal length aggregated with
# pivot_table's default (mean). Plotted as one line per species.
pivoted = df.pivot_table(index='sepalwidth', columns='species', values='sepallength')
pivoted.plot()









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x119d9cac8>



In [19]:

    
# Rows: petal width, columns: species, cells: petal length aggregated with
# pivot_table's default (mean). Plotted as one line per species.
pivoted = df.pivot_table(index='petalwidth', columns='species', values='petallength')
pivoted.plot()









    Out[19]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a1a54a8>



In [20]:

    
# One jittered strip of sepal-length points per species.
# NOTE(review): jitter=1 is kept literally; newer seaborn releases may treat 1
# as a jitter width rather than as "True" -- confirm against the installed version.
sns.stripplot(data=df, x="sepallength", y="species", jitter=1)









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a1e1898>



In [21]:

    
# Horizontal box plot of sepal length, one box per species.
sns.boxplot(data=df, x="sepallength", y="species")









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a8e6278>



In [22]:

    
#write cluster plot function to help us
def plot_clusters(data, algorithm, args=(), kwds=None):
    """Cluster ``data`` with ``algorithm`` and scatter-plot the labelled points.

    Parameters
    ----------
    data : array
        Observations to cluster. It is passed unchanged to ``fit_predict`` and
        then indexed as ``data.T[0]`` / ``data.T[1]``, so only its first two
        columns are drawn. The notebook passes a 2-D NumPy array.
    algorithm : class
        Estimator class; it is instantiated as ``algorithm(*args, **kwds)`` and
        must expose ``fit_predict`` and ``__name__``.
    args : sequence, optional
        Positional arguments for the estimator constructor.
    kwds : dict, optional
        Keyword arguments for the estimator constructor.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib axes.
    """
    if kwds is None:
        kwds = {}

    # perf_counter is monotonic, so the elapsed time cannot go negative if the
    # wall clock is adjusted while the estimator is fitting.
    start_time = time.perf_counter()
    labels = algorithm(*args, **kwds).fit_predict(data)
    end_time = time.perf_counter()

    # One palette entry per non-negative label; negative labels (e.g. the -1
    # DBSCAN uses for noise) are drawn in black instead.
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)

    frame = plt.gca()
    frame.get_xaxis().set_visible(False)
    frame.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    # Anchor the caption in axes coordinates (0..1) so it stays inside the plot
    # whatever the data range is; the previous data-space position (-0.5, 0.7)
    # falls outside the axes for the iris measurements.
    frame.text(0.02, 0.02, 'Clustering took {:.2f} s'.format(end_time - start_time),
               fontsize=14, transform=frame.transAxes)



In [45]:

    
#let us massage our data into shape
# Drop the string label column so only the numeric measurements are clustered,
# then take the underlying NumPy array because plot_clusters indexes data.T[0].
# axis is passed by keyword (the positional form was removed in pandas 2.0) and
# .values replaces as_matrix(), which was removed in pandas 1.0.
cdf = df.drop('species', axis=1).values



In [60]:

    
#introduction to clustering
#k-means: centroid based clustering
#step 1: specify the number of clusters k
#step 2: choose k initial centroids
#step 3: assign each point to its nearest centroid (this is what reduces the
#        within-cluster sum of squares, WCSS)
#step 4: recompute each centroid as the mean of the points assigned to it,
#        then repeat steps 3-4 until the assignments stop changing
# what are the advantages and disadvantages

# The centroid initialisation is random; fixing random_state makes the clusters
# (and their colours) identical on every Restart & Run All.
plot_clusters(cdf, cluster.KMeans, (), {'n_clusters':3, 'random_state':0})



In [58]:

    
#DBSCAN
#eps = the largest distance at which two points still count as neighbours
plot_clusters(cdf, cluster.DBSCAN, (), dict(eps=0.5))



In [70]:

    
#graph based clustering: affinity propagation
plot_clusters(
    cdf,
    cluster.AffinityPropagation,
    (),
    dict(preference=-9.0, damping=0.80),
)

	sepallength	sepalwidth	petallength	petalwidth	species
0	5.1	3.5	1.4	0.2	Iris-setosa
1	4.9	3.0	1.4	0.2	Iris-setosa
2	4.7	3.2	1.3	0.2	Iris-setosa
3	4.6	3.1	1.5	0.2	Iris-setosa
4	5.0	3.6	1.4	0.2	Iris-setosa

	sepallength	sepalwidth	petallength	petalwidth	species
145	6.7	3.0	5.2	2.3	Iris-virginica
146	6.3	2.5	5.0	1.9	Iris-virginica
147	6.5	3.0	5.2	2.0	Iris-virginica
148	6.2	3.4	5.4	2.3	Iris-virginica
149	5.9	3.0	5.1	1.8	Iris-virginica

	sepallength	sepalwidth	petallength	petalwidth
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.057333	3.758000	1.199333
std	0.828066	0.435866	1.765298	0.762238
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000