In [4]:
# All imports and library setup here
import time

import numpy as np
import pandas as pd
from scipy import stats, integrate

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import cufflinks as cf
import plotly
import plotly.offline as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

import seaborn as sns
sns.set(color_codes=True)
#sns.set_context('poster')
plot_kwds = {'alpha': 0.25, 's': 80, 'linewidths': 0}

import sklearn.cluster as cluster

print(pd.__version__)
In [5]:
df = pd.read_csv('iris.csv')  # sep=',' and header='infer' are already the defaults
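In [ ]:
# A minimal fallback sketch (not part of the original flow), assuming iris.csv
# might be missing: build an equivalent DataFrame, with the same lowercase
# column names used below, from scikit-learn's bundled copy of the iris data.
from sklearn.datasets import load_iris
iris = load_iris()
fallback = pd.DataFrame(iris.data, columns=['sepallength', 'sepalwidth',
                                            'petallength', 'petalwidth'])
fallback['species'] = [iris.target_names[t] for t in iris.target]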
In [6]:
df.head(5)
Out[6]: (first five rows of df)
In [7]:
df.tail(5)
Out[7]: (last five rows of df)
In [8]:
df.describe()
Out[8]: (summary statistics for the four numeric columns)
In [9]:
df.dtypes
Out[9]: (dtypes: the four measurement columns are float64, species is object)
In [10]:
df.columns
Out[10]: (Index listing the five column names)
In [11]:
# pull each column out as a NumPy array
sl = df.sepallength.values
sw = df.sepalwidth.values
sp = df.species.values
pl = df.petallength.values
pw = df.petalwidth.values
In [12]:
plt.plot(sl,sw,"bo")
plt.xlabel("Sepal Length")
plt.ylabel("Sepal Width")
plt.show()
In [13]:
plt.plot(pl,pw,"go")
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.show()
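In [ ]:
# A small extra sketch: the same petal scatter, colored by the known species
# labels. The three groups visible here are what the clustering algorithms
# below should recover without ever seeing the labels.
for name, grp in df.groupby('species'):
    plt.plot(grp.petallength, grp.petalwidth, 'o', label=name)
plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.legend()
plt.show()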
In [14]:
# helper: run a clustering algorithm on the data, time it, and plot the result
def plot_clusters(data, algorithm, args, kwds):
    start_time = time.time()
    labels = algorithm(*args, **kwds).fit_predict(data)
    end_time = time.time()
    # one palette color per cluster; noise points (label -1) are drawn in black
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]
    # only the first two feature columns are plotted; clustering uses all four
    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(algorithm.__name__), fontsize=24)
    # axes coordinates keep the timing text inside the plot regardless of data range
    plt.text(0.02, 0.92, 'Clustering took {:.2f} s'.format(end_time - start_time),
             fontsize=14, transform=frame.transAxes)
In [15]:
# let us massage our data into shape: drop the label column, keep a plain array
cdf = df.drop(columns='species')   # df.drop('species', 1) is deprecated
cdf.head(5)
cdf = cdf.to_numpy()               # .as_matrix() was removed from pandas
In [16]:
# centroid-based clustering
# k-means (Lloyd's algorithm)
#  assign each data point to the nearest centroid under a distance metric
#  recompute the centroid of each cluster; repeat the assignment step
#  the algorithm has converged when no data point changes cluster affiliation
plot_clusters(cdf, cluster.KMeans, (), {'n_clusters': 3})
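In [ ]:
# A bare-bones sketch of Lloyd's algorithm itself, to make the two steps in
# the comments above concrete. This is illustrative only (not sklearn's
# implementation) and assumes no cluster ever goes empty during the loop.
def lloyd(X, k, n_iter=100, seed=0):
    rng = np.random.RandomState(seed)
    centroids = X[rng.choice(len(X), k, replace=False)]  # random initial centroids
    for _ in range(n_iter):
        # step 1: assign each point to the nearest centroid (squared Euclidean)
        labels = np.argmin(((X[:, None] - centroids) ** 2).sum(axis=2), axis=1)
        # step 2: recompute each centroid as the mean of its assigned points
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centroids, centroids):  # converged: assignments stable
            break
        centroids = new_centroids
    return labels, centroids

lloyd_labels, _ = lloyd(cdf, 3)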
In [27]:
# density-based clustering
# DBSCAN
#  groups together points that have many close neighbors
#  no need to specify the number of clusters up front; finds arbitrarily shaped clusters; robust to outliers
#  depends on the distance metric, the neighborhood radius eps, and the min_samples threshold
plot_clusters(cdf, cluster.DBSCAN, (), {'eps': 0.5})
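In [ ]:
# DBSCAN labels noise points as -1, so the number of clusters is not simply
# the number of distinct labels. A quick sketch of how many clusters and
# noise points eps=0.5 produces (min_samples is left at its default of 5):
db_labels = cluster.DBSCAN(eps=0.5).fit_predict(cdf)
n_clusters = len(set(db_labels)) - (1 if -1 in db_labels else 0)
print('clusters: {}, noise points: {}'.format(n_clusters, (db_labels == -1).sum()))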
In [32]:
# graph-based clustering
# affinity propagation
#  exchanges messages that quantify the similarity between pairs of points; similar points collect around a shared exemplar in the same cluster
#  can be expensive on large or complex networks
plot_clusters(cdf, cluster.AffinityPropagation, (), {'preference': -9.0, 'damping': 0.95})
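In [ ]:
# Affinity propagation chooses the number of clusters itself; the 'preference'
# parameter steers that choice (lower values -> fewer exemplars). A quick
# sketch of that dependence, with -50.0 as an arbitrary second value:
for pref in (-9.0, -50.0):
    ap = cluster.AffinityPropagation(preference=pref, damping=0.95).fit(cdf)
    print('preference={}: {} clusters'.format(pref, len(ap.cluster_centers_indices_)))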
In [37]:
# kernel-based clustering
# mean shift
#  compute the mean shift vector, which points from each candidate centroid toward the densest nearby region of points
#  move ("shift") the centroid toward that higher-density region and repeat until it settles
#  depends on a kernel (a weighting function) and its bandwidth rather than a plain distance metric
plot_clusters(cdf, cluster.MeanShift, (0.2,), {'cluster_all': False})
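In [ ]:
# The 0.2 above is a hand-picked bandwidth for the kernel window; sklearn can
# also estimate a bandwidth from the data. A sketch of that route (quantile=0.2
# is an arbitrary choice here):
bw = cluster.estimate_bandwidth(cdf, quantile=0.2)
print('estimated bandwidth: {:.3f}'.format(bw))
plot_clusters(cdf, cluster.MeanShift, (bw,), {'cluster_all': False})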
In [39]:
# spectral clustering: a graph-partitioning approach
#  normalized-cuts (Shi-Malik) algorithm
#  partitions points into two or more sets based on the spectrum (eigenvalues) of a similarity matrix of the data
#  depends on the initial choice of clusters (you must tell the algorithm how many cuts to make) and on the choice of similarity matrix
plot_clusters(cdf, cluster.SpectralClustering, (), {'n_clusters': 3})
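In [ ]:
# Since the true species labels are known for iris, a closing sketch that
# scores a few of the clusterings above with the adjusted Rand index
# (1.0 means perfect agreement with the species labels):
from sklearn.metrics import adjusted_rand_score
for algo, kwds in [(cluster.KMeans, {'n_clusters': 3}),
                   (cluster.DBSCAN, {'eps': 0.5}),
                   (cluster.SpectralClustering, {'n_clusters': 3})]:
    pred = algo(**kwds).fit_predict(cdf)
    print('{}: ARI = {:.2f}'.format(algo.__name__, adjusted_rand_score(sp, pred)))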