In [62]:
#all imports, library setups here
import pandas as pd
import numpy as np
import matplotlib
from scipy import stats, integrate
import matplotlib.pyplot as plt
%matplotlib inline
import cufflinks as cf
import plotly
plotly.offline.init_notebook_mode()
import plotly.offline as py
import plotly.graph_objs as go
from plotly.graph_objs import *
print(pd.__version__)
import seaborn as sns
# Apply seaborn's default styling and let matplotlib colour shorthands
# ("b", "g", ...) resolve to the seaborn palette.
sns.set(color_codes=True)
# Optional: uncomment for larger, presentation-sized plot elements.
# sns.set_context('poster')
# Keyword arguments shared by the scatter plot drawn in plot_clusters.
plot_kwds = dict(alpha=0.25, s=80, linewidths=0)
import sklearn.cluster as cluster
import time
In [5]:
# Read the iris dataset from iris.csv in the working directory.
# A comma separator and an inferred header row are read_csv's defaults.
df = pd.read_csv('iris.csv')
In [6]:
# Peek at the first five rows (five is head()'s default).
df.head()
Out[6]:
In [7]:
# Peek at the last five rows (five is tail()'s default).
df.tail()
Out[7]:
In [12]:
# Summary statistics of df as computed by DataFrame.describe().
df.describe()
Out[12]:
In [13]:
# Show the dtype of each column.
df.dtypes
Out[13]:
In [14]:
# Show the column labels.
df.columns
Out[14]:
In [15]:
# Pull each column out as a plain array for the matplotlib plots below.
sl = df['sepallength'].values   # sepal length
sw = df['sepalwidth'].values    # sepal width
sp = df['species'].values       # species label
pl = df['petallength'].values   # petal length
pw = df['petalwidth'].values    # petal width
In [16]:
# Sepal width against sepal length, drawn as blue circle markers.
ax = plt.gca()
ax.plot(sl, sw, "bo")
ax.set_xlabel("Sepal Length")
ax.set_ylabel("Sepal Width")
plt.show()
In [17]:
# Petal width against petal length, drawn as green circle markers.
ax = plt.gca()
ax.plot(pl, pw, "go")
ax.set_xlabel("Petal Length")
ax.set_ylabel("Petal Width")
plt.show()
In [18]:
# Sepal length aggregated (pivot_table's default aggregation) for every
# sepal-width value, with one column -- and therefore one line -- per species.
pivoted = df.pivot_table(
    index='sepalwidth',
    columns='species',
    values='sepallength',
)
pivoted.plot()
Out[18]:
In [19]:
# Petal length aggregated (pivot_table's default aggregation) for every
# petal-width value, with one column -- and therefore one line -- per species.
pivoted = df.pivot_table(
    index='petalwidth',
    columns='species',
    values='petallength',
)
pivoted.plot()
Out[19]:
In [20]:
# Strip plot of sepal length per species.
# jitter=True asks for seaborn's default jitter amount. The previous jitter=1
# only meant "default" on older seaborn releases; newer ones read a numeric 1
# as a jitter of a full category width, which smears the species rows together.
sns.stripplot(x="sepallength", y="species", data=df, jitter=True)
Out[20]:
In [21]:
# Box plot of the sepal-length distribution, one horizontal box per species.
sns.boxplot(data=df, x="sepallength", y="species")
Out[21]:
In [22]:
# Helper that runs a clustering algorithm and plots the clusters it finds.
def plot_clusters(data, algorithm, args, kwds):
    """Cluster ``data`` with ``algorithm`` and scatter-plot the labelled points.

    Parameters
    ----------
    data : array
        Observations to cluster. Only ``data.T[0]`` and ``data.T[1]`` are
        drawn, so any further columns affect the clustering but not the plot.
    algorithm : class
        Estimator class; it is instantiated as ``algorithm(*args, **kwds)``
        and the instance must provide ``fit_predict``.
    args : tuple
        Positional arguments for the estimator constructor.
    kwds : dict
        Keyword arguments for the estimator constructor.

    Returns
    -------
    None
        The function only draws on the current matplotlib axes.
    """
    # Time construction plus fit_predict together.
    start_time = time.time()
    labels = algorithm(*args, **kwds).fit_predict(data)
    end_time = time.time()

    # One palette colour per non-negative label; negative labels (e.g. the -1
    # that DBSCAN uses for noise) are drawn in black.
    palette = sns.color_palette('deep', np.unique(labels).max() + 1)
    colors = [palette[x] if x >= 0 else (0.0, 0.0, 0.0) for x in labels]

    plt.scatter(data.T[0], data.T[1], c=colors, **plot_kwds)
    frame = plt.gca()
    frame.get_xaxis().set_visible(False)
    frame.get_yaxis().set_visible(False)
    plt.title('Clusters found by {}'.format(str(algorithm.__name__)), fontsize=24)
    # Anchor the timing note in axes-fraction coordinates so it stays inside
    # the plot whatever the data range is; fixed data coordinates put it far
    # outside the axes for data that does not happen to span that position.
    frame.text(
        0.02, 0.98,
        'Clustering took {:.2f} s'.format(end_time - start_time),
        fontsize=14,
        transform=frame.transAxes,
        ha='left',
        va='top',
    )
In [45]:
# Build the numeric feature matrix for the clustering cells: every column
# except the 'species' label, as a NumPy array.
# `axis` is passed by keyword and `.values` replaces `as_matrix()`, because
# recent pandas releases removed both the positional form and `as_matrix`.
features_df = df.drop('species', axis=1)
cdf = features_df.values
In [60]:
# Introduction to clustering: k-means (centroid-based clustering)
#   step 1: choose the number of clusters k
#   step 2: pick k initial centroids
#   step 3: assign every point to its nearest centroid under the distance metric
#   step 4: recompute each centroid as the mean of the points assigned to it
#   repeat steps 3-4 until the assignments stop changing; each pass lowers the
#   within-cluster sum of squares (WCSS)
# What are the advantages and disadvantages?
# random_state is fixed so that a fresh run reproduces the same clusters.
plot_clusters(cdf, cluster.KMeans, (), {'n_clusters': 3, 'random_state': 0})
In [58]:
# DBSCAN
# eps: the largest distance at which two points still count as neighbours.
dbscan_params = dict(eps=0.5)
plot_clusters(cdf, cluster.DBSCAN, (), dbscan_params)
In [70]:
# Affinity propagation (graph-based clustering)
affinity_params = dict(preference=-9.0, damping=0.80)
plot_clusters(cdf, cluster.AffinityPropagation, (), affinity_params)