In [1]:
# Custom libraries
from datascienceutils import analyze
from datascienceutils import clusteringModels as cm

# Standard libraries
import datetime
import json
import random

import numpy as np
import pandas as pd

from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh

%matplotlib inline
output_notebook(bokeh.resources.INLINE)


Loading BokehJS ...

In [2]:
irisDf = pd.read_csv('./data/Iris.csv')

In [3]:
target = irisDf.Species
irisDf.drop('Species', 1, inplace=True)
cm.cluster_analyze(irisDf)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-3-7510326dc7f6> in <module>()
      1 target = irisDf.Species
      2 irisDf.drop('Species', 1, inplace=True)
----> 3 cm.cluster_analyze(irisDf)

/home/anand/playspace/data-science-utils/datascienceutils/clusteringModels.py in cluster_analyze(dataframe)
     80         # plot
     81         new_df = pd.DataFrame(X)
---> 82         plots.append(plotter.scatterplot(new_df, 0, 1, title='%s'%name))
     83 
     84         if hasattr(algorithm, 'cluster_centers_'):

/home/anand/playspace/data-science-utils/datascienceutils/plotter.py in scatterplot(scatterDF, xcol, ycol, width, height, xlabel, ylabel, group, plttitle, **kwargs)
    298 
    299     if not group:
--> 300         p.circle(scatterDF[xcol], scatterDF[ycol], size=5, **kwargs)
    301     else:
    302         groups = list(scatterDf[group].unique())

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/plotting/helpers.py in func(self, *args, **kwargs)
    491             hglyph_ca = None
    492 
--> 493         glyph = _make_glyph(glyphclass, kwargs, glyph_ca)
    494         nsglyph = _make_glyph(glyphclass, kwargs, nsglyph_ca)
    495         hglyph = _make_glyph(glyphclass, kwargs, hglyph_ca)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/plotting/helpers.py in _make_glyph(glyphclass, kws, extra)
    168     kws = kws.copy()
    169     kws.update(extra)
--> 170     return glyphclass(**kws)
    171 
    172 

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/model.py in __init__(self, **kwargs)
     84         self._id = kwargs.pop("id", make_id())
     85         self._document = None
---> 86         super(Model, self).__init__(**kwargs)
     87         default_theme.apply_to_model(self)
     88 

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/properties.py in __init__(self, **properties)
    705 
    706         for name, value in properties.items():
--> 707             setattr(self, name, value)
    708 
    709     def equals(self, other):

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/properties.py in __setattr__(self, name, value)
    738 
    739             raise AttributeError("unexpected attribute '%s' to %s, %s attributes are %s" %
--> 740                 (name, self.__class__.__name__, text, nice_join(matches)))
    741 
    742     def set_from_json(self, name, json, models=None):

AttributeError: unexpected attribute 'title' to Circle, possible attributes are angle, angle_units, fill_alpha, fill_color, line_alpha, line_cap, line_color, line_dash, line_dash_offset, line_join, line_width, name, radius, radius_dimension, radius_units, size, tags, visible, x or y
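The traceback shows what went wrong: `plotter.scatterplot` forwards its `title` keyword into `p.circle(...)`, but in Bokeh `title` is a property of the figure, not of the Circle glyph, which only accepts visual attributes such as size and colors. A minimal workaround, sketched below under the assumption that all we want is a per-algorithm scatter of the first two projected columns, is to build the figure directly and hand the title to `figure()`. The helper name `scatter_with_title` is made up for this example.

# Hypothetical workaround: the title belongs on the figure, not on the glyph.
from bokeh.plotting import figure, show

def scatter_with_title(df, xcol, ycol, title):
    p = figure(title=title)
    # The glyph call only gets visual properties (size, fill_color, ...).
    p.circle(df[xcol], df[ycol], size=5)
    return p

# e.g. show(scatter_with_title(new_df, 0, 1, 'KMeans'))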

In [4]:
analyze.silhouette_analyze(irisDf, cluster_type='KMeans')


For clusters = 2 The average silhouette_score is : 0.620465604655
For clusters = 4 The average silhouette_score is : 0.556569210379
For clusters = 6 The average silhouette_score is : 0.532536501633
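The helper sweeps a few candidate cluster counts (2, 4 and 6 here) and prints the mean silhouette coefficient for each; the best score at 2 clusters is what you would expect for Iris, where versicolor and virginica overlap heavily. The traceback in the next cell confirms that the scoring goes through scikit-learn's `silhouette_score`; assuming the KMeans side also wraps scikit-learn (not shown here), the loop can be reproduced roughly like this, which should give scores in the same ballpark:

# Rough reproduction of the helper's loop, assuming it wraps sklearn's KMeans.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for k in (2, 4, 6):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(irisDf)
    print(k, silhouette_score(irisDf, labels))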

In [5]:
analyze.silhouette_analyze(irisDf, cluster_type='dbscan')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-5-0aa020a6e5ac> in <module>()
----> 1 analyze.silhouette_analyze(irisDf, cluster_type='dbscan')

/home/anand/playspace/data-science-utils/datascienceutils/analyze.py in silhouette_analyze(dataframe, cluster_type, n_clusters)
    226         # clusters
    227         if len(cluster_labels) > 1:
--> 228             silhouette_avg = silhouette_score(dataframe, cluster_labels)
    229             cluster_scores_df.loc[j] = [cluster, silhouette_avg]
    230             print("For clusters =", cluster,

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/metrics/cluster/unsupervised.py in silhouette_score(X, labels, metric, sample_size, random_state, **kwds)
     98         else:
     99             X, labels = X[indices], labels[indices]
--> 100     return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
    101 
    102 

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/metrics/cluster/unsupervised.py in silhouette_samples(X, labels, metric, **kwds)
    164     le = LabelEncoder()
    165     labels = le.fit_transform(labels)
--> 166     check_number_of_labels(len(le.classes_), X.shape[0])
    167 
    168     distances = pairwise_distances(X, metric=metric, **kwds)

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/metrics/cluster/unsupervised.py in check_number_of_labels(n_labels, n_samples)
     18     if not 1 < n_labels < n_samples:
     19         raise ValueError("Number of labels is %d. Valid values are 2 "
---> 20                          "to n_samples - 1 (inclusive)" % n_labels)
     21 
     22 

ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
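The failure is not in `silhouette_score` itself: with whatever default parameters the helper uses, DBSCAN evidently produced a single label (either one big cluster or everything marked as noise), and the silhouette is only defined for 2 to n_samples - 1 labels. A hedged sketch of how this is usually handled when calling scikit-learn's DBSCAN directly, with hand-picked `eps`/`min_samples` and noise points dropped before scoring:

# Hypothetical direct use of DBSCAN: tune eps/min_samples, drop noise (-1).
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

labels = DBSCAN(eps=0.8, min_samples=5).fit_predict(irisDf)
mask = labels != -1                      # ignore noise points
if len(np.unique(labels[mask])) > 1:     # silhouette needs >= 2 clusters
    print(silhouette_score(irisDf[mask], labels[mask]))
else:
    print("DBSCAN found fewer than 2 clusters; adjust eps/min_samples")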

In [6]:
analyze.silhouette_analyze(irisDf, cluster_type='spectral')


For clusters = 2 The average silhouette_score is : 0.620329425953
For clusters = 4 The average silhouette_score is : 0.535793073503
For clusters = 6 The average silhouette_score is : 0.515900030041

In [7]:
analyze.silhouette_analyze(irisDf, cluster_type='birch')


For clusters = 2 The average silhouette_score is : 0.600210942442
For clusters = 4 The average silhouette_score is : 0.600210942442
For clusters = 6 The average silhouette_score is : 0.600210942442
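The Birch run reports exactly the same score for 2, 4 and 6 clusters, which suggests the requested cluster count may not be reaching the estimator (for KMeans and spectral the score did move with k). A quick sanity check, under the assumption that the helper is meant to wrap `sklearn.cluster.Birch`:

# Sanity check: passing n_clusters explicitly should change the score with k.
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score

for k in (2, 4, 6):
    labels = Birch(n_clusters=k).fit_predict(irisDf)
    print(k, silhouette_score(irisDf, labels))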

In [8]:
#analyze.som_analyze(irisDf, (10,10), algo_type='som')
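The last cell is left commented out; `som_analyze` presumably fits a self-organising map on a 10x10 grid. As a rough stand-in (an assumption, since the helper's backend is not shown), the same idea can be sketched with the minisom package:

# Sketch only: minisom is an assumption, not necessarily what som_analyze uses.
from minisom import MiniSom

data = irisDf.values.astype(float)
som = MiniSom(10, 10, data.shape[1], sigma=1.0, learning_rate=0.5)
som.train_random(data, 1000)          # 1000 training iterations
print(som.winner(data[0]))            # grid coordinates of the first sample's BMU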