In [1]:
import pandas as pd
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = (20.0, 6.0)
In [2]:
bikes = pd.read_csv('../data/2016-Q1-Trips-History-Data.csv')
bikes['start'] = pd.to_datetime(bikes['Start date'], infer_datetime_format=True)
bikes['end'] = pd.to_datetime(bikes['End date'], infer_datetime_format=True)
bikes['datetime_hour'] = bikes.start.dt.floor(freq='h')
bikes['hour'] = bikes.start.dt.hour
bikes['day_of_week'] = bikes.start.dt.dayofweek
bikes['member_cat'] = bikes['Member Type'].map({'Registered':0, 'Casual':1})
bikes_simp = bikes.drop(['Start date', 'End date', 'Start station',
'End station', 'Bike number', 'start', 'end', 'datetime_hour', 'Member Type'], axis=1)
bikes_simp.dtypes
Out[2]:
In [21]:
roads = pd.read_csv('../data/3D_spatial_network.txt', header=None, names=['osm', 'lat','lon','alt'])
roads
X = roads.drop(['osm'], axis=1).sample(100000)
X.head()
Out[21]:
What happened to y?
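There is no y: clustering is unsupervised, so the estimator is fit on the features alone and produces the labels itself. A minimal sketch (km_demo is just an illustrative name; it assumes the roads DataFrame loaded above):
In [ ]:
# unsupervised learning: fit() takes only X -- no target vector y is passed
from sklearn.cluster import KMeans
km_demo = KMeans(n_clusters=3, random_state=1)
km_demo.fit(roads[['lat', 'lon', 'alt']].sample(10000, random_state=1))
km_demo.labels_[:10]  # the labels come from the algorithm, not from us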
In [5]:
X = bikes_simp.sample(10000)
In [41]:
# K-means with 7 clusters
from sklearn.cluster import KMeans
km = KMeans(n_clusters=7, random_state=1)
km.fit(X[['lat_', 'lon_', 'alt_']])
Out[41]:
In [18]:
# review the cluster labels
set(km.labels_)
Out[18]:
In [42]:
X['cluster'] = km.predict(X[['lat_', 'lon_', 'alt_']])
In [11]:
X.cluster.value_counts()
Out[11]:
In [43]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow', 'pink', 'purple', 'orange'])
# scatter plot of latitude versus longitude, colored by cluster
plt.scatter(X.lat_, X.lon_, c=colors[X.cluster], s=50)
# cluster centers, marked by "+"
# plt.scatter(centers.lat_, centers.lon_, linewidths=3, marker='+', s=300, c='black')
# add labels
plt.xlabel('lat')
plt.ylabel('lon')
Out[43]:
In [44]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow', 'pink', 'purple', 'orange'])
# scatter plot of latitude versus altitude, colored by cluster
plt.scatter(X.lat_, X.alt_, c=colors[X.cluster], s=50)
plt.xlabel('lat')
plt.ylabel('alt')
Out[44]:
In [30]:
# manually center and scale the road features (z-scores)
X['alt_'] = (X.alt - X.alt.mean())/X.alt.std()
X['lat_'] = (X.lat - X.lat.mean())/X.lat.std()
X['lon_'] = (X.lon - X.lon.mean())/X.lon.std()
In [27]:
X.std()
Out[27]:
In [14]:
X[X.cluster==0].mean()
Out[14]:
In [15]:
X[X.cluster==1].mean()
Out[15]:
In [16]:
roads.sample(10000).plot(kind='scatter', x='lat', y='lon')
Out[16]:
In [25]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow', 'pink', 'purple', 'orange'])
# scatter plot of latitude versus longitude (unscaled), colored by cluster
plt.scatter(X.lat, X.lon, c=colors[X.cluster], s=50)
# cluster centers, marked by "+"
# plt.scatter(centers.lat, centers.lon, linewidths=3, marker='+', s=300, c='black')
# add labels
plt.xlabel('lat')
plt.ylabel('lon')
Out[25]:
In [12]:
# save the cluster labels and review the per-cluster feature means
X['cluster'] = km.labels_
X.groupby('cluster').mean()
What do the clusters seem to be based on? Why?
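One quick way to investigate is to compare the per-cluster means of the raw coordinates; a sketch, assuming X still carries the unscaled lat/lon/alt columns alongside the cluster labels:
In [ ]:
# per-cluster means of the raw road coordinates -- a column whose means differ
# strongly across clusters is likely what the clustering is picking up on
X.groupby('cluster')[['lat', 'lon', 'alt']].mean()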
In [6]:
# review the cluster centers
km.cluster_centers_
Out[6]:
In [7]:
# calculate the mean of each feature for each cluster
X.groupby('cluster').mean()
Out[7]:
In [8]:
# save the DataFrame of cluster centers
centers = X.groupby('cluster').mean()
In [10]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow'])
In [13]:
# center and scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [14]:
# K-means with 3 clusters on scaled data
km = KMeans(n_clusters=3, random_state=1)
km.fit(X_scaled)
Out[14]:
What are the "characteristics" of each cluster?
Do you notice any cluster assignments that seem a bit odd? How might we explain those?
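One way to hunt for odd assignments is to look at each point's distance to every centroid; points that are nearly equidistant from two centroids are the borderline cases. A sketch, assuming the 3-cluster model fit on X_scaled above:
In [ ]:
# distances from every observation to every cluster center
import numpy as np
dists = np.sort(km.transform(X_scaled), axis=1)  # column 0 = distance to the assigned (nearest) center
margin = dists[:, 1] - dists[:, 0]               # small margin = borderline assignment
np.argsort(margin)[:10]                          # indices of the 10 most ambiguous points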
The Silhouette Coefficient is a common metric for evaluating clustering "performance" in situations where the "true" cluster assignments are not known.
A Silhouette Coefficient is calculated for each observation, where $a$ is the mean distance to the other points in its own cluster and $b$ is the mean distance to the points in the nearest cluster it does not belong to:
$$SC = \frac{b - a}{\max(a, b)}$$
It ranges from -1 (worst) to 1 (best). A global score is calculated by taking the mean score for all observations.
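To make a and b concrete, here is a sketch that computes the coefficient by hand for a single observation (it reuses X_scaled and the fitted km from above; pairwise_distances is scikit-learn's distance helper):
In [ ]:
# manual silhouette coefficient for observation i (illustration only)
import numpy as np
from sklearn.metrics import pairwise_distances
i = 0
d = pairwise_distances(X_scaled[i:i+1], X_scaled).ravel()  # distances from point i to every point
own = km.labels_ == km.labels_[i]
a = d[own & (np.arange(len(d)) != i)].mean()               # mean distance within its own cluster
b = min(d[km.labels_ == c].mean()                          # mean distance to the nearest other cluster
        for c in set(km.labels_) if c != km.labels_[i])
(b - a) / max(a, b)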
In [ ]:
# calculate SC for K=3
from sklearn import metrics
metrics.silhouette_score(X_scaled, km.labels_)
In [19]:
# calculate SC for K=2 through K=19
k_range = range(2, 20)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_scaled)
    scores.append(metrics.silhouette_score(X_scaled, km.labels_))
In [20]:
# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)
In [21]:
# K-means with 4 clusters on scaled data
km = KMeans(n_clusters=4, random_state=1)
km.fit(X_scaled)
X['cluster'] = km.labels_
X.sort_values('cluster')
Out[21]:
In [22]:
# DBSCAN with eps=1 and min_samples=3
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=1, min_samples=3)
db.fit(X_scaled)
Out[22]:
In [23]:
# review the cluster labels
db.labels_
Out[23]:
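DBSCAN marks points it considers noise with the label -1, so it is worth checking how many points ended up in each cluster versus noise; a quick sketch:
In [ ]:
# cluster sizes, including the noise "cluster" labeled -1
pd.Series(db.labels_).value_counts()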
In [26]:
# scatter plot matrix of DBSCAN cluster assignments (0=red, 1=green, 2=blue, -1=yellow)
pd.plotting.scatter_matrix(X, c=colors[db.labels_], figsize=(10,10), s=100)
Out[26]: