Clustering

Agenda:

  1. K-means clustering
  2. Clustering evaluation
  3. DBSCAN clustering

In [1]:
import pandas as pd
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = (20.0, 6.0)

In [2]:
bikes = pd.read_csv('../data/2016-Q1-Trips-History-Data.csv')
bikes['start'] = pd.to_datetime(bikes['Start date'], infer_datetime_format=True)
bikes['end'] = pd.to_datetime(bikes['End date'], infer_datetime_format=True)
bikes['datetime_hour'] = bikes.start.dt.floor(freq='h')
bikes['hour'] = bikes.start.dt.hour
bikes['day_of_week'] = bikes.start.dt.dayofweek
bikes['member_cat'] = bikes['Member Type'].map({'Registered':0, 'Casual':1})
bikes_simp = bikes.drop(['Start date', 'End date', 'Start station', 
                         'End station', 'Bike number', 'start', 'end', 'datetime_hour', 'Member Type'], axis=1)
bikes_simp.dtypes


Out[2]:
Duration (ms)           int64
Start station number    int64
End station number      int64
hour                    int64
day_of_week             int64
member_cat              int64
dtype: object

Spatial Clustering


In [21]:
roads = pd.read_csv('../data/3D_spatial_network.txt', header=None, names=['osm', 'lat','lon','alt'])
roads
X = roads.drop(['osm'], axis=1).sample(100000)
X.head()


ERROR! Session/line number was not unique in database. History logging moved to new session 134
Out[21]:
lat lon alt
245744 9.811284 56.630792 46.735656
267020 10.459862 57.518930 1.711458
375381 10.101505 57.369215 50.121351
232870 10.155259 57.056226 9.039043
240241 10.458872 57.489159 7.481675

What happened to y?

Part 1: K-means clustering


In [5]:
X = bikes_simp.sample(10000)

In [41]:
# K-means with 3 clusters
from sklearn.cluster import KMeans
km = KMeans(n_clusters=7, random_state=1)
km.fit(X[['lat_', 'lon_', 'alt_']])


Out[41]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=7, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001,
    verbose=0)

In [18]:
# review the cluster labels
set(km.labels_)


Out[18]:
{0, 1, 2}

In [42]:
X['cluster'] = km.predict(X[['lat_', 'lon_', 'alt_']])

In [11]:
X.cluster.value_counts()


Out[11]:
0    9614
2     371
1      15
Name: cluster, dtype: int64

In [43]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow', 'pink', 'purple', 'orange'])

# scatter plot of calories versus alcohol, colored by cluster (0=red, 1=green, 2=blue)
plt.scatter(X.lat_, X.lon_, c=colors[X.cluster], s=50)

# cluster centers, marked by "+"
# plt.scatter(centers.calories, centers.alcohol, linewidths=3, marker='+', s=300, c='black')

# add labels
plt.xlabel('lat')
plt.ylabel('alt')


Out[43]:
<matplotlib.text.Text at 0x10f0c2518>

In [44]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow', 'pink', 'purple', 'orange'])

# scatter plot of calories versus alcohol, colored by cluster (0=red, 1=green, 2=blue)
plt.scatter(X.lat_, X.alt_, c=colors[X.cluster], s=50)

plt.xlabel('lat')
plt.ylabel('alt')


Out[44]:
<matplotlib.text.Text at 0x10f7e57b8>

In [30]:
X['alt_'] = (X.alt - X.alt.mean())/X.alt.std()
X['lat_'] = (X.lat - X.lat.mean())/X.lat.std()
X['lon_'] = (X.lon - X.lon.mean())/X.lon.std()

In [27]:
X.std()


Out[27]:
lat         0.626778
lon         0.289692
alt        18.672189
cluster     0.648455
dtype: float64

In [14]:
X[X.cluster==0].mean()


Out[14]:
Duration (ms)           744748.748700
Start station number     31306.080924
End station number       31311.402850
hour                        13.824527
day_of_week                  2.861140
member_cat                   0.125858
cluster                      0.000000
dtype: float64

In [15]:
X[X.cluster==1].mean()


Out[15]:
Duration (ms)           5.169406e+07
Start station number    3.128300e+04
End station number      3.128553e+04
hour                    1.513333e+01
day_of_week             2.600000e+00
member_cat              2.666667e-01
cluster                 1.000000e+00
dtype: float64

In [16]:
roads.sample(10000).plot(kind='scatter', x='lat', y='lon')


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x115c37668>

In [25]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow'])

# scatter plot of calories versus alcohol, colored by cluster (0=red, 1=green, 2=blue)
plt.scatter(X.lat, X.lon, c=colors[X.cluster], s=50)

# cluster centers, marked by "+"
# plt.scatter(centers.calories, centers.alcohol, linewidths=3, marker='+', s=300, c='black')

# add labels
plt.xlabel('lat')
plt.ylabel('lon')


Out[25]:
<matplotlib.text.Text at 0x10922a9b0>

In [12]:
# save the cluster labels and sort by cluster
X['cluster'] = km.labels_
X.groupby()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-251499e1ca47> in <module>()
      1 # save the cluster labels and sort by cluster
----> 2 X['cluster'] = km.labels_
      3 X.groupby()

NameError: name 'km' is not defined

What do the clusters seem to be based on? Why?


In [6]:
# review the cluster centers
km.cluster_centers_


Out[6]:
array([[ 150.        ,   17.        ,    4.52142857,    0.52071429],
       [ 102.75      ,   10.        ,    4.075     ,    0.44      ],
       [  70.        ,   10.5       ,    2.6       ,    0.42      ]])

In [7]:
# calculate the mean of each feature for each cluster
beer.groupby('cluster').mean()


Out[7]:
calories sodium alcohol cost
cluster
0 150.00 17.0 4.521429 0.520714
1 102.75 10.0 4.075000 0.440000
2 70.00 10.5 2.600000 0.420000

In [8]:
# save the DataFrame of cluster centers
centers = beer.groupby('cluster').mean()

In [10]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow'])

Repeat with scaled data


In [13]:
# center and scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
# K-means with 3 clusters on scaled data
km = KMeans(n_clusters=3, random_state=1)
km.fit(X_scaled)


Out[14]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001,
    verbose=0)

What are the "characteristics" of each cluster?

Do you notice any cluster assignments that seem a bit odd? How might we explain those?

Part 2: Clustering evaluation

The Silhouette Coefficient is a common metric for evaluating clustering "performance" in situations when the "true" cluster assignments are not known.

A Silhouette Coefficient is calculated for each observation:

$$SC = \frac{b-a} {max(a, b)}$$
  • a = mean distance to all other points in its cluster
  • b = mean distance to all other points in the next nearest cluster

It ranges from -1 (worst) to 1 (best). A global score is calculated by taking the mean score for all observations.


In [ ]:
# calculate SC for K=3
from sklearn import metrics
metrics.silhouette_score(X, km.labels_)

In [19]:
# calculate SC for K=2 through K=19
k_range = range(2, 20)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_scaled)
    scores.append(metrics.silhouette_score(X_scaled, km.labels_))


c:\Users\alsherman\AppData\Local\Continuum\Anaconda\lib\site-packages\numpy\core\_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)

In [20]:
# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)



In [21]:
# K-means with 4 clusters on scaled data
km = KMeans(n_clusters=4, random_state=1)
km.fit(X_scaled)
beer['cluster'] = km.labels_
beer.sort('cluster')


Out[21]:
name calories sodium alcohol cost cluster
0 Budweiser 144 15 4.7 0.43 0
1 Schlitz 151 19 4.9 0.43 0
17 Heilemans_Old_Style 144 24 4.9 0.43 0
16 Hamms 139 19 4.4 0.43 0
5 Old_Milwaukee 145 23 4.6 0.28 0
6 Augsberger 175 24 5.5 0.40 0
7 Srohs_Bohemian_Style 149 27 4.7 0.42 0
10 Coors 140 18 4.6 0.44 0
15 Pabst_Extra_Light 68 15 2.3 0.38 1
12 Michelob_Light 135 11 4.2 0.50 1
11 Coors_Light 102 15 4.1 0.46 1
9 Budweiser_Light 113 8 3.7 0.40 1
8 Miller_Lite 99 10 4.3 0.43 1
18 Olympia_Goled_Light 72 6 2.9 0.46 1
19 Schlitz_Light 97 7 4.2 0.47 1
13 Becks 150 19 4.7 0.76 2
14 Kirin 149 6 5.0 0.79 2
4 Heineken 152 11 5.0 0.77 2
3 Kronenbourg 170 7 5.2 0.73 2
2 Lowenbrau 157 15 0.9 0.48 3

Part 3: DBSCAN clustering


In [22]:
# DBSCAN with eps=1 and min_samples=3
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=1, min_samples=3)
db.fit(X_scaled)


Out[22]:
DBSCAN(algorithm='auto', eps=1, leaf_size=30, metric='euclidean',
    min_samples=3, p=None, random_state=None)

In [23]:
# review the cluster labels
db.labels_


Out[23]:
array([ 0,  0, -1,  1,  1, -1, -1,  0,  2,  2,  0,  2,  0, -1,  1, -1,  0,
        0, -1,  2], dtype=int64)

In [26]:
# scatter plot matrix of DBSCAN cluster assignments (0=red, 1=green, 2=blue, -1=yellow)
pd.scatter_matrix(X, c=colors[beer.cluster], figsize=(10,10), s=100)


Out[26]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000000001BBD6048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001B6EE4A8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001C030FD0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001ED01F98>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001EE84A20>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001EF2B400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F028EF0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001EF947B8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001F21AF98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F2E4940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F3CCDA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F4D8550>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000000001F53E080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F64B940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F7590F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000001F83EC50>]], dtype=object)

In [ ]: