In [1]:
import pandas as pd
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = (20.0, 6.0)
In [2]:
bikes = pd.read_csv('../data/2016-Q1-Trips-History-Data.csv')
bikes['start'] = pd.to_datetime(bikes['Start date'], infer_datetime_format=True)
bikes['end'] = pd.to_datetime(bikes['End date'], infer_datetime_format=True)
bikes['datetime_hour'] = bikes.start.dt.floor(freq='h')
bikes['hour'] = bikes.start.dt.hour
bikes['day_of_week'] = bikes.start.dt.dayofweek
bikes['member_cat'] = bikes['Member Type'].map({'Registered':0, 'Casual':1})
bikes_simp = bikes.drop(['Start date', 'End date', 'Start station',
'End station', 'Bike number', 'start', 'end', 'datetime_hour', 'Member Type'], axis=1)
bikes_simp.dtypes
Out[2]:
In [21]:
roads = pd.read_csv('../data/3D_spatial_network.txt', header=None, names=['osm', 'lat','lon','alt'])
roads
X = roads.drop(['osm'], axis=1).sample(100000)
X.head()
Out[21]:
What happened to y?
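There is no y: clustering is unsupervised, so the estimator is fit on the features alone and produces the labels itself. A minimal sketch (km_demo is just an illustrative name; it assumes the roads DataFrame loaded above):
In [ ]:
# unsupervised learning: fit() takes only X -- no target vector y is passed
from sklearn.cluster import KMeans
km_demo = KMeans(n_clusters=3, random_state=1)
km_demo.fit(roads[['lat', 'lon', 'alt']].sample(10000, random_state=1))
km_demo.labels_[:10]  # the labels come from the algorithm, not from us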
In [5]:
X = bikes_simp.sample(10000)
In [41]:
# K-means with 7 clusters
from sklearn.cluster import KMeans
km = KMeans(n_clusters=7, random_state=1)
km.fit(X[['lat_', 'lon_', 'alt_']])
Out[41]:
In [18]:
# review the cluster labels
set(km.labels_)
Out[18]:
In [42]:
X['cluster'] = km.predict(X[['lat_', 'lon_', 'alt_']])
In [11]:
X.cluster.value_counts()
Out[11]:
In [43]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow', 'pink', 'purple', 'orange'])
# scatter plot of latitude versus longitude, colored by cluster
plt.scatter(X.lat_, X.lon_, c=colors[X.cluster], s=50)
# cluster centers, marked by "+"
# plt.scatter(centers.lat_, centers.lon_, linewidths=3, marker='+', s=300, c='black')
# add labels
plt.xlabel('lat')
plt.ylabel('lon')
Out[43]:
In [44]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow', 'pink', 'purple', 'orange'])
# scatter plot of latitude versus altitude, colored by cluster
plt.scatter(X.lat_, X.alt_, c=colors[X.cluster], s=50)
plt.xlabel('lat')
plt.ylabel('alt')
Out[44]:
In [30]:
# manually center and scale the road features (z-scores)
X['alt_'] = (X.alt - X.alt.mean())/X.alt.std()
X['lat_'] = (X.lat - X.lat.mean())/X.lat.std()
X['lon_'] = (X.lon - X.lon.mean())/X.lon.std()
In [27]:
X.std()
Out[27]:
In [14]:
X[X.cluster==0].mean()
Out[14]:
In [15]:
X[X.cluster==1].mean()
Out[15]:
In [16]:
roads.sample(10000).plot(kind='scatter', x='lat', y='lon')
Out[16]:
In [25]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow', 'pink', 'purple', 'orange'])
# scatter plot of latitude versus longitude (unscaled), colored by cluster
plt.scatter(X.lat, X.lon, c=colors[X.cluster], s=50)
# cluster centers, marked by "+"
# plt.scatter(centers.lat, centers.lon, linewidths=3, marker='+', s=300, c='black')
# add labels
plt.xlabel('lat')
plt.ylabel('lon')
Out[25]:
In [12]:
# save the cluster labels and review the per-cluster feature means
X['cluster'] = km.labels_
X.groupby('cluster').mean()
What do the clusters seem to be based on? Why?
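One quick way to investigate is to compare the per-cluster means of the raw coordinates; a sketch, assuming X still carries the unscaled lat/lon/alt columns alongside the cluster labels:
In [ ]:
# per-cluster means of the raw road coordinates -- a column whose means differ
# strongly across clusters is likely what the clustering is picking up on
X.groupby('cluster')[['lat', 'lon', 'alt']].mean()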
In [6]:
# review the cluster centers
km.cluster_centers_
Out[6]:
In [7]:
# calculate the mean of each feature for each cluster
X.groupby('cluster').mean()
Out[7]:
In [8]:
# save the DataFrame of cluster centers
centers = X.groupby('cluster').mean()
In [10]:
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue', 'yellow'])
In [13]:
# center and scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
In [14]:
# K-means with 3 clusters on scaled data
km = KMeans(n_clusters=3, random_state=1)
km.fit(X_scaled)
Out[14]:
What are the "characteristics" of each cluster?
Do you notice any cluster assignments that seem a bit odd? How might we explain those?
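One way to hunt for odd assignments is to look at each point's distance to every centroid; points that are nearly equidistant from two centroids are the borderline cases. A sketch, assuming the 3-cluster model fit on X_scaled above:
In [ ]:
# distances from every observation to every cluster center
import numpy as np
dists = np.sort(km.transform(X_scaled), axis=1)  # column 0 = distance to the assigned (nearest) center
margin = dists[:, 1] - dists[:, 0]               # small margin = borderline assignment
np.argsort(margin)[:10]                          # indices of the 10 most ambiguous points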
The Silhouette Coefficient is a common metric for evaluating clustering "performance" in situations where the "true" cluster assignments are not known.
A Silhouette Coefficient is calculated for each observation, where $a$ is the mean distance to the other points in its own cluster and $b$ is the mean distance to the points in the nearest cluster it does not belong to:
$$SC = \frac{b - a}{\max(a, b)}$$
It ranges from -1 (worst) to 1 (best). A global score is calculated by taking the mean score for all observations.
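To make a and b concrete, here is a sketch that computes the coefficient by hand for a single observation (it reuses X_scaled and the fitted km from above; pairwise_distances is scikit-learn's distance helper):
In [ ]:
# manual silhouette coefficient for observation i (illustration only)
import numpy as np
from sklearn.metrics import pairwise_distances
i = 0
d = pairwise_distances(X_scaled[i:i+1], X_scaled).ravel()  # distances from point i to every point
own = km.labels_ == km.labels_[i]
a = d[own & (np.arange(len(d)) != i)].mean()               # mean distance within its own cluster
b = min(d[km.labels_ == c].mean()                          # mean distance to the nearest other cluster
        for c in set(km.labels_) if c != km.labels_[i])
(b - a) / max(a, b)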
In [ ]:
# calculate SC for K=3
from sklearn import metrics
metrics.silhouette_score(X_scaled, km.labels_)
In [19]:
# calculate SC for K=2 through K=19
k_range = range(2, 20)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_scaled)
    scores.append(metrics.silhouette_score(X_scaled, km.labels_))
In [20]:
# plot the results
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Coefficient')
plt.grid(True)
In [21]:
# K-means with 4 clusters on scaled data
km = KMeans(n_clusters=4, random_state=1)
km.fit(X_scaled)
X['cluster'] = km.labels_
X.sort_values('cluster')
Out[21]:
In [22]:
# DBSCAN with eps=1 and min_samples=3
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=1, min_samples=3)
db.fit(X_scaled)
Out[22]:
In [23]:
# review the cluster labels
db.labels_
Out[23]:
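DBSCAN marks points it considers noise with the label -1, so it is worth checking how many points ended up in each cluster versus noise; a quick sketch:
In [ ]:
# cluster sizes, including the noise "cluster" labeled -1
pd.Series(db.labels_).value_counts()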
In [26]:
# scatter plot matrix of DBSCAN cluster assignments (0=red, 1=green, 2=blue, -1=yellow)
pd.plotting.scatter_matrix(X, c=colors[db.labels_], figsize=(10,10), s=100)
Out[26]: