In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing
from scipy.spatial.distance import cdist
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score
In [3]:
# Load the 2013 results file into a DataFrame and preview the first rows.
boston_marathon_results = pd.read_csv(filepath_or_buffer='results 2013.csv')
boston_marathon_results.head(n=5)
Out[3]:
25k
age
name
division
10k
gender
half
official
bib
ctz
...
overall
pace
state
30k
5k
genderdiv
20k
35k
city
40k
0
49.87
28
Cassidy, Josh R.
9
18.18
M
40.93
90.90
W1
NaN
...
9
3.47
ON
62.07
8.90
9
38.80
74.73
Toronto
85.55
1
77.27
30
Korir, Wesley
5
30.90
M
64.90
132.50
1
NaN
...
5
5.07
NaN
92.97
15.90
5
61.52
108.78
Kenya
124.77
2
77.23
23
Desisa, Lelisa
1
30.90
M
64.92
130.37
2
NaN
...
1
4.98
NaN
92.72
15.93
1
61.53
108.68
Ambo
123.78
3
50.50
32
Fearnley, Kurt H.
5
18.73
M
42.00
88.43
W2
NaN
...
5
3.38
NaN
61.35
8.98
5
39.88
73.00
Hamilton
83.43
4
48.75
39
Hokinoue, Kota
3
18.18
M
40.57
87.22
W3
NaN
...
3
3.33
NaN
59.92
8.92
3
38.55
71.68
Iizuka
81.88
5 rows × 21 columns
In [4]:
# List every column of the raw results table before choosing which ones to drop.
boston_marathon_results.columns
Out[4]:
Index(['25k', 'age', 'name', 'division', '10k', 'gender', 'half', 'official',
'bib', 'ctz', 'country', 'overall', 'pace', 'state', '30k', '5k',
'genderdiv', '20k', '35k', 'city', '40k'],
dtype='object')
In [5]:
# Drop the textual / identifier columns; the remaining columns are cast to float further down.
non_numeric_columns = ['city', 'country', 'genderdiv', 'bib', 'ctz', 'state', 'name', 'division']
boston_marathon_scores = boston_marathon_results.drop(columns=non_numeric_columns)
In [6]:
# Replace the '-' placeholder entries with 0 so every column can be cast to float later.
# A new frame is assigned instead of mutating in place, keeping the cell re-runnable.
boston_marathon_scores = boston_marathon_scores.replace('-', 0)
# Encode gender numerically (F -> 0, M -> 1). The mapping reads from the raw results frame
# so that re-running this cell does not map the already-encoded 0/1 values to NaN.
boston_marathon_scores['gender'] = boston_marathon_results['gender'].map({'F': 0, 'M': 1})
print(boston_marathon_scores.columns.unique())
boston_marathon_scores.head()
Index(['25k', 'age', '10k', 'gender', 'half', 'official', 'overall', 'pace',
'30k', '5k', '20k', '35k', '40k'],
dtype='object')
Out[6]:
25k
age
10k
gender
half
official
overall
pace
30k
5k
20k
35k
40k
0
49.87
28
18.18
1
40.93
90.90
9
3.47
62.07
8.90
38.80
74.73
85.55
1
77.27
30
30.90
1
64.90
132.50
5
5.07
92.97
15.90
61.52
108.78
124.77
2
77.23
23
30.90
1
64.92
130.37
1
4.98
92.72
15.93
61.53
108.68
123.78
3
50.50
32
18.73
1
42.00
88.43
5
3.38
61.35
8.98
39.88
73.00
83.43
4
48.75
39
18.18
1
40.57
87.22
3
3.33
59.92
8.92
38.55
71.68
81.88
In [7]:
# Cast every remaining column to 64-bit floats, then print the frame summary
# (dtypes, non-null counts, memory usage).
boston_marathon_scores = boston_marathon_scores.astype(np.float64)
boston_marathon_scores.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16164 entries, 0 to 16163
Data columns (total 13 columns):
25k 16164 non-null float64
age 16164 non-null float64
10k 16164 non-null float64
gender 16164 non-null float64
half 16164 non-null float64
official 16164 non-null float64
overall 16164 non-null float64
pace 16164 non-null float64
30k 16164 non-null float64
5k 16164 non-null float64
20k 16164 non-null float64
35k 16164 non-null float64
40k 16164 non-null float64
dtypes: float64(13)
memory usage: 1.6 MB
In [8]:
# Work on an independent copy of the scores: a plain assignment would only alias the
# frame, so the cluster columns added to X_tr later would also appear in boston_marathon_scores.
X_tr = boston_marathon_scores.copy()
# Feature columns used for clustering.
clmns = ['age', 'official','40k', '35k', '30k', '25k', 'half', '20k', '10k', '5k', 'pace']
# Scale the selected features with sklearn's `normalize`.
# NOTE: this rescales each row (runner) to unit norm; it is not column-wise standardization.
X_tr_std = normalize(X_tr[clmns])
Compare Spectral Clustering against kMeans using Similarity
As there is no ground truth, the criterion used to evaluate the clusters produced by Spectral Clustering and k-means is the silhouette coefficient. From the results obtained, it can be seen that Spectral Clustering requires 6 clusters to reach a silhouette score similar to the one obtained with 3 clusters using k-means.
In [19]:
# Compare k-means against Spectral Clustering from a silhouette_score perspective.
range_n_clusters = np.arange(10) + 2  # candidate cluster counts: 2 .. 11
for n_clusters in range_n_clusters:
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.

    # Initialize each clusterer with the n_clusters value and a random generator
    # seed of 10 for reproducibility.
    spec_clust = SpectralClustering(n_clusters=n_clusters, random_state=10)
    cluster_labels1 = spec_clust.fit_predict(X_tr_std)
    silhouette_avg1 = silhouette_score(X_tr_std, cluster_labels1)

    # fit_predict already fits the model, so no separate .fit() call is made
    # (calling both would run k-means twice on the same data).
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=10)
    cluster_labels2 = kmeans.fit_predict(X_tr_std)
    silhouette_avg2 = silhouette_score(X_tr_std, cluster_labels2)

    print("For n_clusters =", n_clusters,
          "av. sil_score for Spec. clust is :", silhouette_avg1,
          "av. sil_score for kmeans is :",silhouette_avg2 )
For n_clusters = 2 av. sil_score for Spec. clust is : 0.961171331526 av. sil_score for kmeans is : 0.960595115442
For n_clusters = 3 av. sil_score for Spec. clust is : 0.387076810597 av. sil_score for kmeans is : 0.388209250616
For n_clusters = 4 av. sil_score for Spec. clust is : 0.388415111093 av. sil_score for kmeans is : 0.396003681008
For n_clusters = 5 av. sil_score for Spec. clust is : 0.389380079665 av. sil_score for kmeans is : 0.397697024637
For n_clusters = 6 av. sil_score for Spec. clust is : 0.390484776573 av. sil_score for kmeans is : 0.329749119438
For n_clusters = 7 av. sil_score for Spec. clust is : 0.385483467461 av. sil_score for kmeans is : 0.332703384807
For n_clusters = 8 av. sil_score for Spec. clust is : 0.384559772179 av. sil_score for kmeans is : 0.335751771772
For n_clusters = 9 av. sil_score for Spec. clust is : 0.386544891208 av. sil_score for kmeans is : 0.336687926662
For n_clusters = 10 av. sil_score for Spec. clust is : 0.379711180133 av. sil_score for kmeans is : 0.292708178952
For n_clusters = 11 av. sil_score for Spec. clust is : 0.380947772569 av. sil_score for kmeans is : 0.293759032687
The optimal number of clusters for k-means will be determined using the elbow method. Once the number of k-means clusters is set, the number of clusters for spectral clustering will be chosen so that its silhouette score matches the one obtained with k-means.
K-Means
In [9]:
# Use the elbow method to determine the number of clusters (k) for k-means.
# The distortion is evaluated on X_tr_std, the same normalized feature matrix that
# every clustering model in this notebook is fitted on.
distortions = []
K = range(1, 10)
for k in K:
    # Fit once per k (a second .fit() call would only repeat the same work);
    # the seed makes the curve reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(X_tr_std)
    # Distortion = mean Euclidean distance from each sample to its nearest centroid.
    nearest_centroid_dist = np.min(cdist(X_tr_std, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(nearest_centroid_dist.sum() / X_tr_std.shape[0])

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
The elbow method shows that the optimal number of clusters for k-means is 3, based on the mean Euclidean distance from each point to its nearest cluster center. The inertia values printed below tell the same story: 3 clusters is a sensible choice, because the drop in inertia becomes much smaller when moving from 3 to 4 clusters.
In [10]:
# Print the k-means inertia on the normalized features for k = 1 .. 9,
# to judge the best number of clusters.
for i in range(1, 10):
    km = KMeans(n_clusters=i, init='k-means++', n_init=10)
    km.fit(X_tr_std)
    print(i, km.inertia_)
1 30.1263346974
2 19.0457804842
3 13.188426726
4 11.2720106651
5 9.93710134099
6 8.54109431453
7 7.47659624147
8 6.8543306694
9 6.17804693014
In [59]:
# Cluster the normalized features into 3 groups with k-means.
# The fixed seed makes the cluster numbering reproducible between runs.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42).fit(X_tr_std)
labels = kmeans.labels_
# Glue the cluster assignments back to the original data
X_tr['clusters'] = labels
X_tr['Gender'] = boston_marathon_scores.gender
X_tr['Overall'] = boston_marathon_scores.overall
# Add the new columns to the column list, but only once: an unconditional extend
# would append duplicates every time this cell (or a similar one) is run.
clmns.extend([col for col in ['clusters', 'Gender', 'Overall'] if col not in clmns])
# Analyze the clusters: per-cluster mean of every column
# (groupby(...).mean() already returns a DataFrame).
X_tr.groupby(['clusters']).mean()
Out[59]:
25k
age
10k
gender
half
official
overall
pace
30k
5k
20k
35k
40k
Gender
Overall
clusters
0
118.229587
33.293590
46.701476
0.386324
99.217164
208.662765
8523.313127
7.966590
143.787377
23.352465
94.031163
170.262460
197.110309
0.386324
8523.313127
1
0.000000
41.857143
14.568571
0.500000
14.730000
220.382143
11327.642857
8.412857
10.755714
9.150714
13.962143
0.000000
14.455000
0.500000
11327.642857
2
118.059351
48.824824
46.667129
0.782068
99.197408
207.706291
8343.792325
7.930118
143.326589
23.332466
94.015637
169.582073
195.992275
0.782068
8343.792325
In [60]:
# Descriptive statistics of every column per cluster, transposed so that
# each cluster becomes a column of the displayed table.
clusters_summary = X_tr.groupby(by=['clusters']).describe()
clusters_summary_transposed = clusters_summary.T
clusters_summary_transposed
Out[60]:
clusters
0
1
2
10k
count
7473.000000
14.000000
8677.000000
mean
46.701476
14.568571
46.667129
std
5.086413
23.947241
5.160402
min
0.000000
0.000000
0.000000
25%
42.880000
0.000000
43.470000
50%
47.420000
0.000000
46.950000
75%
50.400000
36.090000
50.120000
max
66.680000
53.570000
61.580000
20k
count
7473.000000
14.000000
8677.000000
mean
94.031163
13.962143
94.015637
std
10.168686
35.492962
10.030560
min
0.000000
0.000000
0.000000
25%
85.950000
0.000000
87.320000
50%
95.430000
0.000000
94.450000
75%
101.480000
0.000000
101.000000
max
131.720000
98.650000
123.180000
25k
count
7473.000000
14.000000
8677.000000
mean
118.229587
0.000000
118.059351
std
13.036739
0.000000
12.914815
min
0.000000
0.000000
0.000000
25%
108.130000
0.000000
109.600000
50%
120.100000
0.000000
118.570000
75%
127.580000
0.000000
127.050000
max
163.620000
0.000000
158.550000
30k
count
7473.000000
14.000000
8677.000000
mean
143.787377
10.755714
143.326589
std
15.833160
40.244198
15.838780
min
68.220000
0.000000
0.000000
25%
131.230000
0.000000
132.920000
50%
145.900000
0.000000
143.820000
...
...
...
...
...
half
std
10.737139
37.444634
10.539430
min
0.000000
0.000000
0.000000
25%
90.680000
0.000000
92.130000
50%
100.750000
0.000000
99.650000
75%
107.100000
0.000000
106.580000
max
138.670000
104.000000
129.880000
official
count
7473.000000
14.000000
8677.000000
mean
208.662765
220.382143
207.706291
std
23.933593
18.144768
23.574875
min
97.580000
173.580000
85.530000
25%
191.530000
214.842500
191.780000
50%
210.420000
224.450000
208.170000
75%
225.220000
230.722500
225.200000
max
284.230000
245.450000
281.600000
overall
count
7473.000000
14.000000
8677.000000
mean
8523.313127
11327.642857
8343.792325
std
5116.560194
4217.833651
4994.673736
min
1.000000
1114.000000
1.000000
25%
4030.000000
9781.250000
4071.000000
50%
8581.000000
12472.500000
7924.000000
75%
12661.000000
14006.250000
12655.000000
max
17598.000000
16825.000000
17596.000000
pace
count
7473.000000
14.000000
8677.000000
mean
7.966590
8.412857
7.930118
std
0.912951
0.690935
0.899336
min
3.730000
6.630000
3.270000
25%
7.320000
8.197500
7.320000
50%
8.030000
8.565000
7.950000
75%
8.600000
8.802500
8.600000
max
10.850000
9.370000
10.750000
120 rows × 3 columns
In [61]:
# Project the normalized features onto two principal components.
pca_2d = PCA(n_components=2)
X_pca = pca_2d.fit_transform(X_tr_std)
# Cluster the 2-D projection with k-means (3 clusters, fixed seed).
pca_kmeans = KMeans(n_clusters=3, random_state=42)
y_pred = pca_kmeans.fit_predict(X_pca)
# Scatter the two components, coloured by the predicted cluster.
first_component, second_component = X_pca[:, 0], X_pca[:, 1]
plt.scatter(first_component, second_component, c=y_pred)
plt.show()
In [62]:
# Mean official time, pace and age for every (cluster, gender) combination.
group_keys = ['clusters', 'gender']
Graph_kmeans_official = pd.pivot_table(X_tr, values='official', index=group_keys)
Graph_kmeans_pace = pd.pivot_table(X_tr, values='pace', index=group_keys)
Graph_kmeans_age = pd.pivot_table(X_tr, values='age', index=group_keys)
print(Graph_kmeans_official, Graph_kmeans_pace, Graph_kmeans_age)
official
clusters gender
0 0.0 218.637580
1.0 192.817769
1 0.0 223.532857
1.0 217.231429
2 0.0 222.032644
1.0 203.714082 pace
clusters gender
0 0.0 8.347072
1.0 7.362196
1 0.0 8.534286
1.0 8.291429
2 0.0 8.476526
1.0 7.777854 age
clusters gender
0 0.0 34.193197
1.0 31.864565
1 0.0 40.142857
1.0 43.571429
2 0.0 48.960338
1.0 48.787062
Spectral Clustering
In [15]:
# We know we're looking for 6 clusters from the comparison with the kmeans.
n_clusters=6
# Declare and fit the model; the fixed seed makes the labelling reproducible between runs.
sc = SpectralClustering(n_clusters=n_clusters, random_state=42).fit(X_tr_std)
# Extract cluster assignments for each data point.
labels = sc.labels_
# Glue the cluster assignments back to the original data
X_tr['clusters'] = labels
X_tr['Gender'] = boston_marathon_scores.gender
X_tr['Overall'] = boston_marathon_scores.overall
# Add the new columns to the column list, but only once: an unconditional extend
# would append duplicates every time this cell (or a similar one) is run.
clmns.extend([col for col in ['clusters', 'Gender', 'Overall'] if col not in clmns])
# Analyze the clusters: per-cluster mean of every column
# (groupby(...).mean() already returns a DataFrame).
X_tr.groupby(['clusters']).mean()
Out[15]:
25k
age
10k
gender
half
official
overall
pace
30k
5k
20k
35k
40k
Gender
Overall
clusters
0
117.848578
49.104459
46.705808
0.777035
99.082917
205.460250
7864.603982
7.844473
142.773846
23.347735
93.929491
168.457354
194.105159
0.777035
7864.603982
1
118.205056
33.151903
46.951070
0.348409
99.416279
205.273537
7829.390674
7.837257
143.102483
23.508974
94.253910
168.663509
194.152764
0.348409
7829.390674
2
0.000000
40.416667
12.936667
0.416667
8.518333
220.857500
11460.166667
8.430833
0.000000
8.661667
8.068333
0.000000
0.000000
0.416667
11460.166667
3
31.868571
39.928571
42.200000
0.785714
89.550714
204.827143
7637.428571
7.820000
141.996429
22.919286
84.792143
155.258571
126.517143
0.785714
7637.428571
4
130.341429
45.428571
50.031429
0.571429
108.240000
236.570000
14046.285714
9.030000
161.281429
21.010000
102.022857
0.000000
223.585714
0.571429
14046.285714
5
119.219489
41.503353
45.890427
0.724225
99.013487
224.082938
11750.388935
8.554849
146.968906
22.873650
93.681865
178.077615
210.484363
0.724225
11750.388935
In [16]:
# Summarise each cluster with describe(), then flip the table so that the
# clusters run across the columns.
per_cluster = X_tr.groupby(['clusters'])
clusters_summary = per_cluster.describe()
clusters_summary_transposed = clusters_summary.T
clusters_summary_transposed
Out[16]:
clusters
0
1
2
3
4
5
10k
count
7333.000000
6412.000000
12.000000
14.000000
7.000000
2386.000000
mean
46.705808
46.951070
12.936667
42.200000
50.031429
45.890427
std
5.067477
4.974319
23.441598
12.654792
2.584411
5.614510
min
0.000000
21.400000
0.000000
0.000000
46.270000
0.000000
25%
43.420000
42.977500
0.000000
43.435000
48.075000
43.020000
50%
46.980000
47.750000
0.000000
45.020000
51.180000
46.260000
75%
50.180000
50.600000
12.030000
47.632500
51.685000
49.430000
max
61.580000
66.680000
53.570000
51.800000
53.250000
57.880000
20k
count
7333.000000
6412.000000
12.000000
14.000000
7.000000
2386.000000
mean
93.929491
94.253910
8.068333
84.792143
102.022857
93.681865
std
10.114733
10.079694
27.949527
25.409862
5.972176
10.072741
min
38.230000
44.380000
0.000000
0.000000
92.820000
0.000000
25%
87.120000
85.950000
0.000000
87.247500
98.875000
87.412500
50%
94.370000
95.890000
0.000000
90.400000
102.270000
94.230000
75%
101.030000
101.600000
0.000000
95.192500
105.610000
100.845000
max
123.180000
131.720000
96.820000
103.920000
110.100000
121.070000
25k
count
7333.000000
6412.000000
12.000000
14.000000
7.000000
2386.000000
mean
117.848578
118.205056
0.000000
31.868571
130.341429
119.219489
std
12.791647
12.783019
0.000000
52.296443
9.249017
12.165837
min
48.320000
56.050000
0.000000
0.000000
115.920000
0.000000
25%
109.220000
107.715000
0.000000
0.000000
126.725000
110.872500
50%
118.320000
120.260000
0.000000
0.000000
129.100000
119.680000
75%
126.850000
127.470000
0.000000
82.575000
134.945000
128.300000
max
153.480000
163.620000
0.000000
113.350000
144.030000
158.550000
30k
count
7333.000000
6412.000000
12.000000
14.000000
7.000000
2386.000000
mean
142.773846
143.102483
0.000000
141.996429
161.281429
146.968906
std
15.661068
15.694454
0.000000
13.742250
13.225840
16.564537
min
59.450000
68.220000
0.000000
113.600000
139.300000
0.000000
25%
132.220000
130.420000
0.000000
134.082500
156.000000
136.330000
50%
143.230000
145.520000
0.000000
139.625000
161.870000
147.915000
...
...
...
...
...
...
...
...
half
std
10.678470
10.639831
29.508372
26.845829
7.034143
10.464197
min
40.270000
46.720000
0.000000
0.000000
97.920000
0.000000
25%
91.900000
90.645000
0.000000
92.120000
104.490000
92.305000
50%
99.530000
101.180000
0.000000
95.275000
107.750000
99.560000
75%
106.600000
107.180000
0.000000
100.762500
112.740000
106.520000
max
129.880000
138.670000
102.220000
109.670000
117.550000
127.050000
official
count
7333.000000
6412.000000
12.000000
14.000000
7.000000
2386.000000
mean
205.460250
205.273537
220.857500
204.827143
236.570000
224.082938
std
22.892748
22.731726
19.680258
20.528396
23.676290
22.654757
min
85.530000
97.580000
173.580000
162.000000
195.030000
114.830000
25%
190.280000
187.837500
212.997500
191.492500
225.100000
207.672500
50%
205.850000
208.210000
225.400000
205.635000
242.100000
226.035000
75%
222.080000
221.600000
231.710000
219.295000
251.140000
240.082500
max
269.550000
278.000000
245.450000
238.220000
266.380000
284.230000
overall
count
7333.000000
6412.000000
12.000000
14.000000
7.000000
2386.000000
mean
7864.603982
7829.390674
11460.166667
7637.428571
14046.285714
11750.388935
std
4834.454210
4911.403729
4570.114716
4612.962167
4648.163498
4783.808730
min
1.000000
1.000000
1114.000000
346.000000
4739.000000
34.000000
25%
3790.000000
3326.000000
9272.750000
4024.000000
12607.000000
7786.000000
50%
7326.000000
7938.000000
12716.000000
7252.000000
16397.000000
12868.000000
75%
11858.000000
11740.250000
14245.000000
11069.250000
17210.000000
16096.250000
max
17568.000000
17589.000000
16825.000000
15721.000000
17554.000000
17598.000000
pace
count
7333.000000
6412.000000
12.000000
14.000000
7.000000
2386.000000
mean
7.844473
7.837257
8.430833
7.820000
9.030000
8.554849
std
0.873348
0.867035
0.749405
0.783768
0.901924
0.864270
min
3.270000
3.730000
6.630000
6.180000
7.450000
4.380000
25%
7.270000
7.170000
8.127500
7.320000
8.590000
7.930000
50%
7.870000
7.950000
8.600000
7.850000
9.250000
8.630000
75%
8.480000
8.470000
8.840000
8.367500
9.580000
9.170000
max
10.280000
10.620000
9.370000
9.100000
10.170000
10.850000
120 rows × 6 columns
In [63]:
# Reduce the normalized features to two principal components.
X_pca = PCA(2).fit_transform(X_tr_std)
# Calculate predicted values with 6 clusters, the number chosen for spectral clustering
# in the analysis above (the plot previously used 3, the k-means setting).
y_pred = SpectralClustering(n_clusters=6).fit_predict(X_pca)
# Plot the solution, coloured by predicted cluster.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
plt.show()
Mean Shift
In [24]:
# Here we set the bandwidth. This function automatically derives a bandwidth
# number based on an inspection of the distances among points in the data.
bandwidth = estimate_bandwidth(X_tr_std, quantile=0.9)
# Declare and fit the model.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X_tr_std)
# Extract cluster assignments for each data point.
labels = ms.labels_
# Coordinates of the cluster centers.
cluster_centers = ms.cluster_centers_
# Count our clusters.
n_clusters_ = len(np.unique(labels))
# Glue the cluster assignments back to the original data
X_tr['clusters'] = labels
X_tr['Gender'] = boston_marathon_scores.gender
X_tr['Overall'] = boston_marathon_scores.overall
# Add the new columns to the column list, but only once: an unconditional extend
# would append duplicates every time this cell (or a similar one) is run.
clmns.extend([col for col in ['clusters', 'Gender', 'Overall'] if col not in clmns])
# Analyze the clusters: report how many were found, then show per-cluster means
# (groupby(...).mean() already returns a DataFrame).
print("Number of estimated clusters: {}".format(n_clusters_))
X_tr.groupby(['clusters']).mean()
Number of estimated clusters: 18
Out[24]:
25k
age
10k
gender
half
official
overall
pace
30k
5k
20k
35k
40k
Gender
Overall
clusters
0
118.195629
41.634089
46.729067
0.599007
99.225182
208.131799
8423.401304
7.946343
143.564142
23.349711
94.047516
169.965941
196.538315
0.599007
8423.401304
1
123.384286
45.357143
0.000000
0.571429
103.810000
216.680000
10269.571429
8.272857
149.630714
17.516429
98.415000
177.007857
204.820000
0.571429
10269.571429
2
0.000000
39.500000
0.000000
0.500000
0.000000
219.632500
11138.875000
8.385000
0.000000
0.000000
0.000000
0.000000
0.000000
0.500000
11138.875000
3
0.000000
41.750000
45.547500
0.750000
96.765000
205.925000
8088.750000
7.860000
142.660000
22.890000
91.538750
168.860000
194.555000
0.750000
8088.750000
4
130.341429
45.428571
50.031429
0.571429
108.240000
236.570000
14046.285714
9.030000
161.281429
21.010000
102.022857
0.000000
223.585714
0.571429
14046.285714
5
111.540000
39.500000
44.425000
0.750000
93.897500
193.430000
4521.000000
7.387500
135.355000
22.262500
89.032500
159.577500
0.000000
0.750000
4521.000000
6
119.052500
46.750000
46.847500
0.500000
99.985000
214.370000
9688.500000
8.185000
0.000000
23.267500
94.575000
173.530000
202.502500
0.500000
9688.500000
7
115.956667
41.000000
45.010000
0.333333
0.000000
205.583333
8294.333333
7.856667
141.570000
22.523333
91.533333
167.580000
193.983333
0.333333
8294.333333
8
121.625000
41.500000
48.775000
0.500000
102.750000
205.265000
8105.500000
7.835000
145.960000
24.650000
0.000000
170.645000
194.890000
0.500000
8105.500000
9
0.000000
31.000000
0.000000
1.000000
0.000000
228.380000
13438.000000
8.720000
154.670000
24.530000
0.000000
184.430000
214.800000
1.000000
13438.000000
10
0.000000
36.000000
48.720000
1.000000
104.000000
218.080000
10692.000000
8.330000
150.580000
24.170000
98.650000
0.000000
0.000000
1.000000
10692.000000
11
115.100000
41.000000
0.000000
0.000000
97.100000
196.080000
4939.000000
7.480000
138.730000
22.750000
0.000000
162.550000
185.830000
0.000000
4939.000000
12
0.000000
23.000000
53.570000
0.000000
0.000000
225.400000
12717.000000
8.600000
0.000000
26.870000
0.000000
0.000000
0.000000
0.000000
12717.000000
13
0.000000
65.000000
0.000000
1.000000
0.000000
216.980000
10373.000000
8.280000
0.000000
0.000000
0.000000
0.000000
202.370000
1.000000
10373.000000
14
0.000000
43.000000
48.120000
0.000000
102.220000
214.130000
9584.000000
8.170000
0.000000
24.200000
96.820000
0.000000
0.000000
0.000000
9584.000000
15
0.000000
57.000000
53.550000
1.000000
0.000000
225.400000
12715.000000
8.600000
0.000000
26.870000
0.000000
0.000000
0.000000
1.000000
12715.000000
16
121.720000
29.000000
46.920000
0.000000
100.950000
221.850000
11810.000000
8.470000
150.850000
0.000000
0.000000
181.100000
209.350000
0.000000
11810.000000
17
0.000000
46.000000
0.000000
0.000000
0.000000
228.300000
13395.000000
8.720000
0.000000
26.000000
0.000000
0.000000
0.000000
0.000000
13395.000000
In [21]:
# describe() per cluster, shown with one column per cluster.
clusters_summary = X_tr.groupby(by=['clusters']).describe()
clusters_summary_transposed = clusters_summary.transpose(copy=False) if False else clusters_summary.T
clusters_summary_transposed
Out[21]:
clusters
0
1
2
3
4
5
6
7
8
9
...
12
13
14
15
16
17
18
19
20
21
10k
count
16105.000000
10.000000
8.000000
7.000000
4.000000
4.000000
4.000000
3.000000
3.000000
3.000000
...
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
mean
46.729067
0.000000
0.000000
45.501429
44.425000
46.847500
0.000000
45.010000
50.280000
49.400000
...
0.00
48.72
0.00
51.18
53.57
0.00
48.12
53.55
46.92
0.00
std
4.914671
0.000000
0.000000
4.923442
1.081558
2.355184
0.000000
6.085294
3.604345
2.386608
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
min
18.030000
0.000000
0.000000
36.900000
43.170000
43.580000
0.000000
38.630000
46.270000
47.420000
...
0.00
48.72
0.00
51.18
53.57
0.00
48.12
53.55
46.92
0.00
25%
43.220000
0.000000
0.000000
43.485000
43.965000
45.905000
0.000000
42.140000
48.795000
48.075000
...
0.00
48.72
0.00
51.18
53.57
0.00
48.12
53.55
46.92
0.00
50%
47.170000
0.000000
0.000000
45.420000
44.365000
47.555000
0.000000
45.650000
51.320000
48.730000
...
0.00
48.72
0.00
51.18
53.57
0.00
48.12
53.55
46.92
0.00
75%
50.280000
0.000000
0.000000
48.710000
44.825000
48.497500
0.000000
48.200000
52.285000
50.390000
...
0.00
48.72
0.00
51.18
53.57
0.00
48.12
53.55
46.92
0.00
max
66.680000
0.000000
0.000000
51.800000
45.800000
48.700000
0.000000
50.750000
53.250000
52.050000
...
0.00
48.72
0.00
51.18
53.57
0.00
48.12
53.55
46.92
0.00
20k
count
16105.000000
10.000000
8.000000
7.000000
4.000000
4.000000
4.000000
3.000000
3.000000
3.000000
...
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
mean
94.047516
98.301000
0.000000
91.482857
89.032500
94.575000
98.700000
91.533333
101.730000
101.850000
...
0.00
98.65
0.00
103.42
0.00
0.00
96.82
0.00
0.00
0.00
std
9.963129
7.606121
0.000000
9.712080
1.795223
4.755562
5.907137
14.224072
8.652647
5.566229
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
min
38.230000
81.900000
0.000000
74.980000
86.830000
87.920000
91.350000
76.570000
92.820000
96.770000
...
0.00
98.65
0.00
103.42
0.00
0.00
96.82
0.00
0.00
0.00
25%
86.770000
94.292500
0.000000
86.975000
88.082500
92.877500
95.700000
84.860000
97.545000
98.875000
...
0.00
98.65
0.00
103.42
0.00
0.00
96.82
0.00
0.00
0.00
50%
94.870000
98.775000
0.000000
91.630000
89.135000
95.855000
99.110000
93.150000
102.270000
100.980000
...
0.00
98.65
0.00
103.42
0.00
0.00
96.82
0.00
0.00
0.00
75%
101.270000
104.217500
0.000000
97.950000
90.085000
97.552500
102.110000
99.015000
106.185000
104.390000
...
0.00
98.65
0.00
103.42
0.00
0.00
96.82
0.00
0.00
0.00
max
131.720000
106.870000
0.000000
103.920000
91.030000
98.670000
105.230000
104.880000
110.100000
107.800000
...
0.00
98.65
0.00
103.42
0.00
0.00
96.82
0.00
0.00
0.00
25k
count
16105.000000
10.000000
8.000000
7.000000
4.000000
4.000000
4.000000
3.000000
3.000000
3.000000
...
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
mean
118.195629
123.299000
0.000000
0.000000
111.540000
119.052500
123.597500
115.956667
127.806667
132.933333
...
0.00
0.00
115.10
130.17
0.00
0.00
0.00
0.00
121.72
0.00
std
12.674378
9.617818
0.000000
0.000000
1.605138
6.217853
7.188743
19.538911
11.900022
9.761825
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
min
48.320000
102.830000
0.000000
0.000000
110.100000
110.420000
114.680000
95.570000
115.920000
125.670000
...
0.00
0.00
115.10
130.17
0.00
0.00
0.00
0.00
121.72
0.00
25%
108.920000
118.290000
0.000000
0.000000
110.235000
116.802500
119.885000
106.675000
121.850000
127.385000
...
0.00
0.00
115.10
130.17
0.00
0.00
0.00
0.00
121.72
0.00
50%
119.180000
123.420000
0.000000
0.000000
111.355000
120.555000
124.095000
117.780000
127.780000
129.100000
...
0.00
0.00
115.10
130.17
0.00
0.00
0.00
0.00
121.72
0.00
75%
127.330000
131.315000
0.000000
0.000000
112.660000
122.805000
127.807500
126.150000
133.750000
136.565000
...
0.00
0.00
115.10
130.17
0.00
0.00
0.00
0.00
121.72
0.00
max
163.620000
134.000000
0.000000
0.000000
113.350000
124.680000
131.520000
134.520000
139.720000
144.030000
...
0.00
0.00
115.10
130.17
0.00
0.00
0.00
0.00
121.72
0.00
30k
count
16105.000000
10.000000
8.000000
7.000000
4.000000
4.000000
4.000000
3.000000
3.000000
3.000000
...
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
mean
143.564142
149.453000
0.000000
139.261429
135.355000
0.000000
150.075000
141.570000
154.596667
169.233333
...
154.67
150.58
138.73
157.48
0.00
0.00
0.00
0.00
150.85
0.00
std
15.677899
11.785741
0.000000
15.166610
2.146012
0.000000
8.655736
25.491828
15.335144
10.977169
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
min
59.450000
125.330000
0.000000
113.600000
132.180000
0.000000
139.150000
115.080000
139.300000
161.870000
...
154.67
150.58
138.73
157.48
0.00
0.00
0.00
0.00
150.85
0.00
25%
132.200000
142.905000
0.000000
131.375000
135.097500
0.000000
145.285000
129.390000
146.910000
162.925000
...
154.67
150.58
138.73
157.48
0.00
0.00
0.00
0.00
150.85
0.00
50%
144.730000
149.165000
0.000000
142.350000
136.170000
0.000000
151.375000
143.700000
154.520000
163.980000
...
154.67
150.58
138.73
157.48
0.00
0.00
0.00
0.00
150.85
0.00
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
half
std
10.521256
8.028551
0.000000
10.297575
1.800748
5.022838
6.131479
0.000000
9.187667
7.857394
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
min
40.270000
86.330000
0.000000
79.100000
91.720000
92.820000
96.480000
0.000000
97.920000
102.280000
...
0.00
104.00
97.10
109.20
0.00
0.00
102.22
0.00
100.95
0.00
25%
91.550000
99.520000
0.000000
91.685000
92.920000
98.520000
100.980000
0.000000
102.835000
104.490000
...
0.00
104.00
97.10
109.20
0.00
0.00
102.22
0.00
100.95
0.00
50%
100.100000
104.130000
0.000000
96.680000
93.970000
101.450000
104.500000
0.000000
107.750000
106.700000
...
0.00
104.00
97.10
109.20
0.00
0.00
102.22
0.00
100.95
0.00
75%
106.850000
109.995000
0.000000
103.450000
94.947500
102.915000
107.615000
0.000000
112.015000
112.125000
...
0.00
104.00
97.10
109.20
0.00
0.00
102.22
0.00
100.95
0.00
max
138.670000
112.700000
0.000000
109.670000
95.930000
104.220000
110.900000
0.000000
116.280000
117.550000
...
0.00
104.00
97.10
109.20
0.00
0.00
102.22
0.00
100.95
0.00
official
count
16105.000000
10.000000
8.000000
7.000000
4.000000
4.000000
4.000000
3.000000
3.000000
3.000000
...
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
mean
208.131799
217.303000
219.632500
201.311429
193.430000
214.370000
215.122500
205.583333
221.903333
254.120000
...
228.38
218.08
196.08
227.92
225.40
216.98
214.13
225.40
221.85
228.30
std
23.747320
15.328319
24.220754
21.732860
8.019406
19.050727
10.866865
37.232860
26.686994
12.141779
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
min
85.530000
194.170000
173.580000
162.000000
184.400000
194.050000
199.900000
167.000000
195.030000
242.100000
...
228.38
218.08
196.08
227.92
225.40
216.98
214.13
225.40
221.85
228.30
25%
191.700000
205.922500
207.037500
191.105000
189.672500
202.090000
211.112500
187.725000
208.655000
247.990000
...
228.38
218.08
196.08
227.92
225.40
216.98
214.13
225.40
221.85
228.30
50%
209.220000
215.690000
227.515000
207.500000
192.775000
212.925000
218.310000
208.450000
222.280000
253.880000
...
228.38
218.08
196.08
227.92
225.40
216.98
214.13
225.40
221.85
228.30
75%
225.200000
226.117500
234.637500
216.025000
196.532500
225.205000
222.320000
224.875000
235.340000
260.130000
...
228.38
218.08
196.08
227.92
225.40
216.98
214.13
225.40
221.85
228.30
max
284.230000
242.780000
245.450000
225.420000
203.770000
237.580000
223.970000
241.300000
248.400000
266.380000
...
228.38
218.08
196.08
227.92
225.40
216.98
214.13
225.40
221.85
228.30
overall
count
16105.000000
10.000000
8.000000
7.000000
4.000000
4.000000
4.000000
3.000000
3.000000
3.000000
...
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
mean
8423.401304
10399.700000
11138.875000
6998.428571
4521.000000
9688.500000
9944.250000
8294.333333
11245.333333
17092.666667
...
13438.00
10692.00
4939.00
13310.00
12717.00
10373.00
9584.00
12715.00
11810.00
13395.00
std
5052.028950
3920.898283
5586.997518
4438.780270
1672.116423
4895.439272
2959.093484
7857.198250
6203.282223
613.063075
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
min
1.000000
4559.000000
1114.000000
346.000000
2759.000000
4534.000000
5825.000000
580.000000
4739.000000
16397.000000
...
13438.00
10692.00
4939.00
13310.00
12717.00
10373.00
9584.00
12715.00
11810.00
13395.00
25%
4055.000000
7360.250000
7676.500000
3945.500000
3701.000000
6421.000000
8801.750000
4298.000000
8321.500000
16862.000000
...
13438.00
10692.00
4939.00
13310.00
12717.00
10373.00
9584.00
12715.00
11810.00
13395.00
50%
8241.000000
10063.000000
13220.000000
7741.000000
4281.000000
9317.500000
10793.500000
8016.000000
11904.000000
17327.000000
...
13438.00
10692.00
4939.00
13310.00
12717.00
10373.00
9584.00
12715.00
11810.00
13395.00
75%
12655.000000
12885.500000
14851.000000
10145.500000
5101.000000
12585.000000
11936.000000
12151.500000
14498.500000
17440.500000
...
13438.00
10692.00
4939.00
13310.00
12717.00
10373.00
9584.00
12715.00
11810.00
13395.00
max
17598.000000
16503.000000
16825.000000
12720.000000
6763.000000
15585.000000
12365.000000
16287.000000
17093.000000
17554.000000
...
13438.00
10692.00
4939.00
13310.00
12717.00
10373.00
9584.00
12715.00
11810.00
13395.00
pace
count
16105.000000
10.000000
8.000000
7.000000
4.000000
4.000000
4.000000
3.000000
3.000000
3.000000
...
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
1.00
mean
7.946343
8.297000
8.385000
7.682857
7.387500
8.185000
8.212500
7.856667
8.470000
9.700000
...
8.72
8.33
7.48
8.70
8.60
8.28
8.17
8.60
8.47
8.72
std
0.905882
0.584524
0.922109
0.827642
0.309556
0.721503
0.416203
1.423388
1.015037
0.460326
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
min
3.270000
7.420000
6.630000
6.180000
7.030000
7.420000
7.630000
6.380000
7.450000
9.250000
...
8.72
8.33
7.48
8.70
8.60
8.28
8.17
8.60
8.47
8.72
25%
7.320000
7.860000
7.905000
7.300000
7.247500
7.720000
8.057500
7.175000
7.965000
9.465000
...
8.72
8.33
7.48
8.70
8.60
8.28
8.17
8.60
8.47
8.72
50%
7.980000
8.230000
8.680000
7.920000
7.370000
8.125000
8.335000
7.970000
8.480000
9.680000
...
8.72
8.33
7.48
8.70
8.60
8.28
8.17
8.60
8.47
8.72
75%
8.600000
8.640000
8.960000
8.240000
7.510000
8.590000
8.490000
8.595000
8.980000
9.925000
...
8.72
8.33
7.48
8.70
8.60
8.28
8.17
8.60
8.47
8.72
max
10.850000
9.270000
9.370000
8.600000
7.780000
9.070000
8.550000
9.220000
9.480000
10.170000
...
8.72
8.33
7.48
8.70
8.60
8.28
8.17
8.60
8.47
8.72
120 rows × 22 columns
In [25]:
# Reduce the normalized features to two principal components.
X_pca = PCA(2).fit_transform(X_tr_std)
# Estimate the bandwidth on the 2-D projection itself: the model below is fitted on
# X_pca, so a bandwidth derived from the full feature space would not match the
# distances between the points actually being clustered.
bandwidth_pca = estimate_bandwidth(X_pca, quantile=0.9)
# Calculate predicted values.
y_pred = MeanShift(bandwidth=bandwidth_pca, bin_seeding=True).fit_predict(X_pca)
# Plot the solution, coloured by predicted cluster.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
plt.show()
In [17]:
# Declare the model and fit it in one statement.
# Note that you can provide arguments to the model, but we didn't.
af = AffinityPropagation().fit(X_tr_std)
print('Done')
# Pull the number of clusters and cluster assignments for each data point.
cluster_centers_indices = af.cluster_centers_indices_
n_clusters_ = len(cluster_centers_indices)
labels = af.labels_
# Glue the cluster assignments back to the original data
X_tr['clusters'] = labels
X_tr['Gender'] = boston_marathon_scores.gender
X_tr['Overall'] = boston_marathon_scores.overall
# Add the new columns to the column list, but only once: an unconditional extend
# would append duplicates every time this cell (or a similar one) is run.
clmns.extend([col for col in ['clusters', 'Gender', 'Overall'] if col not in clmns])
# Analyze the clusters: report how many were found, then show per-cluster means
# (groupby(...).mean() already returns a DataFrame).
print("Number of estimated clusters: {}".format(n_clusters_))
X_tr.groupby(['clusters']).mean()
Done
Number of estimated clusters: 251
Out[17]:
25k
age
10k
gender
half
official
overall
pace
30k
5k
20k
35k
40k
Gender
Overall
clusters
0
51.880000
43.000000
19.148000
1.000000
43.080000
92.754000
9.400000
3.542000
63.870000
9.226000
40.898000
76.494000
87.242000
1.000000
9.400000
1
58.814286
38.142857
21.591429
0.857143
48.728571
107.681429
22.428571
4.115714
73.342857
10.270000
46.268571
88.742857
101.232857
0.857143
22.428571
2
79.596667
42.555556
29.865556
0.888889
65.995556
144.394444
3486.333333
5.512222
99.325556
14.605556
62.567778
119.272222
136.260000
0.888889
3486.333333
3
51.017500
49.500000
19.212500
1.000000
42.605000
90.327500
7.750000
3.452500
62.542500
9.290000
40.477500
74.807500
85.082500
1.000000
7.750000
4
57.156000
41.000000
21.306000
0.800000
47.652000
101.820000
20.400000
3.888000
70.392000
10.136000
45.276000
84.248000
95.932000
0.800000
20.400000
5
131.848333
45.000000
44.376667
0.833333
99.755000
233.056667
13954.666667
8.898333
164.263333
22.131667
93.786667
193.041667
221.220000
0.833333
13954.666667
6
0.000000
37.000000
43.683333
0.666667
92.746667
192.650000
5635.333333
7.353333
133.676667
21.993333
87.916667
157.470000
181.750000
0.666667
5635.333333
7
105.182083
60.541667
41.791667
0.958333
88.628333
180.174167
2933.500000
6.880833
126.957083
20.949583
84.047500
148.961667
170.547500
0.958333
2933.500000
8
110.972500
58.535714
44.063214
0.964286
93.403929
191.054107
4668.089286
7.294643
134.142857
22.101607
88.561964
157.621964
180.764821
0.964286
4668.089286
9
122.701395
46.829457
48.009457
0.550388
102.830310
216.295736
10391.356589
8.258295
149.411240
23.974109
97.425504
177.058992
204.317364
0.550388
10391.356589
10
114.659000
44.500000
46.056000
1.000000
96.658000
220.956000
11392.900000
8.436000
139.021000
23.197000
91.650000
165.798000
202.483000
1.000000
11392.900000
11
116.124444
57.444444
45.626905
0.968254
97.453889
203.575079
7514.071429
7.771667
141.069206
22.821667
92.328492
166.788968
192.265635
0.968254
7514.071429
12
116.609249
41.680751
46.546667
0.553991
98.295634
199.267042
6533.812207
7.608404
140.593568
23.372300
93.217887
164.852676
188.750423
0.553991
6533.812207
13
116.692391
50.347826
46.594420
0.891304
98.382826
198.714420
6150.724638
7.587681
140.623551
23.361957
93.301739
164.740580
188.360217
0.891304
6150.724638
14
110.578000
26.800000
42.462000
0.800000
91.928000
248.208000
16283.800000
9.476000
138.808000
21.234000
87.094000
178.784000
229.872000
0.800000
16283.800000
15
107.839615
33.692308
42.428846
0.923077
90.321538
215.145769
9724.230769
8.215769
132.647308
21.296154
85.590769
164.268846
201.421154
0.923077
9724.230769
16
106.675000
32.000000
42.140000
0.500000
0.000000
187.725000
4298.000000
7.175000
129.390000
21.320000
84.860000
153.185000
176.985000
0.500000
4298.000000
17
117.098605
51.232558
46.693256
1.000000
98.542326
216.117209
10320.046512
8.252093
142.068140
23.506977
93.430698
169.548837
201.450465
1.000000
10320.046512
18
115.245541
32.254777
46.116561
0.375796
97.101783
198.944331
6562.611465
7.595096
139.224395
23.185987
92.081783
163.764140
188.216815
0.375796
6562.611465
19
119.309057
58.490566
46.152642
0.962264
99.569434
217.951887
10739.622642
8.322075
146.331698
22.955849
94.247736
175.279057
204.732830
0.962264
10739.622642
20
113.050000
27.666667
43.616667
0.333333
92.996667
236.006667
14524.666667
9.003333
138.673333
22.046667
87.990000
195.016667
224.593333
0.333333
14524.666667
21
122.548047
35.875000
48.223516
0.171875
102.752734
213.003359
9596.625000
8.132891
148.813125
24.104688
97.376250
175.530859
201.691875
0.171875
9596.625000
22
132.750000
36.000000
45.300000
1.000000
94.820000
229.080000
13647.000000
8.750000
174.520000
22.450000
89.570000
197.070000
219.620000
1.000000
13647.000000
23
97.650000
24.000000
39.220000
1.000000
82.270000
241.620000
16324.000000
9.220000
119.330000
19.830000
78.030000
161.770000
230.050000
1.000000
16324.000000
24
117.728293
23.341463
46.793415
0.365854
98.858293
214.841463
9810.097561
8.201463
143.700976
23.520976
93.726585
171.802927
201.884146
0.365854
9810.097561
25
0.000000
38.000000
45.870000
1.000000
98.400000
238.220000
15721.000000
9.100000
166.450000
23.130000
91.930000
200.020000
227.600000
1.000000
15721.000000
26
112.653333
24.666667
44.510000
0.333333
94.826667
194.580000
5858.666667
7.433333
136.340000
0.000000
89.806667
160.466667
184.140000
0.333333
5858.666667
27
117.573675
38.524096
46.917771
0.415663
99.012952
205.115301
7771.156627
7.830783
142.267048
23.590542
93.907169
167.800482
193.653735
0.415663
7771.156627
28
120.913656
53.193548
47.172366
0.827957
100.997742
215.927527
10345.354839
8.243441
147.719462
23.581398
95.652043
175.846129
203.689140
0.827957
10345.354839
29
116.074000
34.800000
43.846000
0.800000
95.202000
236.736000
13539.300000
9.036000
148.700000
21.918000
89.999000
182.708000
221.378000
0.800000
13539.300000
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
221
115.874184
42.673469
45.857653
0.755102
97.253163
208.741327
8321.234694
7.969082
141.165918
23.038878
92.159898
168.365306
196.411327
0.755102
8321.234694
222
119.938173
39.512690
47.774467
0.340102
101.015178
206.156599
8130.954315
7.871168
144.909239
24.011320
95.792183
170.282081
195.252843
0.340102
8130.954315
223
116.287197
33.560510
46.810127
0.433121
98.273376
195.656051
5794.585987
7.471210
139.674395
23.551911
93.252484
163.004713
185.785287
0.433121
5794.585987
224
121.720000
29.000000
46.920000
0.000000
100.950000
221.850000
11810.000000
8.470000
150.850000
0.000000
0.000000
181.100000
209.350000
0.000000
11810.000000
225
120.766744
53.534884
47.531395
0.976744
101.120698
228.733256
12893.976744
8.733256
148.155116
23.808140
95.796047
179.166047
213.889767
0.976744
12893.976744
226
121.458417
50.891667
47.504750
0.800000
101.566167
216.118167
10132.900000
8.250083
148.209250
23.753583
96.198333
176.190750
203.928000
0.800000
10132.900000
227
117.549773
56.363636
47.297955
0.931818
99.253864
198.214091
6252.522727
7.567045
141.340682
23.769318
94.167500
165.119773
188.182727
0.931818
6252.522727
228
117.201768
42.939024
47.097378
0.615854
98.972073
198.559817
6428.896341
7.581159
140.981585
23.648780
93.886341
164.873598
188.298110
0.615854
6428.896341
229
114.603256
33.511628
45.149535
0.627907
95.881395
215.327209
9778.116279
8.221395
140.828372
22.634419
90.840233
170.989535
202.097442
0.627907
9778.116279
230
120.292027
35.040541
47.345811
0.364865
100.658784
218.332568
10687.702703
8.336351
147.319459
23.700811
95.357973
176.417568
205.741081
0.364865
10687.702703
231
115.883310
46.151724
46.502207
0.779310
97.809379
197.544069
6012.165517
7.542138
139.571517
23.396552
92.789172
163.532000
187.127586
0.779310
6012.165517
232
123.013571
42.959184
47.685510
0.357143
102.650102
219.902347
11153.693878
8.394592
150.351633
23.767551
97.196939
178.971122
207.417449
0.357143
11153.693878
233
117.870411
49.013699
46.665753
0.787671
99.012329
207.639795
8161.253425
7.927740
143.132055
23.396027
93.852260
169.464932
195.891507
0.787671
8161.253425
234
119.457500
56.125000
45.772083
1.000000
98.970000
228.516667
12796.125000
8.722917
148.905417
22.817917
93.666667
182.617500
214.803333
1.000000
12796.125000
235
123.151373
37.441176
47.796176
0.235294
102.759804
218.981765
11053.411765
8.360294
150.522647
23.818333
97.319314
179.113529
206.917059
0.235294
11053.411765
236
114.700556
56.611111
45.436667
1.000000
96.422778
215.272778
10009.222222
8.219444
139.328889
22.772778
91.368889
166.620556
200.129444
1.000000
10009.222222
237
0.000000
46.000000
0.000000
0.000000
0.000000
228.300000
13395.000000
8.720000
0.000000
26.000000
0.000000
0.000000
0.000000
0.000000
13395.000000
238
120.149014
40.183099
46.546479
0.577465
99.976620
222.242394
11449.352113
8.484789
148.167606
23.214225
94.603099
178.832254
209.181972
0.577465
11449.352113
239
120.591377
38.224638
47.550217
0.326087
101.219928
209.267536
8897.311594
7.989783
146.235797
23.767101
95.937899
172.488841
198.126594
0.326087
8897.311594
240
117.789706
53.529412
46.660000
0.931373
98.959314
208.923824
8509.313725
7.976176
143.038824
23.407255
93.793725
169.537647
196.661961
0.931373
8509.313725
241
121.667470
57.385542
47.188554
0.891566
101.581446
216.733373
10764.903614
8.274578
148.893494
23.555542
96.194096
177.106747
204.573614
0.891566
10764.903614
242
121.539621
47.992424
47.755152
0.568182
101.896212
212.162879
9277.303030
8.100530
147.643333
23.879167
96.577803
174.401212
200.674167
0.568182
9277.303030
243
122.611000
46.266667
46.400667
0.900000
101.238667
237.035000
14403.333333
9.047667
153.571000
23.088000
95.542000
187.626667
222.534000
0.900000
14403.333333
244
110.205588
57.647059
43.587059
0.970588
92.563529
197.367647
6159.823529
7.535000
133.829118
21.824412
87.695000
158.894706
185.215882
0.970588
6159.823529
245
126.345634
43.119718
49.374577
0.204225
105.745141
220.125986
11368.014085
8.404225
153.622254
24.638451
100.189718
181.463310
208.426056
0.204225
11368.014085
246
118.554913
40.861272
47.136185
0.456647
99.796994
203.899538
7458.953757
7.785202
143.264509
23.621098
94.629422
168.359884
193.076243
0.456647
7458.953757
247
118.770707
46.555556
46.697778
0.787879
99.523131
214.228586
9672.101010
8.178182
145.048687
23.409798
94.291313
173.045960
201.635152
0.787879
9672.101010
248
134.520000
59.000000
50.750000
0.000000
0.000000
241.300000
16287.000000
9.220000
165.930000
24.930000
104.880000
196.370000
227.980000
0.000000
16287.000000
249
113.978571
52.714286
45.228571
0.857143
95.975714
197.030000
5447.285714
7.524286
137.797143
0.000000
90.990000
161.990000
186.204286
0.857143
5447.285714
250
121.839065
53.582734
47.941583
0.863309
102.317410
211.594604
9116.100719
8.079353
147.807842
23.948417
96.948921
174.247194
200.225755
0.863309
9116.100719
251 rows × 15 columns
In [18]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of every
# column, computed separately for each cluster label.
clusters_summary = X_tr.groupby(by=['clusters']).describe()
# Flip the table so clusters run across the columns and each
# (column, statistic) pair becomes a row, then display it.
clusters_summary_transposed = clusters_summary.T
clusters_summary_transposed
Out[18]:
clusters
0
1
2
3
4
5
6
7
8
9
...
241
242
243
244
245
246
247
248
249
250
10k
count
5.000000
7.000000
9.000000
4.000000
5.000000
6.000000
3.000000
24.000000
56.000000
129.000000
...
83.000000
132.000000
30.000000
34.000000
142.000000
173.000000
99.000000
1.00
7.000000
139.000000
mean
19.148000
21.591429
29.865556
19.212500
21.306000
44.376667
43.683333
41.791667
44.063214
48.009457
...
47.188554
47.755152
46.400667
43.587059
49.374577
47.136185
46.697778
50.75
45.228571
47.941583
std
1.170543
3.465484
12.856572
1.027696
1.654004
4.063942
7.538954
4.458106
4.551551
5.290160
...
5.457879
4.973370
3.627409
4.724065
4.781201
4.948415
4.171106
NaN
3.022590
4.054450
min
18.180000
18.200000
18.180000
18.030000
19.770000
37.730000
36.900000
32.980000
34.700000
23.570000
...
20.150000
34.420000
39.230000
31.500000
32.370000
35.800000
31.630000
50.75
41.900000
35.620000
25%
18.180000
19.250000
19.780000
18.555000
19.770000
42.217500
39.625000
39.555000
40.620000
45.330000
...
45.275000
44.565000
43.235000
40.682500
47.990000
42.820000
44.840000
50.75
43.605000
45.775000
50%
18.730000
20.780000
23.580000
19.250000
20.900000
46.300000
42.350000
41.060000
43.355000
48.250000
...
48.520000
47.740000
47.125000
44.625000
50.255000
47.650000
46.670000
50.75
44.430000
48.200000
75%
19.750000
22.655000
41.500000
19.907500
22.970000
46.970000
47.075000
43.975000
47.327500
52.170000
...
50.720000
52.127500
48.690000
46.482500
52.695000
50.980000
49.015000
50.75
45.890000
50.775000
max
20.900000
28.350000
52.350000
20.320000
23.120000
47.880000
51.800000
53.130000
54.970000
58.530000
...
53.850000
57.720000
52.820000
53.020000
58.570000
57.180000
57.520000
50.75
51.280000
56.180000
20k
count
5.000000
7.000000
9.000000
4.000000
5.000000
6.000000
3.000000
24.000000
56.000000
129.000000
...
83.000000
132.000000
30.000000
34.000000
142.000000
173.000000
99.000000
1.00
7.000000
139.000000
mean
40.898000
46.268571
62.567778
40.477500
45.276000
93.786667
87.916667
84.047500
88.561964
97.425504
...
96.194096
96.577803
95.542000
87.695000
100.189718
94.629422
94.291313
104.88
90.990000
96.948921
std
2.917074
8.611826
26.296880
1.778490
3.058648
9.439968
14.711704
9.085377
9.056660
10.828335
...
10.803554
10.081225
6.874044
9.533775
9.702004
9.943687
8.389691
NaN
6.049554
8.271588
min
38.550000
38.800000
38.550000
38.230000
41.700000
77.970000
74.980000
66.630000
70.430000
47.930000
...
43.880000
69.550000
84.430000
63.550000
65.520000
71.820000
63.920000
104.88
84.680000
72.320000
25%
38.570000
40.825000
42.020000
39.467500
43.870000
90.165000
79.915000
79.437500
82.132500
91.870000
...
92.365000
89.860000
89.267500
81.367500
96.770000
85.950000
90.505000
104.88
87.405000
92.340000
50%
39.870000
43.900000
50.870000
40.825000
43.900000
96.315000
84.850000
82.325000
86.720000
98.430000
...
98.450000
97.125000
97.185000
88.925000
101.775000
95.650000
93.930000
104.88
90.930000
97.150000
75%
42.050000
47.725000
84.580000
41.835000
47.930000
98.310000
94.385000
88.660000
94.820000
105.300000
...
103.465000
105.085000
100.917500
93.815000
107.395000
102.680000
99.075000
104.88
91.720000
102.885000
max
45.450000
64.080000
110.870000
42.030000
48.980000
105.030000
103.920000
107.850000
111.450000
120.700000
...
111.530000
116.230000
107.050000
107.180000
118.920000
114.650000
116.420000
104.88
103.070000
113.930000
25k
count
5.000000
7.000000
9.000000
4.000000
5.000000
6.000000
3.000000
24.000000
56.000000
129.000000
...
83.000000
132.000000
30.000000
34.000000
142.000000
173.000000
99.000000
1.00
7.000000
139.000000
mean
51.880000
58.814286
79.596667
51.017500
57.156000
131.848333
0.000000
105.182083
110.972500
122.701395
...
121.667470
121.539621
122.611000
110.205588
126.345634
118.554913
118.770707
134.52
113.978571
121.839065
std
4.016118
11.045453
33.893913
2.058517
3.769235
12.763898
0.000000
11.447189
11.345336
13.686596
...
13.691701
12.751737
8.995522
12.038437
12.342555
12.530305
10.600679
NaN
7.683835
10.436578
min
48.750000
49.870000
48.750000
48.320000
52.620000
110.950000
0.000000
83.470000
88.470000
59.800000
...
55.600000
87.630000
107.850000
80.380000
82.180000
89.780000
80.380000
134.52
106.220000
91.020000
25%
48.750000
51.565000
53.080000
49.955000
55.600000
125.967500
0.000000
99.322500
103.230000
115.450000
...
116.755000
113.032500
113.882500
102.190000
121.632500
107.530000
114.025000
134.52
109.435000
116.000000
50%
50.500000
55.600000
63.830000
51.560000
55.630000
135.205000
0.000000
103.090000
108.550000
124.330000
...
124.200000
122.215000
124.415000
111.065000
128.290000
119.650000
118.300000
134.52
113.350000
121.930000
75%
53.070000
60.690000
107.280000
52.622500
59.850000
139.072500
0.000000
111.090000
119.062500
132.670000
...
130.400000
132.077500
128.597500
117.840000
135.377500
128.500000
124.800000
134.52
115.065000
129.300000
max
58.330000
81.720000
142.680000
52.630000
62.080000
146.480000
0.000000
135.380000
139.830000
152.270000
...
140.670000
146.370000
138.980000
134.420000
149.020000
144.330000
145.700000
134.52
129.280000
143.330000
30k
count
5.000000
7.000000
9.000000
4.000000
5.000000
6.000000
3.000000
24.000000
56.000000
129.000000
...
83.000000
132.000000
30.000000
34.000000
142.000000
173.000000
99.000000
1.00
7.000000
139.000000
mean
63.870000
73.342857
99.325556
62.542500
70.392000
164.263333
133.676667
126.957083
134.142857
149.411240
...
148.893494
147.643333
153.571000
133.829118
153.622254
143.264509
145.048687
165.93
137.797143
147.807842
std
5.217461
14.766886
42.514617
2.234925
4.935693
14.671503
22.566183
14.021694
13.756113
16.710898
...
16.851564
15.550967
11.652216
14.830801
14.983018
15.214577
13.007040
NaN
9.392784
12.755483
min
59.920000
61.350000
60.100000
59.450000
63.980000
137.020000
113.600000
100.530000
107.050000
73.300000
...
68.180000
106.520000
133.630000
98.070000
100.000000
108.250000
98.370000
165.93
128.070000
110.350000
25%
59.920000
63.875000
65.970000
61.640000
68.000000
161.610000
121.465000
119.522500
125.080000
140.600000
...
142.750000
136.942500
142.957500
123.702500
147.820000
129.750000
139.350000
165.93
132.535000
140.705000
50%
61.630000
68.180000
78.900000
63.185000
69.530000
168.615000
129.330000
124.540000
131.335000
151.420000
...
152.100000
148.315000
154.650000
134.645000
156.040000
144.270000
144.270000
165.93
136.670000
148.230000
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
half
std
3.164830
8.989719
27.897963
1.840480
3.168465
9.724984
15.546177
9.582230
9.547910
11.433770
...
11.458470
10.606785
7.256935
10.051022
10.237196
10.476615
8.844753
NaN
6.376530
8.737190
min
40.570000
40.930000
40.570000
40.270000
43.970000
82.530000
79.100000
70.300000
74.320000
50.280000
...
46.180000
73.420000
89.370000
67.200000
69.100000
75.820000
67.500000
0.00
89.420000
76.350000
25%
40.570000
43.065000
44.180000
41.567500
46.200000
97.702500
84.285000
83.752500
86.692500
96.970000
...
97.510000
94.867500
94.365000
85.845000
102.100000
90.630000
95.505000
0.00
92.150000
97.435000
50%
41.980000
46.180000
53.370000
42.985000
46.220000
102.050000
89.470000
86.815000
91.450000
104.000000
...
103.930000
102.600000
102.725000
93.680000
107.375000
100.850000
99.030000
0.00
95.820000
102.480000
75%
44.200000
50.280000
89.320000
44.022500
50.270000
103.712500
99.570000
93.555000
100.055000
111.120000
...
109.360000
110.877500
106.725000
98.972500
113.547500
108.300000
104.580000
0.00
96.785000
108.665000
max
48.080000
67.300000
117.320000
44.180000
51.600000
111.380000
109.670000
113.800000
117.580000
127.300000
...
117.580000
122.580000
113.330000
113.070000
125.300000
120.950000
122.770000
0.00
108.720000
120.320000
official
count
5.000000
7.000000
9.000000
4.000000
5.000000
6.000000
3.000000
24.000000
56.000000
129.000000
...
83.000000
132.000000
30.000000
34.000000
142.000000
173.000000
99.000000
1.00
7.000000
139.000000
mean
92.754000
107.681429
144.394444
90.327500
101.820000
233.056667
192.650000
180.174167
191.054107
216.295736
...
216.733373
212.162879
237.035000
197.367647
220.125986
203.899538
214.228586
241.30
197.030000
211.594604
std
7.062208
22.301293
60.885144
3.328116
7.491559
19.637942
31.763106
20.247115
19.496418
23.989609
...
24.611155
22.066124
17.385564
21.897770
21.434793
21.522678
19.128276
NaN
12.999608
18.065614
min
87.200000
88.430000
88.320000
85.530000
91.780000
196.830000
162.000000
142.450000
152.370000
105.400000
...
99.220000
153.130000
209.280000
144.150000
144.080000
153.820000
145.720000
241.30
183.650000
158.400000
25%
87.220000
93.185000
95.980000
89.542500
97.570000
229.877500
176.265000
168.720000
178.120000
203.680000
...
207.010000
197.675000
219.907500
182.645000
211.797500
184.820000
205.300000
241.30
189.740000
200.880000
50%
90.020000
101.070000
114.520000
91.330000
102.100000
238.300000
190.530000
176.470000
186.600000
217.500000
...
222.020000
213.035000
239.545000
199.660000
223.700000
205.550000
213.550000
241.30
194.670000
212.150000
75%
95.480000
112.785000
194.730000
92.115000
106.900000
243.295000
207.975000
191.415000
205.380000
233.470000
...
232.150000
229.995000
249.167500
212.777500
236.080000
220.700000
225.400000
241.30
198.955000
224.515000
max
103.850000
152.330000
254.320000
93.120000
110.750000
252.880000
225.420000
233.900000
239.970000
269.550000
...
248.580000
252.950000
266.120000
240.600000
260.750000
248.200000
262.570000
241.30
223.500000
248.680000
overall
count
5.000000
7.000000
9.000000
4.000000
5.000000
6.000000
3.000000
24.000000
56.000000
129.000000
...
83.000000
132.000000
30.000000
34.000000
142.000000
173.000000
99.000000
1.00
7.000000
139.000000
mean
9.400000
22.428571
3486.333333
7.750000
20.400000
13954.666667
5635.333333
2933.500000
4668.089286
10391.356589
...
10764.903614
9277.303030
14403.333333
6159.823529
11368.014085
7458.953757
9672.101010
16287.00
5447.285714
9116.100719
std
8.473488
15.977663
6105.646260
4.991660
7.635444
4557.433123
6379.371860
3414.465774
4037.332361
4992.539485
...
4655.468169
5046.808356
3247.004289
4242.884342
4650.684574
4859.636492
4176.880201
NaN
3199.037966
4228.294858
min
2.000000
5.000000
4.000000
1.000000
10.000000
5100.000000
346.000000
31.000000
103.000000
24.000000
...
19.000000
110.000000
8261.000000
38.000000
37.000000
118.000000
45.000000
16287.00
2647.000000
236.000000
25%
3.000000
10.500000
15.000000
5.500000
16.000000
13803.000000
2093.000000
708.250000
1731.250000
6728.000000
...
7646.000000
5291.000000
11226.000000
2510.500000
8929.500000
2812.000000
7180.500000
16287.00
3692.500000
6064.500000
50%
6.000000
20.000000
33.000000
9.000000
21.000000
15625.500000
3840.000000
1467.500000
3107.000000
10508.000000
...
11844.000000
9269.000000
15993.500000
5772.000000
12270.500000
7243.000000
9421.000000
16287.00
4660.000000
9039.000000
75%
14.000000
31.000000
4673.000000
11.250000
26.000000
16563.750000
8280.000000
4011.750000
7200.750000
14654.000000
...
14343.500000
13851.250000
17141.000000
9203.750000
15257.500000
11482.000000
12717.000000
16287.00
5602.500000
12505.500000
max
22.000000
49.000000
17338.000000
12.000000
29.000000
17305.000000
12720.000000
14768.000000
16083.000000
17568.000000
...
17109.000000
17309.000000
17550.000000
16185.000000
17479.000000
17076.000000
17510.000000
16287.00
12234.000000
17115.000000
pace
count
5.000000
7.000000
9.000000
4.000000
5.000000
6.000000
3.000000
24.000000
56.000000
129.000000
...
83.000000
132.000000
30.000000
34.000000
142.000000
173.000000
99.000000
1.00
7.000000
139.000000
mean
3.542000
4.115714
5.512222
3.452500
3.888000
8.898333
7.353333
6.880833
7.294643
8.258295
...
8.274578
8.100530
9.047667
7.535000
8.404225
7.785202
8.178182
9.22
7.524286
8.079353
std
0.272617
0.851545
2.321432
0.128679
0.287002
0.747942
1.211666
0.773844
0.742960
0.914664
...
0.940157
0.840754
0.662777
0.836465
0.817842
0.820805
0.730255
NaN
0.493824
0.688615
min
3.330000
3.380000
3.380000
3.270000
3.500000
7.520000
6.180000
5.430000
5.820000
4.030000
...
3.780000
5.850000
7.980000
5.500000
5.500000
5.880000
5.570000
9.22
7.020000
6.050000
25%
3.330000
3.560000
3.670000
3.420000
3.730000
8.772500
6.730000
6.447500
6.807500
7.780000
...
7.905000
7.545000
8.392500
6.972500
8.090000
7.050000
7.830000
9.22
7.245000
7.670000
50%
3.430000
3.870000
4.370000
3.485000
3.900000
9.100000
7.280000
6.740000
7.125000
8.300000
...
8.480000
8.135000
9.145000
7.620000
8.530000
7.850000
8.150000
9.22
7.430000
8.100000
75%
3.650000
4.310000
7.430000
3.517500
4.080000
9.292500
7.940000
7.307500
7.835000
8.920000
...
8.865000
8.782500
9.515000
8.122500
9.020000
8.430000
8.610000
9.22
7.600000
8.575000
max
3.970000
5.820000
9.700000
3.570000
4.230000
9.650000
8.600000
8.930000
9.170000
10.280000
...
9.480000
9.650000
10.150000
9.180000
9.950000
9.470000
10.020000
9.22
8.530000
9.500000
120 rows × 251 columns
Of all the clustering techniques that have been used (k-means, spectral, mean shift and affinity propagation), the ones that present the most stability in terms of the variance within the clusters are k-means and spectral clustering. When the bandwidth is estimated with a quantile close to 1 (0.9), the number of clusters obtained with mean shift is reduced to 18 (from 58 when using quantile 0.25). In this case most of the clusters are nearly empty, which is why this clustering method has been discarded. The same applies to affinity propagation, as there are 251 clusters with less than 1% of the data in each of them.
With k-means and spectral clustering, each cluster contains between 1% and 43% of the data points. In this case, the better of the two from a similarity-analysis perspective is k-means, as it achieves similar silhouette values with fewer clusters. For k-means, the best solution according to the elbow method is 3 clusters, although the last cluster contains less than 1% of the data, so 2 clusters should be considered.
From the 3 clusters we can see that in the first cluster men finished the marathon quicker than women, with an average pace of 7.36, their official times being lower in all cases. Additionally, in cluster one we see that men are younger than women and come in at position 192, against position 218 for women. In the third cluster men and women have a similar age, and there is a difference of 20 positions between the first and the last person in the cluster, whose members are on average 48 years old.
Content source: borja876/Thinkful-DataScience-Borja
Similar notebooks: