In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import preprocessing

from scipy.spatial.distance import cdist
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.cluster import AffinityPropagation

from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_samples, silhouette_score

In [3]:
# Load the race results from the CSV file and preview the first rows.
results_csv = 'results 2013.csv'
boston_marathon_results = pd.read_csv(results_csv)
boston_marathon_results.head()


Out[3]:
25k age name division 10k gender half official bib ctz ... overall pace state 30k 5k genderdiv 20k 35k city 40k
0 49.87 28 Cassidy, Josh R. 9 18.18 M 40.93 90.90 W1 NaN ... 9 3.47 ON 62.07 8.90 9 38.80 74.73 Toronto 85.55
1 77.27 30 Korir, Wesley 5 30.90 M 64.90 132.50 1 NaN ... 5 5.07 NaN 92.97 15.90 5 61.52 108.78 Kenya 124.77
2 77.23 23 Desisa, Lelisa 1 30.90 M 64.92 130.37 2 NaN ... 1 4.98 NaN 92.72 15.93 1 61.53 108.68 Ambo 123.78
3 50.50 32 Fearnley, Kurt H. 5 18.73 M 42.00 88.43 W2 NaN ... 5 3.38 NaN 61.35 8.98 5 39.88 73.00 Hamilton 83.43
4 48.75 39 Hokinoue, Kota 3 18.18 M 40.57 87.22 W3 NaN ... 3 3.33 NaN 59.92 8.92 3 38.55 71.68 Iizuka 81.88

5 rows × 21 columns


In [4]:
# List every column available in the raw results table.
boston_marathon_results.columns


Out[4]:
Index(['25k', 'age', 'name', 'division', '10k', 'gender', 'half', 'official',
       'bib', 'ctz', 'country', 'overall', 'pace', 'state', '30k', '5k',
       'genderdiv', '20k', '35k', 'city', '40k'],
      dtype='object')

In [5]:
# Drop the identifier / location / category columns; the remaining columns
# are the ones analysed in the rest of the notebook.
non_feature_columns = ['city', 'country', 'genderdiv', 'bib', 'ctz', 'state', 'name', 'division']
boston_marathon_scores = boston_marathon_results.drop(non_feature_columns, axis=1)

In [6]:
# Replace the '-' placeholder with 0 (returns a new frame instead of mutating
# in place, so re-running the cell is safe).
# NOTE(review): presumably '-' marks a missing split time; encoding it as 0
# makes those runners look like they have zero-minute splits -- confirm that
# this is intended rather than dropping or imputing those rows.
boston_marathon_scores = boston_marathon_scores.replace('-', 0)

# Encode gender as 0 (F) / 1 (M). Only do so while the column still holds the
# raw labels: mapping an already-encoded column would turn every value into
# NaN, which silently happened when this cell was executed twice.
gender_codes = {'F': 0, 'M': 1}
if boston_marathon_scores['gender'].dtype == object:
    boston_marathon_scores['gender'] = boston_marathon_scores['gender'].map(gender_codes)

print(boston_marathon_scores.columns.unique())
boston_marathon_scores.head()


Index(['25k', 'age', '10k', 'gender', 'half', 'official', 'overall', 'pace',
       '30k', '5k', '20k', '35k', '40k'],
      dtype='object')
Out[6]:
25k age 10k gender half official overall pace 30k 5k 20k 35k 40k
0 49.87 28 18.18 1 40.93 90.90 9 3.47 62.07 8.90 38.80 74.73 85.55
1 77.27 30 30.90 1 64.90 132.50 5 5.07 92.97 15.90 61.52 108.78 124.77
2 77.23 23 30.90 1 64.92 130.37 1 4.98 92.72 15.93 61.53 108.68 123.78
3 50.50 32 18.73 1 42.00 88.43 5 3.38 61.35 8.98 39.88 73.00 83.43
4 48.75 39 18.18 1 40.57 87.22 3 3.33 59.92 8.92 38.55 71.68 81.88

In [7]:
# Cast every remaining column to float64 and print the resulting schema.
boston_marathon_scores = boston_marathon_scores.astype(np.float64)
boston_marathon_scores.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16164 entries, 0 to 16163
Data columns (total 13 columns):
25k         16164 non-null float64
age         16164 non-null float64
10k         16164 non-null float64
gender      16164 non-null float64
half        16164 non-null float64
official    16164 non-null float64
overall     16164 non-null float64
pace        16164 non-null float64
30k         16164 non-null float64
5k          16164 non-null float64
20k         16164 non-null float64
35k         16164 non-null float64
40k         16164 non-null float64
dtypes: float64(13)
memory usage: 1.6 MB

In [8]:
# Work on an independent copy: plain assignment would only create a second
# name for the same DataFrame, so the cluster labels written to X_tr in later
# cells would also be written into boston_marathon_scores.
X_tr = boston_marathon_scores.copy()

# Columns used as clustering features.
clmns = ['age', 'official','40k', '35k', '30k', '25k', 'half', '20k', '10k', '5k', 'pace']

# Rescale every row (runner) to unit L2 norm.
# NOTE(review): sklearn's normalize() works per sample; it does NOT standardize
# each column to zero mean / unit variance as the `_std` suffix suggests.
# Confirm this is the intended preprocessing (StandardScaler is already
# imported if column-wise standardization was meant).
X_tr_std = normalize(X_tr[clmns])

Compare Spectral Clustering against K-Means using the Silhouette Score

As there is no ground truth, the clusters produced by spectral clustering and k-means are evaluated with the silhouette coefficient. The results show that spectral clustering needs 6 clusters to reach a silhouette score similar to the one k-means obtains with 3 clusters.


In [19]:
# Compare K-Means against Spectral Clustering using the mean silhouette score
# (the average over all samples), which reflects how dense and how well
# separated the resulting clusters are.
range_n_clusters = np.arange(10)+2  # candidate cluster counts 2..11

for n_clusters in range_n_clusters:
    # Both clusterers are seeded with 10 so the scores are reproducible.
    spec_clust = SpectralClustering(n_clusters=n_clusters, random_state=10)
    cluster_labels1 = spec_clust.fit_predict(X_tr_std)
    silhouette_avg1 = silhouette_score(X_tr_std, cluster_labels1)

    # fit_predict() already fits the model, so no separate fit() call is made
    # (the extra fit doubled the run time and, unseeded, was thrown away).
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10,
                    random_state=10)
    cluster_labels2 = kmeans.fit_predict(X_tr_std)
    silhouette_avg2 = silhouette_score(X_tr_std, cluster_labels2)

    print("For n_clusters =", n_clusters,
          "av. sil_score for Spec. clust is :", silhouette_avg1,
         "av. sil_score for kmeans is :",silhouette_avg2 )


For n_clusters = 2 av. sil_score for Spec. clust is : 0.961171331526 av. sil_score for kmeans is : 0.960595115442
For n_clusters = 3 av. sil_score for Spec. clust is : 0.387076810597 av. sil_score for kmeans is : 0.388209250616
For n_clusters = 4 av. sil_score for Spec. clust is : 0.388415111093 av. sil_score for kmeans is : 0.396003681008
For n_clusters = 5 av. sil_score for Spec. clust is : 0.389380079665 av. sil_score for kmeans is : 0.397697024637
For n_clusters = 6 av. sil_score for Spec. clust is : 0.390484776573 av. sil_score for kmeans is : 0.329749119438
For n_clusters = 7 av. sil_score for Spec. clust is : 0.385483467461 av. sil_score for kmeans is : 0.332703384807
For n_clusters = 8 av. sil_score for Spec. clust is : 0.384559772179 av. sil_score for kmeans is : 0.335751771772
For n_clusters = 9 av. sil_score for Spec. clust is : 0.386544891208 av. sil_score for kmeans is : 0.336687926662
For n_clusters = 10 av. sil_score for Spec. clust is : 0.379711180133 av. sil_score for kmeans is : 0.292708178952
For n_clusters = 11 av. sil_score for Spec. clust is : 0.380947772569 av. sil_score for kmeans is : 0.293759032687

The optimal number of clusters for k-means will be determined using the elbow method. Once that number is set, the number of clusters for spectral clustering will be chosen so that its silhouette score matches the one obtained with k-means.

K-Means


In [9]:
# Elbow method: for k = 1..9 compute the distortion, i.e. the mean Euclidean
# distance from every sample to its nearest cluster center, and look for the
# bend in the curve.
distortions = []
K = range(1,10)
for k in K:
    # Fit once (seeded) on the same scaled features the final model uses.
    # X_tr holds the unscaled columns (including the 'overall' rank and, once a
    # clustering cell has run, the 'clusters' labels), which would dominate
    # the distances.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(X_tr_std)
    nearest_center_dist = cdist(X_tr_std, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_center_dist.mean())

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()


The elbow method shows that the optimal number of clusters for k-means is 3, based on the average Euclidean distance from each sample to its nearest cluster center. The inertia values point to the same result: the decrease in inertia becomes much smaller when moving from 3 to 4 clusters than it was up to 3 clusters.


In [10]:
# Print the K-Means inertia for k = 1..9 to cross-check the elbow plot.
for i in range(1, 10):
    km = KMeans(n_clusters=i, init='k-means++', n_init=10)
    km.fit(X_tr_std)
    print(i, km.inertia_)


1 30.1263346974
2 19.0457804842
3 13.188426726
4 11.2720106651
5 9.93710134099
6 8.54109431453
7 7.47659624147
8 6.8543306694
9 6.17804693014

In [59]:
# Fit the final 3-cluster K-Means model on the scaled features. The seed keeps
# the cluster ids stable between runs, so the tables below stay comparable.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42).fit(X_tr_std)
labels = kmeans.labels_

# Attach the labels (and capitalised gender / overall columns) to the data.
X_tr['clusters'] = labels
X_tr['Gender'] = boston_marathon_scores.gender
X_tr['Overall'] = boston_marathon_scores.overall

# Register the added columns only once: a plain extend() appended duplicates
# every time this (or another clustering) cell was executed.
clmns += [col for col in ['clusters', 'Gender', 'Overall'] if col not in clmns]

# Per-cluster mean of every column (groupby().mean() already returns a
# DataFrame, so no extra wrapping is needed).
X_tr.groupby(['clusters']).mean()


Out[59]:
25k age 10k gender half official overall pace 30k 5k 20k 35k 40k Gender Overall
clusters
0 118.229587 33.293590 46.701476 0.386324 99.217164 208.662765 8523.313127 7.966590 143.787377 23.352465 94.031163 170.262460 197.110309 0.386324 8523.313127
1 0.000000 41.857143 14.568571 0.500000 14.730000 220.382143 11327.642857 8.412857 10.755714 9.150714 13.962143 0.000000 14.455000 0.500000 11327.642857
2 118.059351 48.824824 46.667129 0.782068 99.197408 207.706291 8343.792325 7.930118 143.326589 23.332466 94.015637 169.582073 195.992275 0.782068 8343.792325

In [60]:
# Descriptive statistics of every column within each cluster, transposed so
# that clusters become columns and (feature, statistic) pairs become rows.
per_cluster = X_tr.groupby(['clusters'])
clusters_summary = per_cluster.describe()
clusters_summary_transposed = clusters_summary.T
clusters_summary_transposed


Out[60]:
clusters 0 1 2
10k count 7473.000000 14.000000 8677.000000
mean 46.701476 14.568571 46.667129
std 5.086413 23.947241 5.160402
min 0.000000 0.000000 0.000000
25% 42.880000 0.000000 43.470000
50% 47.420000 0.000000 46.950000
75% 50.400000 36.090000 50.120000
max 66.680000 53.570000 61.580000
20k count 7473.000000 14.000000 8677.000000
mean 94.031163 13.962143 94.015637
std 10.168686 35.492962 10.030560
min 0.000000 0.000000 0.000000
25% 85.950000 0.000000 87.320000
50% 95.430000 0.000000 94.450000
75% 101.480000 0.000000 101.000000
max 131.720000 98.650000 123.180000
25k count 7473.000000 14.000000 8677.000000
mean 118.229587 0.000000 118.059351
std 13.036739 0.000000 12.914815
min 0.000000 0.000000 0.000000
25% 108.130000 0.000000 109.600000
50% 120.100000 0.000000 118.570000
75% 127.580000 0.000000 127.050000
max 163.620000 0.000000 158.550000
30k count 7473.000000 14.000000 8677.000000
mean 143.787377 10.755714 143.326589
std 15.833160 40.244198 15.838780
min 68.220000 0.000000 0.000000
25% 131.230000 0.000000 132.920000
50% 145.900000 0.000000 143.820000
... ... ... ... ...
half std 10.737139 37.444634 10.539430
min 0.000000 0.000000 0.000000
25% 90.680000 0.000000 92.130000
50% 100.750000 0.000000 99.650000
75% 107.100000 0.000000 106.580000
max 138.670000 104.000000 129.880000
official count 7473.000000 14.000000 8677.000000
mean 208.662765 220.382143 207.706291
std 23.933593 18.144768 23.574875
min 97.580000 173.580000 85.530000
25% 191.530000 214.842500 191.780000
50% 210.420000 224.450000 208.170000
75% 225.220000 230.722500 225.200000
max 284.230000 245.450000 281.600000
overall count 7473.000000 14.000000 8677.000000
mean 8523.313127 11327.642857 8343.792325
std 5116.560194 4217.833651 4994.673736
min 1.000000 1114.000000 1.000000
25% 4030.000000 9781.250000 4071.000000
50% 8581.000000 12472.500000 7924.000000
75% 12661.000000 14006.250000 12655.000000
max 17598.000000 16825.000000 17596.000000
pace count 7473.000000 14.000000 8677.000000
mean 7.966590 8.412857 7.930118
std 0.912951 0.690935 0.899336
min 3.730000 6.630000 3.270000
25% 7.320000 8.197500 7.320000
50% 8.030000 8.565000 7.950000
75% 8.600000 8.802500 8.600000
max 10.850000 9.370000 10.750000

120 rows × 3 columns


In [61]:
# Project the scaled features onto their first two principal components.
pca_2d = PCA(n_components=2)
X_pca = pca_2d.fit_transform(X_tr_std)

# Cluster the 2-D projection with a seeded 3-cluster K-Means.
kmeans_2d = KMeans(n_clusters=3, random_state=42)
y_pred = kmeans_2d.fit_predict(X_pca)

# Scatter the projection, coloured by cluster assignment.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
plt.scatter(pc1, pc2, c=y_pred)
plt.show()



In [62]:
# Average finish time, pace and age for every (cluster, gender) group
# (pivot_table aggregates with the mean by default).
group_keys = ['clusters', 'gender']
Graph_kmeans_official = pd.pivot_table(X_tr, values='official', index=group_keys)
Graph_kmeans_pace = pd.pivot_table(X_tr, values='pace', index=group_keys)
Graph_kmeans_age = pd.pivot_table(X_tr, values='age', index=group_keys)

# Separate the tables with a blank line; with the default single-space
# separator the header of each table was glued to the last row of the
# previous one.
print(Graph_kmeans_official, Graph_kmeans_pace, Graph_kmeans_age, sep='\n\n')


                   official
clusters gender            
0        0.0     218.637580
         1.0     192.817769
1        0.0     223.532857
         1.0     217.231429
2        0.0     222.032644
         1.0     203.714082                      pace
clusters gender          
0        0.0     8.347072
         1.0     7.362196
1        0.0     8.534286
         1.0     8.291429
2        0.0     8.476526
         1.0     7.777854                        age
clusters gender           
0        0.0     34.193197
         1.0     31.864565
1        0.0     40.142857
         1.0     43.571429
2        0.0     48.960338
         1.0     48.787062

Spectral Clustering


In [15]:
# We know we're looking for 6 clusters from the comparison with the kmeans.
n_clusters=6

# Declare and fit the model (seeded so the cluster ids are reproducible).
sc = SpectralClustering(n_clusters=n_clusters, random_state=42).fit(X_tr_std)

# Extract cluster assignments for each data point.
labels = sc.labels_

# Attach the labels (and capitalised gender / overall columns) to the data.
# Note that this overwrites any 'clusters' column written by another model.
X_tr['clusters'] = labels
X_tr['Gender'] = boston_marathon_scores.gender
X_tr['Overall'] = boston_marathon_scores.overall

# Register the added columns only once: a plain extend() appended duplicates
# every time this (or another clustering) cell was executed.
clmns += [col for col in ['clusters', 'Gender', 'Overall'] if col not in clmns]

# Per-cluster mean of every column.
X_tr.groupby(['clusters']).mean()


Out[15]:
25k age 10k gender half official overall pace 30k 5k 20k 35k 40k Gender Overall
clusters
0 117.848578 49.104459 46.705808 0.777035 99.082917 205.460250 7864.603982 7.844473 142.773846 23.347735 93.929491 168.457354 194.105159 0.777035 7864.603982
1 118.205056 33.151903 46.951070 0.348409 99.416279 205.273537 7829.390674 7.837257 143.102483 23.508974 94.253910 168.663509 194.152764 0.348409 7829.390674
2 0.000000 40.416667 12.936667 0.416667 8.518333 220.857500 11460.166667 8.430833 0.000000 8.661667 8.068333 0.000000 0.000000 0.416667 11460.166667
3 31.868571 39.928571 42.200000 0.785714 89.550714 204.827143 7637.428571 7.820000 141.996429 22.919286 84.792143 155.258571 126.517143 0.785714 7637.428571
4 130.341429 45.428571 50.031429 0.571429 108.240000 236.570000 14046.285714 9.030000 161.281429 21.010000 102.022857 0.000000 223.585714 0.571429 14046.285714
5 119.219489 41.503353 45.890427 0.724225 99.013487 224.082938 11750.388935 8.554849 146.968906 22.873650 93.681865 178.077615 210.484363 0.724225 11750.388935

In [16]:
# Descriptive statistics of every column within each cluster, transposed so
# that clusters become columns and (feature, statistic) pairs become rows.
per_cluster = X_tr.groupby(['clusters'])
clusters_summary = per_cluster.describe()
clusters_summary_transposed = clusters_summary.T
clusters_summary_transposed


Out[16]:
clusters 0 1 2 3 4 5
10k count 7333.000000 6412.000000 12.000000 14.000000 7.000000 2386.000000
mean 46.705808 46.951070 12.936667 42.200000 50.031429 45.890427
std 5.067477 4.974319 23.441598 12.654792 2.584411 5.614510
min 0.000000 21.400000 0.000000 0.000000 46.270000 0.000000
25% 43.420000 42.977500 0.000000 43.435000 48.075000 43.020000
50% 46.980000 47.750000 0.000000 45.020000 51.180000 46.260000
75% 50.180000 50.600000 12.030000 47.632500 51.685000 49.430000
max 61.580000 66.680000 53.570000 51.800000 53.250000 57.880000
20k count 7333.000000 6412.000000 12.000000 14.000000 7.000000 2386.000000
mean 93.929491 94.253910 8.068333 84.792143 102.022857 93.681865
std 10.114733 10.079694 27.949527 25.409862 5.972176 10.072741
min 38.230000 44.380000 0.000000 0.000000 92.820000 0.000000
25% 87.120000 85.950000 0.000000 87.247500 98.875000 87.412500
50% 94.370000 95.890000 0.000000 90.400000 102.270000 94.230000
75% 101.030000 101.600000 0.000000 95.192500 105.610000 100.845000
max 123.180000 131.720000 96.820000 103.920000 110.100000 121.070000
25k count 7333.000000 6412.000000 12.000000 14.000000 7.000000 2386.000000
mean 117.848578 118.205056 0.000000 31.868571 130.341429 119.219489
std 12.791647 12.783019 0.000000 52.296443 9.249017 12.165837
min 48.320000 56.050000 0.000000 0.000000 115.920000 0.000000
25% 109.220000 107.715000 0.000000 0.000000 126.725000 110.872500
50% 118.320000 120.260000 0.000000 0.000000 129.100000 119.680000
75% 126.850000 127.470000 0.000000 82.575000 134.945000 128.300000
max 153.480000 163.620000 0.000000 113.350000 144.030000 158.550000
30k count 7333.000000 6412.000000 12.000000 14.000000 7.000000 2386.000000
mean 142.773846 143.102483 0.000000 141.996429 161.281429 146.968906
std 15.661068 15.694454 0.000000 13.742250 13.225840 16.564537
min 59.450000 68.220000 0.000000 113.600000 139.300000 0.000000
25% 132.220000 130.420000 0.000000 134.082500 156.000000 136.330000
50% 143.230000 145.520000 0.000000 139.625000 161.870000 147.915000
... ... ... ... ... ... ... ...
half std 10.678470 10.639831 29.508372 26.845829 7.034143 10.464197
min 40.270000 46.720000 0.000000 0.000000 97.920000 0.000000
25% 91.900000 90.645000 0.000000 92.120000 104.490000 92.305000
50% 99.530000 101.180000 0.000000 95.275000 107.750000 99.560000
75% 106.600000 107.180000 0.000000 100.762500 112.740000 106.520000
max 129.880000 138.670000 102.220000 109.670000 117.550000 127.050000
official count 7333.000000 6412.000000 12.000000 14.000000 7.000000 2386.000000
mean 205.460250 205.273537 220.857500 204.827143 236.570000 224.082938
std 22.892748 22.731726 19.680258 20.528396 23.676290 22.654757
min 85.530000 97.580000 173.580000 162.000000 195.030000 114.830000
25% 190.280000 187.837500 212.997500 191.492500 225.100000 207.672500
50% 205.850000 208.210000 225.400000 205.635000 242.100000 226.035000
75% 222.080000 221.600000 231.710000 219.295000 251.140000 240.082500
max 269.550000 278.000000 245.450000 238.220000 266.380000 284.230000
overall count 7333.000000 6412.000000 12.000000 14.000000 7.000000 2386.000000
mean 7864.603982 7829.390674 11460.166667 7637.428571 14046.285714 11750.388935
std 4834.454210 4911.403729 4570.114716 4612.962167 4648.163498 4783.808730
min 1.000000 1.000000 1114.000000 346.000000 4739.000000 34.000000
25% 3790.000000 3326.000000 9272.750000 4024.000000 12607.000000 7786.000000
50% 7326.000000 7938.000000 12716.000000 7252.000000 16397.000000 12868.000000
75% 11858.000000 11740.250000 14245.000000 11069.250000 17210.000000 16096.250000
max 17568.000000 17589.000000 16825.000000 15721.000000 17554.000000 17598.000000
pace count 7333.000000 6412.000000 12.000000 14.000000 7.000000 2386.000000
mean 7.844473 7.837257 8.430833 7.820000 9.030000 8.554849
std 0.873348 0.867035 0.749405 0.783768 0.901924 0.864270
min 3.270000 3.730000 6.630000 6.180000 7.450000 4.380000
25% 7.270000 7.170000 8.127500 7.320000 8.590000 7.930000
50% 7.870000 7.950000 8.600000 7.850000 9.250000 8.630000
75% 8.480000 8.470000 8.840000 8.367500 9.580000 9.170000
max 10.280000 10.620000 9.370000 9.100000 10.170000 10.850000

120 rows × 6 columns


In [63]:
# Project the scaled features onto their first two principal components.
X_pca = PCA(2).fit_transform(X_tr_std)

# Cluster the projection with the 6 clusters chosen for spectral clustering
# in this section (the plot previously used 3, the K-Means setting), seeded
# for reproducibility.
y_pred = SpectralClustering(n_clusters=6, random_state=42).fit_predict(X_pca)

# Scatter the projection, coloured by cluster assignment.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('Spectral clustering on the first two principal components')
plt.show()


Mean Shift


In [24]:
# Here we set the bandwidth. This function automatically derives a bandwidth
# number based on an inspection of the distances among points in the data.
bandwidth = estimate_bandwidth(X_tr_std, quantile=0.9)

# Declare and fit the model.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(X_tr_std)

# Extract cluster assignments for each data point.
labels = ms.labels_

# Coordinates of the cluster centers.
cluster_centers = ms.cluster_centers_

# Count our clusters.
n_clusters_ = len(np.unique(labels))

# Attach the labels (and capitalised gender / overall columns) to the data.
# Note that this overwrites any 'clusters' column written by another model.
X_tr['clusters'] = labels
X_tr['Gender'] = boston_marathon_scores.gender
X_tr['Overall'] = boston_marathon_scores.overall

# Register the added columns only once: a plain extend() appended duplicates
# every time this (or another clustering) cell was executed.
clmns += [col for col in ['clusters', 'Gender', 'Overall'] if col not in clmns]

# Report the number of clusters found and the per-cluster column means.
print("Number of estimated clusters: {}".format(n_clusters_))
X_tr.groupby(['clusters']).mean()


Number of estimated clusters: 18
Out[24]:
25k age 10k gender half official overall pace 30k 5k 20k 35k 40k Gender Overall
clusters
0 118.195629 41.634089 46.729067 0.599007 99.225182 208.131799 8423.401304 7.946343 143.564142 23.349711 94.047516 169.965941 196.538315 0.599007 8423.401304
1 123.384286 45.357143 0.000000 0.571429 103.810000 216.680000 10269.571429 8.272857 149.630714 17.516429 98.415000 177.007857 204.820000 0.571429 10269.571429
2 0.000000 39.500000 0.000000 0.500000 0.000000 219.632500 11138.875000 8.385000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000 11138.875000
3 0.000000 41.750000 45.547500 0.750000 96.765000 205.925000 8088.750000 7.860000 142.660000 22.890000 91.538750 168.860000 194.555000 0.750000 8088.750000
4 130.341429 45.428571 50.031429 0.571429 108.240000 236.570000 14046.285714 9.030000 161.281429 21.010000 102.022857 0.000000 223.585714 0.571429 14046.285714
5 111.540000 39.500000 44.425000 0.750000 93.897500 193.430000 4521.000000 7.387500 135.355000 22.262500 89.032500 159.577500 0.000000 0.750000 4521.000000
6 119.052500 46.750000 46.847500 0.500000 99.985000 214.370000 9688.500000 8.185000 0.000000 23.267500 94.575000 173.530000 202.502500 0.500000 9688.500000
7 115.956667 41.000000 45.010000 0.333333 0.000000 205.583333 8294.333333 7.856667 141.570000 22.523333 91.533333 167.580000 193.983333 0.333333 8294.333333
8 121.625000 41.500000 48.775000 0.500000 102.750000 205.265000 8105.500000 7.835000 145.960000 24.650000 0.000000 170.645000 194.890000 0.500000 8105.500000
9 0.000000 31.000000 0.000000 1.000000 0.000000 228.380000 13438.000000 8.720000 154.670000 24.530000 0.000000 184.430000 214.800000 1.000000 13438.000000
10 0.000000 36.000000 48.720000 1.000000 104.000000 218.080000 10692.000000 8.330000 150.580000 24.170000 98.650000 0.000000 0.000000 1.000000 10692.000000
11 115.100000 41.000000 0.000000 0.000000 97.100000 196.080000 4939.000000 7.480000 138.730000 22.750000 0.000000 162.550000 185.830000 0.000000 4939.000000
12 0.000000 23.000000 53.570000 0.000000 0.000000 225.400000 12717.000000 8.600000 0.000000 26.870000 0.000000 0.000000 0.000000 0.000000 12717.000000
13 0.000000 65.000000 0.000000 1.000000 0.000000 216.980000 10373.000000 8.280000 0.000000 0.000000 0.000000 0.000000 202.370000 1.000000 10373.000000
14 0.000000 43.000000 48.120000 0.000000 102.220000 214.130000 9584.000000 8.170000 0.000000 24.200000 96.820000 0.000000 0.000000 0.000000 9584.000000
15 0.000000 57.000000 53.550000 1.000000 0.000000 225.400000 12715.000000 8.600000 0.000000 26.870000 0.000000 0.000000 0.000000 1.000000 12715.000000
16 121.720000 29.000000 46.920000 0.000000 100.950000 221.850000 11810.000000 8.470000 150.850000 0.000000 0.000000 181.100000 209.350000 0.000000 11810.000000
17 0.000000 46.000000 0.000000 0.000000 0.000000 228.300000 13395.000000 8.720000 0.000000 26.000000 0.000000 0.000000 0.000000 0.000000 13395.000000

In [21]:
# Descriptive statistics of every column within each cluster, transposed so
# that clusters become columns and (feature, statistic) pairs become rows.
per_cluster = X_tr.groupby(['clusters'])
clusters_summary = per_cluster.describe()
clusters_summary_transposed = clusters_summary.T
clusters_summary_transposed


Out[21]:
clusters 0 1 2 3 4 5 6 7 8 9 ... 12 13 14 15 16 17 18 19 20 21
10k count 16105.000000 10.000000 8.000000 7.000000 4.000000 4.000000 4.000000 3.000000 3.000000 3.000000 ... 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
mean 46.729067 0.000000 0.000000 45.501429 44.425000 46.847500 0.000000 45.010000 50.280000 49.400000 ... 0.00 48.72 0.00 51.18 53.57 0.00 48.12 53.55 46.92 0.00
std 4.914671 0.000000 0.000000 4.923442 1.081558 2.355184 0.000000 6.085294 3.604345 2.386608 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min 18.030000 0.000000 0.000000 36.900000 43.170000 43.580000 0.000000 38.630000 46.270000 47.420000 ... 0.00 48.72 0.00 51.18 53.57 0.00 48.12 53.55 46.92 0.00
25% 43.220000 0.000000 0.000000 43.485000 43.965000 45.905000 0.000000 42.140000 48.795000 48.075000 ... 0.00 48.72 0.00 51.18 53.57 0.00 48.12 53.55 46.92 0.00
50% 47.170000 0.000000 0.000000 45.420000 44.365000 47.555000 0.000000 45.650000 51.320000 48.730000 ... 0.00 48.72 0.00 51.18 53.57 0.00 48.12 53.55 46.92 0.00
75% 50.280000 0.000000 0.000000 48.710000 44.825000 48.497500 0.000000 48.200000 52.285000 50.390000 ... 0.00 48.72 0.00 51.18 53.57 0.00 48.12 53.55 46.92 0.00
max 66.680000 0.000000 0.000000 51.800000 45.800000 48.700000 0.000000 50.750000 53.250000 52.050000 ... 0.00 48.72 0.00 51.18 53.57 0.00 48.12 53.55 46.92 0.00
20k count 16105.000000 10.000000 8.000000 7.000000 4.000000 4.000000 4.000000 3.000000 3.000000 3.000000 ... 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
mean 94.047516 98.301000 0.000000 91.482857 89.032500 94.575000 98.700000 91.533333 101.730000 101.850000 ... 0.00 98.65 0.00 103.42 0.00 0.00 96.82 0.00 0.00 0.00
std 9.963129 7.606121 0.000000 9.712080 1.795223 4.755562 5.907137 14.224072 8.652647 5.566229 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min 38.230000 81.900000 0.000000 74.980000 86.830000 87.920000 91.350000 76.570000 92.820000 96.770000 ... 0.00 98.65 0.00 103.42 0.00 0.00 96.82 0.00 0.00 0.00
25% 86.770000 94.292500 0.000000 86.975000 88.082500 92.877500 95.700000 84.860000 97.545000 98.875000 ... 0.00 98.65 0.00 103.42 0.00 0.00 96.82 0.00 0.00 0.00
50% 94.870000 98.775000 0.000000 91.630000 89.135000 95.855000 99.110000 93.150000 102.270000 100.980000 ... 0.00 98.65 0.00 103.42 0.00 0.00 96.82 0.00 0.00 0.00
75% 101.270000 104.217500 0.000000 97.950000 90.085000 97.552500 102.110000 99.015000 106.185000 104.390000 ... 0.00 98.65 0.00 103.42 0.00 0.00 96.82 0.00 0.00 0.00
max 131.720000 106.870000 0.000000 103.920000 91.030000 98.670000 105.230000 104.880000 110.100000 107.800000 ... 0.00 98.65 0.00 103.42 0.00 0.00 96.82 0.00 0.00 0.00
25k count 16105.000000 10.000000 8.000000 7.000000 4.000000 4.000000 4.000000 3.000000 3.000000 3.000000 ... 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
mean 118.195629 123.299000 0.000000 0.000000 111.540000 119.052500 123.597500 115.956667 127.806667 132.933333 ... 0.00 0.00 115.10 130.17 0.00 0.00 0.00 0.00 121.72 0.00
std 12.674378 9.617818 0.000000 0.000000 1.605138 6.217853 7.188743 19.538911 11.900022 9.761825 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min 48.320000 102.830000 0.000000 0.000000 110.100000 110.420000 114.680000 95.570000 115.920000 125.670000 ... 0.00 0.00 115.10 130.17 0.00 0.00 0.00 0.00 121.72 0.00
25% 108.920000 118.290000 0.000000 0.000000 110.235000 116.802500 119.885000 106.675000 121.850000 127.385000 ... 0.00 0.00 115.10 130.17 0.00 0.00 0.00 0.00 121.72 0.00
50% 119.180000 123.420000 0.000000 0.000000 111.355000 120.555000 124.095000 117.780000 127.780000 129.100000 ... 0.00 0.00 115.10 130.17 0.00 0.00 0.00 0.00 121.72 0.00
75% 127.330000 131.315000 0.000000 0.000000 112.660000 122.805000 127.807500 126.150000 133.750000 136.565000 ... 0.00 0.00 115.10 130.17 0.00 0.00 0.00 0.00 121.72 0.00
max 163.620000 134.000000 0.000000 0.000000 113.350000 124.680000 131.520000 134.520000 139.720000 144.030000 ... 0.00 0.00 115.10 130.17 0.00 0.00 0.00 0.00 121.72 0.00
30k count 16105.000000 10.000000 8.000000 7.000000 4.000000 4.000000 4.000000 3.000000 3.000000 3.000000 ... 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
mean 143.564142 149.453000 0.000000 139.261429 135.355000 0.000000 150.075000 141.570000 154.596667 169.233333 ... 154.67 150.58 138.73 157.48 0.00 0.00 0.00 0.00 150.85 0.00
std 15.677899 11.785741 0.000000 15.166610 2.146012 0.000000 8.655736 25.491828 15.335144 10.977169 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min 59.450000 125.330000 0.000000 113.600000 132.180000 0.000000 139.150000 115.080000 139.300000 161.870000 ... 154.67 150.58 138.73 157.48 0.00 0.00 0.00 0.00 150.85 0.00
25% 132.200000 142.905000 0.000000 131.375000 135.097500 0.000000 145.285000 129.390000 146.910000 162.925000 ... 154.67 150.58 138.73 157.48 0.00 0.00 0.00 0.00 150.85 0.00
50% 144.730000 149.165000 0.000000 142.350000 136.170000 0.000000 151.375000 143.700000 154.520000 163.980000 ... 154.67 150.58 138.73 157.48 0.00 0.00 0.00 0.00 150.85 0.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
half std 10.521256 8.028551 0.000000 10.297575 1.800748 5.022838 6.131479 0.000000 9.187667 7.857394 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min 40.270000 86.330000 0.000000 79.100000 91.720000 92.820000 96.480000 0.000000 97.920000 102.280000 ... 0.00 104.00 97.10 109.20 0.00 0.00 102.22 0.00 100.95 0.00
25% 91.550000 99.520000 0.000000 91.685000 92.920000 98.520000 100.980000 0.000000 102.835000 104.490000 ... 0.00 104.00 97.10 109.20 0.00 0.00 102.22 0.00 100.95 0.00
50% 100.100000 104.130000 0.000000 96.680000 93.970000 101.450000 104.500000 0.000000 107.750000 106.700000 ... 0.00 104.00 97.10 109.20 0.00 0.00 102.22 0.00 100.95 0.00
75% 106.850000 109.995000 0.000000 103.450000 94.947500 102.915000 107.615000 0.000000 112.015000 112.125000 ... 0.00 104.00 97.10 109.20 0.00 0.00 102.22 0.00 100.95 0.00
max 138.670000 112.700000 0.000000 109.670000 95.930000 104.220000 110.900000 0.000000 116.280000 117.550000 ... 0.00 104.00 97.10 109.20 0.00 0.00 102.22 0.00 100.95 0.00
official count 16105.000000 10.000000 8.000000 7.000000 4.000000 4.000000 4.000000 3.000000 3.000000 3.000000 ... 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
mean 208.131799 217.303000 219.632500 201.311429 193.430000 214.370000 215.122500 205.583333 221.903333 254.120000 ... 228.38 218.08 196.08 227.92 225.40 216.98 214.13 225.40 221.85 228.30
std 23.747320 15.328319 24.220754 21.732860 8.019406 19.050727 10.866865 37.232860 26.686994 12.141779 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min 85.530000 194.170000 173.580000 162.000000 184.400000 194.050000 199.900000 167.000000 195.030000 242.100000 ... 228.38 218.08 196.08 227.92 225.40 216.98 214.13 225.40 221.85 228.30
25% 191.700000 205.922500 207.037500 191.105000 189.672500 202.090000 211.112500 187.725000 208.655000 247.990000 ... 228.38 218.08 196.08 227.92 225.40 216.98 214.13 225.40 221.85 228.30
50% 209.220000 215.690000 227.515000 207.500000 192.775000 212.925000 218.310000 208.450000 222.280000 253.880000 ... 228.38 218.08 196.08 227.92 225.40 216.98 214.13 225.40 221.85 228.30
75% 225.200000 226.117500 234.637500 216.025000 196.532500 225.205000 222.320000 224.875000 235.340000 260.130000 ... 228.38 218.08 196.08 227.92 225.40 216.98 214.13 225.40 221.85 228.30
max 284.230000 242.780000 245.450000 225.420000 203.770000 237.580000 223.970000 241.300000 248.400000 266.380000 ... 228.38 218.08 196.08 227.92 225.40 216.98 214.13 225.40 221.85 228.30
overall count 16105.000000 10.000000 8.000000 7.000000 4.000000 4.000000 4.000000 3.000000 3.000000 3.000000 ... 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
mean 8423.401304 10399.700000 11138.875000 6998.428571 4521.000000 9688.500000 9944.250000 8294.333333 11245.333333 17092.666667 ... 13438.00 10692.00 4939.00 13310.00 12717.00 10373.00 9584.00 12715.00 11810.00 13395.00
std 5052.028950 3920.898283 5586.997518 4438.780270 1672.116423 4895.439272 2959.093484 7857.198250 6203.282223 613.063075 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min 1.000000 4559.000000 1114.000000 346.000000 2759.000000 4534.000000 5825.000000 580.000000 4739.000000 16397.000000 ... 13438.00 10692.00 4939.00 13310.00 12717.00 10373.00 9584.00 12715.00 11810.00 13395.00
25% 4055.000000 7360.250000 7676.500000 3945.500000 3701.000000 6421.000000 8801.750000 4298.000000 8321.500000 16862.000000 ... 13438.00 10692.00 4939.00 13310.00 12717.00 10373.00 9584.00 12715.00 11810.00 13395.00
50% 8241.000000 10063.000000 13220.000000 7741.000000 4281.000000 9317.500000 10793.500000 8016.000000 11904.000000 17327.000000 ... 13438.00 10692.00 4939.00 13310.00 12717.00 10373.00 9584.00 12715.00 11810.00 13395.00
75% 12655.000000 12885.500000 14851.000000 10145.500000 5101.000000 12585.000000 11936.000000 12151.500000 14498.500000 17440.500000 ... 13438.00 10692.00 4939.00 13310.00 12717.00 10373.00 9584.00 12715.00 11810.00 13395.00
max 17598.000000 16503.000000 16825.000000 12720.000000 6763.000000 15585.000000 12365.000000 16287.000000 17093.000000 17554.000000 ... 13438.00 10692.00 4939.00 13310.00 12717.00 10373.00 9584.00 12715.00 11810.00 13395.00
pace count 16105.000000 10.000000 8.000000 7.000000 4.000000 4.000000 4.000000 3.000000 3.000000 3.000000 ... 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
mean 7.946343 8.297000 8.385000 7.682857 7.387500 8.185000 8.212500 7.856667 8.470000 9.700000 ... 8.72 8.33 7.48 8.70 8.60 8.28 8.17 8.60 8.47 8.72
std 0.905882 0.584524 0.922109 0.827642 0.309556 0.721503 0.416203 1.423388 1.015037 0.460326 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min 3.270000 7.420000 6.630000 6.180000 7.030000 7.420000 7.630000 6.380000 7.450000 9.250000 ... 8.72 8.33 7.48 8.70 8.60 8.28 8.17 8.60 8.47 8.72
25% 7.320000 7.860000 7.905000 7.300000 7.247500 7.720000 8.057500 7.175000 7.965000 9.465000 ... 8.72 8.33 7.48 8.70 8.60 8.28 8.17 8.60 8.47 8.72
50% 7.980000 8.230000 8.680000 7.920000 7.370000 8.125000 8.335000 7.970000 8.480000 9.680000 ... 8.72 8.33 7.48 8.70 8.60 8.28 8.17 8.60 8.47 8.72
75% 8.600000 8.640000 8.960000 8.240000 7.510000 8.590000 8.490000 8.595000 8.980000 9.925000 ... 8.72 8.33 7.48 8.70 8.60 8.28 8.17 8.60 8.47 8.72
max 10.850000 9.270000 9.370000 8.600000 7.780000 9.070000 8.550000 9.220000 9.480000 10.170000 ... 8.72 8.33 7.48 8.70 8.60 8.28 8.17 8.60 8.47 8.72

120 rows × 22 columns


In [25]:
# Project the scaled features onto their first two principal components.
X_pca = PCA(2).fit_transform(X_tr_std)

# Estimate the bandwidth on the data that is actually clustered here: the
# 2-D projection. A bandwidth derived from the 11-dimensional X_tr_std
# reflects distances in a different space than the one MeanShift runs in.
bandwidth = estimate_bandwidth(X_pca, quantile=0.9)
y_pred = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(X_pca)

# Scatter the projection, coloured by cluster assignment.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.title('Mean shift on the first two principal components')
plt.show()



In [17]:
# Declare the model and fit it in one statement.
# Note that you can provide arguments to the model, but we didn't.
af = AffinityPropagation().fit(X_tr_std)
print('Done')

# Pull the number of clusters and cluster assignments for each data point.
cluster_centers_indices = af.cluster_centers_indices_
n_clusters_ = len(cluster_centers_indices)
labels = af.labels_

# Attach the labels (and capitalised gender / overall columns) to the data.
# Note that this overwrites any 'clusters' column written by another model.
X_tr['clusters'] = labels
X_tr['Gender'] = boston_marathon_scores.gender
X_tr['Overall'] = boston_marathon_scores.overall

# Register the added columns only once: a plain extend() appended duplicates
# every time this (or another clustering) cell was executed.
clmns += [col for col in ['clusters', 'Gender', 'Overall'] if col not in clmns]

# Report the number of clusters found and the per-cluster column means.
print("Number of estimated clusters: {}".format(n_clusters_))
X_tr.groupby(['clusters']).mean()


Done
Number of estimated clusters: 251
Out[17]:
25k age 10k gender half official overall pace 30k 5k 20k 35k 40k Gender Overall
clusters
0 51.880000 43.000000 19.148000 1.000000 43.080000 92.754000 9.400000 3.542000 63.870000 9.226000 40.898000 76.494000 87.242000 1.000000 9.400000
1 58.814286 38.142857 21.591429 0.857143 48.728571 107.681429 22.428571 4.115714 73.342857 10.270000 46.268571 88.742857 101.232857 0.857143 22.428571
2 79.596667 42.555556 29.865556 0.888889 65.995556 144.394444 3486.333333 5.512222 99.325556 14.605556 62.567778 119.272222 136.260000 0.888889 3486.333333
3 51.017500 49.500000 19.212500 1.000000 42.605000 90.327500 7.750000 3.452500 62.542500 9.290000 40.477500 74.807500 85.082500 1.000000 7.750000
4 57.156000 41.000000 21.306000 0.800000 47.652000 101.820000 20.400000 3.888000 70.392000 10.136000 45.276000 84.248000 95.932000 0.800000 20.400000
5 131.848333 45.000000 44.376667 0.833333 99.755000 233.056667 13954.666667 8.898333 164.263333 22.131667 93.786667 193.041667 221.220000 0.833333 13954.666667
6 0.000000 37.000000 43.683333 0.666667 92.746667 192.650000 5635.333333 7.353333 133.676667 21.993333 87.916667 157.470000 181.750000 0.666667 5635.333333
7 105.182083 60.541667 41.791667 0.958333 88.628333 180.174167 2933.500000 6.880833 126.957083 20.949583 84.047500 148.961667 170.547500 0.958333 2933.500000
8 110.972500 58.535714 44.063214 0.964286 93.403929 191.054107 4668.089286 7.294643 134.142857 22.101607 88.561964 157.621964 180.764821 0.964286 4668.089286
9 122.701395 46.829457 48.009457 0.550388 102.830310 216.295736 10391.356589 8.258295 149.411240 23.974109 97.425504 177.058992 204.317364 0.550388 10391.356589
10 114.659000 44.500000 46.056000 1.000000 96.658000 220.956000 11392.900000 8.436000 139.021000 23.197000 91.650000 165.798000 202.483000 1.000000 11392.900000
11 116.124444 57.444444 45.626905 0.968254 97.453889 203.575079 7514.071429 7.771667 141.069206 22.821667 92.328492 166.788968 192.265635 0.968254 7514.071429
12 116.609249 41.680751 46.546667 0.553991 98.295634 199.267042 6533.812207 7.608404 140.593568 23.372300 93.217887 164.852676 188.750423 0.553991 6533.812207
13 116.692391 50.347826 46.594420 0.891304 98.382826 198.714420 6150.724638 7.587681 140.623551 23.361957 93.301739 164.740580 188.360217 0.891304 6150.724638
14 110.578000 26.800000 42.462000 0.800000 91.928000 248.208000 16283.800000 9.476000 138.808000 21.234000 87.094000 178.784000 229.872000 0.800000 16283.800000
15 107.839615 33.692308 42.428846 0.923077 90.321538 215.145769 9724.230769 8.215769 132.647308 21.296154 85.590769 164.268846 201.421154 0.923077 9724.230769
16 106.675000 32.000000 42.140000 0.500000 0.000000 187.725000 4298.000000 7.175000 129.390000 21.320000 84.860000 153.185000 176.985000 0.500000 4298.000000
17 117.098605 51.232558 46.693256 1.000000 98.542326 216.117209 10320.046512 8.252093 142.068140 23.506977 93.430698 169.548837 201.450465 1.000000 10320.046512
18 115.245541 32.254777 46.116561 0.375796 97.101783 198.944331 6562.611465 7.595096 139.224395 23.185987 92.081783 163.764140 188.216815 0.375796 6562.611465
19 119.309057 58.490566 46.152642 0.962264 99.569434 217.951887 10739.622642 8.322075 146.331698 22.955849 94.247736 175.279057 204.732830 0.962264 10739.622642
20 113.050000 27.666667 43.616667 0.333333 92.996667 236.006667 14524.666667 9.003333 138.673333 22.046667 87.990000 195.016667 224.593333 0.333333 14524.666667
21 122.548047 35.875000 48.223516 0.171875 102.752734 213.003359 9596.625000 8.132891 148.813125 24.104688 97.376250 175.530859 201.691875 0.171875 9596.625000
22 132.750000 36.000000 45.300000 1.000000 94.820000 229.080000 13647.000000 8.750000 174.520000 22.450000 89.570000 197.070000 219.620000 1.000000 13647.000000
23 97.650000 24.000000 39.220000 1.000000 82.270000 241.620000 16324.000000 9.220000 119.330000 19.830000 78.030000 161.770000 230.050000 1.000000 16324.000000
24 117.728293 23.341463 46.793415 0.365854 98.858293 214.841463 9810.097561 8.201463 143.700976 23.520976 93.726585 171.802927 201.884146 0.365854 9810.097561
25 0.000000 38.000000 45.870000 1.000000 98.400000 238.220000 15721.000000 9.100000 166.450000 23.130000 91.930000 200.020000 227.600000 1.000000 15721.000000
26 112.653333 24.666667 44.510000 0.333333 94.826667 194.580000 5858.666667 7.433333 136.340000 0.000000 89.806667 160.466667 184.140000 0.333333 5858.666667
27 117.573675 38.524096 46.917771 0.415663 99.012952 205.115301 7771.156627 7.830783 142.267048 23.590542 93.907169 167.800482 193.653735 0.415663 7771.156627
28 120.913656 53.193548 47.172366 0.827957 100.997742 215.927527 10345.354839 8.243441 147.719462 23.581398 95.652043 175.846129 203.689140 0.827957 10345.354839
29 116.074000 34.800000 43.846000 0.800000 95.202000 236.736000 13539.300000 9.036000 148.700000 21.918000 89.999000 182.708000 221.378000 0.800000 13539.300000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
221 115.874184 42.673469 45.857653 0.755102 97.253163 208.741327 8321.234694 7.969082 141.165918 23.038878 92.159898 168.365306 196.411327 0.755102 8321.234694
222 119.938173 39.512690 47.774467 0.340102 101.015178 206.156599 8130.954315 7.871168 144.909239 24.011320 95.792183 170.282081 195.252843 0.340102 8130.954315
223 116.287197 33.560510 46.810127 0.433121 98.273376 195.656051 5794.585987 7.471210 139.674395 23.551911 93.252484 163.004713 185.785287 0.433121 5794.585987
224 121.720000 29.000000 46.920000 0.000000 100.950000 221.850000 11810.000000 8.470000 150.850000 0.000000 0.000000 181.100000 209.350000 0.000000 11810.000000
225 120.766744 53.534884 47.531395 0.976744 101.120698 228.733256 12893.976744 8.733256 148.155116 23.808140 95.796047 179.166047 213.889767 0.976744 12893.976744
226 121.458417 50.891667 47.504750 0.800000 101.566167 216.118167 10132.900000 8.250083 148.209250 23.753583 96.198333 176.190750 203.928000 0.800000 10132.900000
227 117.549773 56.363636 47.297955 0.931818 99.253864 198.214091 6252.522727 7.567045 141.340682 23.769318 94.167500 165.119773 188.182727 0.931818 6252.522727
228 117.201768 42.939024 47.097378 0.615854 98.972073 198.559817 6428.896341 7.581159 140.981585 23.648780 93.886341 164.873598 188.298110 0.615854 6428.896341
229 114.603256 33.511628 45.149535 0.627907 95.881395 215.327209 9778.116279 8.221395 140.828372 22.634419 90.840233 170.989535 202.097442 0.627907 9778.116279
230 120.292027 35.040541 47.345811 0.364865 100.658784 218.332568 10687.702703 8.336351 147.319459 23.700811 95.357973 176.417568 205.741081 0.364865 10687.702703
231 115.883310 46.151724 46.502207 0.779310 97.809379 197.544069 6012.165517 7.542138 139.571517 23.396552 92.789172 163.532000 187.127586 0.779310 6012.165517
232 123.013571 42.959184 47.685510 0.357143 102.650102 219.902347 11153.693878 8.394592 150.351633 23.767551 97.196939 178.971122 207.417449 0.357143 11153.693878
233 117.870411 49.013699 46.665753 0.787671 99.012329 207.639795 8161.253425 7.927740 143.132055 23.396027 93.852260 169.464932 195.891507 0.787671 8161.253425
234 119.457500 56.125000 45.772083 1.000000 98.970000 228.516667 12796.125000 8.722917 148.905417 22.817917 93.666667 182.617500 214.803333 1.000000 12796.125000
235 123.151373 37.441176 47.796176 0.235294 102.759804 218.981765 11053.411765 8.360294 150.522647 23.818333 97.319314 179.113529 206.917059 0.235294 11053.411765
236 114.700556 56.611111 45.436667 1.000000 96.422778 215.272778 10009.222222 8.219444 139.328889 22.772778 91.368889 166.620556 200.129444 1.000000 10009.222222
237 0.000000 46.000000 0.000000 0.000000 0.000000 228.300000 13395.000000 8.720000 0.000000 26.000000 0.000000 0.000000 0.000000 0.000000 13395.000000
238 120.149014 40.183099 46.546479 0.577465 99.976620 222.242394 11449.352113 8.484789 148.167606 23.214225 94.603099 178.832254 209.181972 0.577465 11449.352113
239 120.591377 38.224638 47.550217 0.326087 101.219928 209.267536 8897.311594 7.989783 146.235797 23.767101 95.937899 172.488841 198.126594 0.326087 8897.311594
240 117.789706 53.529412 46.660000 0.931373 98.959314 208.923824 8509.313725 7.976176 143.038824 23.407255 93.793725 169.537647 196.661961 0.931373 8509.313725
241 121.667470 57.385542 47.188554 0.891566 101.581446 216.733373 10764.903614 8.274578 148.893494 23.555542 96.194096 177.106747 204.573614 0.891566 10764.903614
242 121.539621 47.992424 47.755152 0.568182 101.896212 212.162879 9277.303030 8.100530 147.643333 23.879167 96.577803 174.401212 200.674167 0.568182 9277.303030
243 122.611000 46.266667 46.400667 0.900000 101.238667 237.035000 14403.333333 9.047667 153.571000 23.088000 95.542000 187.626667 222.534000 0.900000 14403.333333
244 110.205588 57.647059 43.587059 0.970588 92.563529 197.367647 6159.823529 7.535000 133.829118 21.824412 87.695000 158.894706 185.215882 0.970588 6159.823529
245 126.345634 43.119718 49.374577 0.204225 105.745141 220.125986 11368.014085 8.404225 153.622254 24.638451 100.189718 181.463310 208.426056 0.204225 11368.014085
246 118.554913 40.861272 47.136185 0.456647 99.796994 203.899538 7458.953757 7.785202 143.264509 23.621098 94.629422 168.359884 193.076243 0.456647 7458.953757
247 118.770707 46.555556 46.697778 0.787879 99.523131 214.228586 9672.101010 8.178182 145.048687 23.409798 94.291313 173.045960 201.635152 0.787879 9672.101010
248 134.520000 59.000000 50.750000 0.000000 0.000000 241.300000 16287.000000 9.220000 165.930000 24.930000 104.880000 196.370000 227.980000 0.000000 16287.000000
249 113.978571 52.714286 45.228571 0.857143 95.975714 197.030000 5447.285714 7.524286 137.797143 0.000000 90.990000 161.990000 186.204286 0.857143 5447.285714
250 121.839065 53.582734 47.941583 0.863309 102.317410 211.594604 9116.100719 8.079353 147.807842 23.948417 96.948921 174.247194 200.225755 0.863309 9116.100719

251 rows × 15 columns


In [18]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of every
# column of X_tr, computed separately for each value of 'clusters'.
clusters_summary = X_tr.groupby(['clusters']).describe()

# Flip the table so that clusters run across the columns and each
# (column, statistic) pair becomes a row.
clusters_summary_transposed = clusters_summary.T
clusters_summary_transposed


Out[18]:
clusters 0 1 2 3 4 5 6 7 8 9 ... 241 242 243 244 245 246 247 248 249 250
10k count 5.000000 7.000000 9.000000 4.000000 5.000000 6.000000 3.000000 24.000000 56.000000 129.000000 ... 83.000000 132.000000 30.000000 34.000000 142.000000 173.000000 99.000000 1.00 7.000000 139.000000
mean 19.148000 21.591429 29.865556 19.212500 21.306000 44.376667 43.683333 41.791667 44.063214 48.009457 ... 47.188554 47.755152 46.400667 43.587059 49.374577 47.136185 46.697778 50.75 45.228571 47.941583
std 1.170543 3.465484 12.856572 1.027696 1.654004 4.063942 7.538954 4.458106 4.551551 5.290160 ... 5.457879 4.973370 3.627409 4.724065 4.781201 4.948415 4.171106 NaN 3.022590 4.054450
min 18.180000 18.200000 18.180000 18.030000 19.770000 37.730000 36.900000 32.980000 34.700000 23.570000 ... 20.150000 34.420000 39.230000 31.500000 32.370000 35.800000 31.630000 50.75 41.900000 35.620000
25% 18.180000 19.250000 19.780000 18.555000 19.770000 42.217500 39.625000 39.555000 40.620000 45.330000 ... 45.275000 44.565000 43.235000 40.682500 47.990000 42.820000 44.840000 50.75 43.605000 45.775000
50% 18.730000 20.780000 23.580000 19.250000 20.900000 46.300000 42.350000 41.060000 43.355000 48.250000 ... 48.520000 47.740000 47.125000 44.625000 50.255000 47.650000 46.670000 50.75 44.430000 48.200000
75% 19.750000 22.655000 41.500000 19.907500 22.970000 46.970000 47.075000 43.975000 47.327500 52.170000 ... 50.720000 52.127500 48.690000 46.482500 52.695000 50.980000 49.015000 50.75 45.890000 50.775000
max 20.900000 28.350000 52.350000 20.320000 23.120000 47.880000 51.800000 53.130000 54.970000 58.530000 ... 53.850000 57.720000 52.820000 53.020000 58.570000 57.180000 57.520000 50.75 51.280000 56.180000
20k count 5.000000 7.000000 9.000000 4.000000 5.000000 6.000000 3.000000 24.000000 56.000000 129.000000 ... 83.000000 132.000000 30.000000 34.000000 142.000000 173.000000 99.000000 1.00 7.000000 139.000000
mean 40.898000 46.268571 62.567778 40.477500 45.276000 93.786667 87.916667 84.047500 88.561964 97.425504 ... 96.194096 96.577803 95.542000 87.695000 100.189718 94.629422 94.291313 104.88 90.990000 96.948921
std 2.917074 8.611826 26.296880 1.778490 3.058648 9.439968 14.711704 9.085377 9.056660 10.828335 ... 10.803554 10.081225 6.874044 9.533775 9.702004 9.943687 8.389691 NaN 6.049554 8.271588
min 38.550000 38.800000 38.550000 38.230000 41.700000 77.970000 74.980000 66.630000 70.430000 47.930000 ... 43.880000 69.550000 84.430000 63.550000 65.520000 71.820000 63.920000 104.88 84.680000 72.320000
25% 38.570000 40.825000 42.020000 39.467500 43.870000 90.165000 79.915000 79.437500 82.132500 91.870000 ... 92.365000 89.860000 89.267500 81.367500 96.770000 85.950000 90.505000 104.88 87.405000 92.340000
50% 39.870000 43.900000 50.870000 40.825000 43.900000 96.315000 84.850000 82.325000 86.720000 98.430000 ... 98.450000 97.125000 97.185000 88.925000 101.775000 95.650000 93.930000 104.88 90.930000 97.150000
75% 42.050000 47.725000 84.580000 41.835000 47.930000 98.310000 94.385000 88.660000 94.820000 105.300000 ... 103.465000 105.085000 100.917500 93.815000 107.395000 102.680000 99.075000 104.88 91.720000 102.885000
max 45.450000 64.080000 110.870000 42.030000 48.980000 105.030000 103.920000 107.850000 111.450000 120.700000 ... 111.530000 116.230000 107.050000 107.180000 118.920000 114.650000 116.420000 104.88 103.070000 113.930000
25k count 5.000000 7.000000 9.000000 4.000000 5.000000 6.000000 3.000000 24.000000 56.000000 129.000000 ... 83.000000 132.000000 30.000000 34.000000 142.000000 173.000000 99.000000 1.00 7.000000 139.000000
mean 51.880000 58.814286 79.596667 51.017500 57.156000 131.848333 0.000000 105.182083 110.972500 122.701395 ... 121.667470 121.539621 122.611000 110.205588 126.345634 118.554913 118.770707 134.52 113.978571 121.839065
std 4.016118 11.045453 33.893913 2.058517 3.769235 12.763898 0.000000 11.447189 11.345336 13.686596 ... 13.691701 12.751737 8.995522 12.038437 12.342555 12.530305 10.600679 NaN 7.683835 10.436578
min 48.750000 49.870000 48.750000 48.320000 52.620000 110.950000 0.000000 83.470000 88.470000 59.800000 ... 55.600000 87.630000 107.850000 80.380000 82.180000 89.780000 80.380000 134.52 106.220000 91.020000
25% 48.750000 51.565000 53.080000 49.955000 55.600000 125.967500 0.000000 99.322500 103.230000 115.450000 ... 116.755000 113.032500 113.882500 102.190000 121.632500 107.530000 114.025000 134.52 109.435000 116.000000
50% 50.500000 55.600000 63.830000 51.560000 55.630000 135.205000 0.000000 103.090000 108.550000 124.330000 ... 124.200000 122.215000 124.415000 111.065000 128.290000 119.650000 118.300000 134.52 113.350000 121.930000
75% 53.070000 60.690000 107.280000 52.622500 59.850000 139.072500 0.000000 111.090000 119.062500 132.670000 ... 130.400000 132.077500 128.597500 117.840000 135.377500 128.500000 124.800000 134.52 115.065000 129.300000
max 58.330000 81.720000 142.680000 52.630000 62.080000 146.480000 0.000000 135.380000 139.830000 152.270000 ... 140.670000 146.370000 138.980000 134.420000 149.020000 144.330000 145.700000 134.52 129.280000 143.330000
30k count 5.000000 7.000000 9.000000 4.000000 5.000000 6.000000 3.000000 24.000000 56.000000 129.000000 ... 83.000000 132.000000 30.000000 34.000000 142.000000 173.000000 99.000000 1.00 7.000000 139.000000
mean 63.870000 73.342857 99.325556 62.542500 70.392000 164.263333 133.676667 126.957083 134.142857 149.411240 ... 148.893494 147.643333 153.571000 133.829118 153.622254 143.264509 145.048687 165.93 137.797143 147.807842
std 5.217461 14.766886 42.514617 2.234925 4.935693 14.671503 22.566183 14.021694 13.756113 16.710898 ... 16.851564 15.550967 11.652216 14.830801 14.983018 15.214577 13.007040 NaN 9.392784 12.755483
min 59.920000 61.350000 60.100000 59.450000 63.980000 137.020000 113.600000 100.530000 107.050000 73.300000 ... 68.180000 106.520000 133.630000 98.070000 100.000000 108.250000 98.370000 165.93 128.070000 110.350000
25% 59.920000 63.875000 65.970000 61.640000 68.000000 161.610000 121.465000 119.522500 125.080000 140.600000 ... 142.750000 136.942500 142.957500 123.702500 147.820000 129.750000 139.350000 165.93 132.535000 140.705000
50% 61.630000 68.180000 78.900000 63.185000 69.530000 168.615000 129.330000 124.540000 131.335000 151.420000 ... 152.100000 148.315000 154.650000 134.645000 156.040000 144.270000 144.270000 165.93 136.670000 148.230000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
half std 3.164830 8.989719 27.897963 1.840480 3.168465 9.724984 15.546177 9.582230 9.547910 11.433770 ... 11.458470 10.606785 7.256935 10.051022 10.237196 10.476615 8.844753 NaN 6.376530 8.737190
min 40.570000 40.930000 40.570000 40.270000 43.970000 82.530000 79.100000 70.300000 74.320000 50.280000 ... 46.180000 73.420000 89.370000 67.200000 69.100000 75.820000 67.500000 0.00 89.420000 76.350000
25% 40.570000 43.065000 44.180000 41.567500 46.200000 97.702500 84.285000 83.752500 86.692500 96.970000 ... 97.510000 94.867500 94.365000 85.845000 102.100000 90.630000 95.505000 0.00 92.150000 97.435000
50% 41.980000 46.180000 53.370000 42.985000 46.220000 102.050000 89.470000 86.815000 91.450000 104.000000 ... 103.930000 102.600000 102.725000 93.680000 107.375000 100.850000 99.030000 0.00 95.820000 102.480000
75% 44.200000 50.280000 89.320000 44.022500 50.270000 103.712500 99.570000 93.555000 100.055000 111.120000 ... 109.360000 110.877500 106.725000 98.972500 113.547500 108.300000 104.580000 0.00 96.785000 108.665000
max 48.080000 67.300000 117.320000 44.180000 51.600000 111.380000 109.670000 113.800000 117.580000 127.300000 ... 117.580000 122.580000 113.330000 113.070000 125.300000 120.950000 122.770000 0.00 108.720000 120.320000
official count 5.000000 7.000000 9.000000 4.000000 5.000000 6.000000 3.000000 24.000000 56.000000 129.000000 ... 83.000000 132.000000 30.000000 34.000000 142.000000 173.000000 99.000000 1.00 7.000000 139.000000
mean 92.754000 107.681429 144.394444 90.327500 101.820000 233.056667 192.650000 180.174167 191.054107 216.295736 ... 216.733373 212.162879 237.035000 197.367647 220.125986 203.899538 214.228586 241.30 197.030000 211.594604
std 7.062208 22.301293 60.885144 3.328116 7.491559 19.637942 31.763106 20.247115 19.496418 23.989609 ... 24.611155 22.066124 17.385564 21.897770 21.434793 21.522678 19.128276 NaN 12.999608 18.065614
min 87.200000 88.430000 88.320000 85.530000 91.780000 196.830000 162.000000 142.450000 152.370000 105.400000 ... 99.220000 153.130000 209.280000 144.150000 144.080000 153.820000 145.720000 241.30 183.650000 158.400000
25% 87.220000 93.185000 95.980000 89.542500 97.570000 229.877500 176.265000 168.720000 178.120000 203.680000 ... 207.010000 197.675000 219.907500 182.645000 211.797500 184.820000 205.300000 241.30 189.740000 200.880000
50% 90.020000 101.070000 114.520000 91.330000 102.100000 238.300000 190.530000 176.470000 186.600000 217.500000 ... 222.020000 213.035000 239.545000 199.660000 223.700000 205.550000 213.550000 241.30 194.670000 212.150000
75% 95.480000 112.785000 194.730000 92.115000 106.900000 243.295000 207.975000 191.415000 205.380000 233.470000 ... 232.150000 229.995000 249.167500 212.777500 236.080000 220.700000 225.400000 241.30 198.955000 224.515000
max 103.850000 152.330000 254.320000 93.120000 110.750000 252.880000 225.420000 233.900000 239.970000 269.550000 ... 248.580000 252.950000 266.120000 240.600000 260.750000 248.200000 262.570000 241.30 223.500000 248.680000
overall count 5.000000 7.000000 9.000000 4.000000 5.000000 6.000000 3.000000 24.000000 56.000000 129.000000 ... 83.000000 132.000000 30.000000 34.000000 142.000000 173.000000 99.000000 1.00 7.000000 139.000000
mean 9.400000 22.428571 3486.333333 7.750000 20.400000 13954.666667 5635.333333 2933.500000 4668.089286 10391.356589 ... 10764.903614 9277.303030 14403.333333 6159.823529 11368.014085 7458.953757 9672.101010 16287.00 5447.285714 9116.100719
std 8.473488 15.977663 6105.646260 4.991660 7.635444 4557.433123 6379.371860 3414.465774 4037.332361 4992.539485 ... 4655.468169 5046.808356 3247.004289 4242.884342 4650.684574 4859.636492 4176.880201 NaN 3199.037966 4228.294858
min 2.000000 5.000000 4.000000 1.000000 10.000000 5100.000000 346.000000 31.000000 103.000000 24.000000 ... 19.000000 110.000000 8261.000000 38.000000 37.000000 118.000000 45.000000 16287.00 2647.000000 236.000000
25% 3.000000 10.500000 15.000000 5.500000 16.000000 13803.000000 2093.000000 708.250000 1731.250000 6728.000000 ... 7646.000000 5291.000000 11226.000000 2510.500000 8929.500000 2812.000000 7180.500000 16287.00 3692.500000 6064.500000
50% 6.000000 20.000000 33.000000 9.000000 21.000000 15625.500000 3840.000000 1467.500000 3107.000000 10508.000000 ... 11844.000000 9269.000000 15993.500000 5772.000000 12270.500000 7243.000000 9421.000000 16287.00 4660.000000 9039.000000
75% 14.000000 31.000000 4673.000000 11.250000 26.000000 16563.750000 8280.000000 4011.750000 7200.750000 14654.000000 ... 14343.500000 13851.250000 17141.000000 9203.750000 15257.500000 11482.000000 12717.000000 16287.00 5602.500000 12505.500000
max 22.000000 49.000000 17338.000000 12.000000 29.000000 17305.000000 12720.000000 14768.000000 16083.000000 17568.000000 ... 17109.000000 17309.000000 17550.000000 16185.000000 17479.000000 17076.000000 17510.000000 16287.00 12234.000000 17115.000000
pace count 5.000000 7.000000 9.000000 4.000000 5.000000 6.000000 3.000000 24.000000 56.000000 129.000000 ... 83.000000 132.000000 30.000000 34.000000 142.000000 173.000000 99.000000 1.00 7.000000 139.000000
mean 3.542000 4.115714 5.512222 3.452500 3.888000 8.898333 7.353333 6.880833 7.294643 8.258295 ... 8.274578 8.100530 9.047667 7.535000 8.404225 7.785202 8.178182 9.22 7.524286 8.079353
std 0.272617 0.851545 2.321432 0.128679 0.287002 0.747942 1.211666 0.773844 0.742960 0.914664 ... 0.940157 0.840754 0.662777 0.836465 0.817842 0.820805 0.730255 NaN 0.493824 0.688615
min 3.330000 3.380000 3.380000 3.270000 3.500000 7.520000 6.180000 5.430000 5.820000 4.030000 ... 3.780000 5.850000 7.980000 5.500000 5.500000 5.880000 5.570000 9.22 7.020000 6.050000
25% 3.330000 3.560000 3.670000 3.420000 3.730000 8.772500 6.730000 6.447500 6.807500 7.780000 ... 7.905000 7.545000 8.392500 6.972500 8.090000 7.050000 7.830000 9.22 7.245000 7.670000
50% 3.430000 3.870000 4.370000 3.485000 3.900000 9.100000 7.280000 6.740000 7.125000 8.300000 ... 8.480000 8.135000 9.145000 7.620000 8.530000 7.850000 8.150000 9.22 7.430000 8.100000
75% 3.650000 4.310000 7.430000 3.517500 4.080000 9.292500 7.940000 7.307500 7.835000 8.920000 ... 8.865000 8.782500 9.515000 8.122500 9.020000 8.430000 8.610000 9.22 7.600000 8.575000
max 3.970000 5.820000 9.700000 3.570000 4.230000 9.650000 8.600000 8.930000 9.170000 10.280000 ... 9.480000 9.650000 10.150000 9.180000 9.950000 9.470000 10.020000 9.22 8.530000 9.500000

120 rows × 251 columns

Of all the clustering techniques that have been used (k-means, spectral, mean shift and affinity propagation), the ones that present the most stability in terms of the variance within the clusters are k-means and spectral clustering. When the mean shift bandwidth is estimated with a quantile close to 1 (0.9), the number of clusters obtained is reduced to 18 (from 58 when using quantile 0.25). In this case most of the clusters are nearly empty, which is why this clustering method has been discarded. The same applies to affinity propagation, as there are 251 clusters with less than 1% of the data in each of them.

With k-means and spectral clustering, each cluster contains between 1% and 43% of the data points. Of the two, k-means is the better one from a similarity-analysis perspective, as it achieves similar silhouette values with fewer clusters. For k-means, the elbow method suggests 3 clusters as the best solution, although the last cluster contains less than 1% of the data, so 2 clusters should be considered.

From the 3 clusters we can see that in the first cluster men finished the marathon quicker than women, with an average pace of 7.36, the official timings being lower in all cases. Additionally, in cluster one we see that men are younger than women, coming in at position 192 against 218 for women. In the third cluster men and women have a similar age, and there is also a difference of 20 positions from the first to the last person in the cluster, whose members are on average 48 years old.