In [133]:
%matplotlib inline

import cPickle
import matplotlib.pyplot as plt
import matplotlib
import mpld3, mpld3.plugins
import numpy as np
from collections import Counter
import pandas as pd
import seaborn

pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)

In [12]:
runners = cPickle.load(file("results/2013/results.pkl"))

In [3]:
runners["1"]


Out[3]:
{'10k': u'0:30:54',
 '20k': u'1:01:31',
 '25k': u'1:17:16',
 '30k': u'1:32:58',
 '35k': u'1:48:47',
 '40k': u'2:04:46',
 '5k': u'0:15:54',
 'age': u'30',
 'bib': u'1',
 'city': u'Kenya',
 'country': u'KEN',
 'ctz': u'',
 'division': u'5',
 'gender': u'M',
 'genderdiv': u'5',
 'half': u'1:04:54',
 'name': u'Korir, Wesley',
 'official': u'2:12:30',
 'overall': u'5',
 'pace': u'0:05:04',
 'state': u''}

In [4]:
def bar(xs, ys):
    fig = plt.figure(figsize=(8,5))
    ax = fig.add_subplot(111, axisbg='#EEEEEE')
    ax.grid(color='white', linestyle='solid')
    ax.bar(xs, ys, fc='lightblue', alpha=0.8, edgecolor="white")

In [5]:
ages = Counter(int(x['age']) for x in runners.itervalues())
xs = range(min(ages.keys()), max(ages.keys()))
ys = [ages[i] for i in xs]
bar(xs, ys)
mpld3.display()


Out[5]:

In [6]:
from collections import Counter
men = Counter(int(x['age']) for x in runners.itervalues() if x['gender']=="M")
mxs = range(min(men.keys()), max(men.keys()))
mys = [men[i] for i in mxs]

wmn = Counter(int(x['age']) for x in runners.itervalues() if x['gender']=="F")
wxs = range(min(wmn.keys()), max(wmn.keys()))
wys = [wmn[i] for i in wxs]

fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111, axisbg='#EEEEEE', title="2013 Boston Marathon participants, by age and gender")
ax.grid(color='white', linestyle='solid')
b1 = ax.bar(mxs, mys, color='lightblue', alpha=1, edgecolor="white", label="Men")
b2 = ax.bar(wxs, wys, color='pink', alpha=0.6, edgecolor="white", label="Women")
ax.legend()

mpld3.display()


Out[6]:

In [7]:
def minutes(time):
    parts = [int(x) for x in runners[105]['official'].split(':')]
    return parts[0] * 60 + parts[1] + parts[2]/60.

In [15]:
r2014 = cPickle.load(file("results/2014/results.pkl"))

In [16]:
len(r2014)


Out[16]:
31984

In [17]:
r2014["30592"]


Out[17]:
{'10k': u'1:05:10',
 '20k': u'2:09:08',
 '25k': u'2:48:22',
 '30k': u'3:25:35',
 '35k': u'4:14:55',
 '40k': u'5:19:26',
 '5k': u'0:33:01',
 'age': u'37',
 'bib': u'30592',
 'city': u'Portland',
 'country': u'USA',
 'ctz': u'',
 'division': u'6476',
 'gender': u'F',
 'genderdiv': u'13327',
 'half': u'2:16:18',
 'name': u'Wood, Samantha L.',
 'official': u'5:39:06',
 'overall': u'30071',
 'pace': u'0:12:57',
 'state': u'ME'}

In [18]:
from collections import Counter
men = Counter(int(x['age']) for x in r2014.itervalues() if x['gender']=="M")
mxs = range(min(men.keys()), max(men.keys()))
mys = [men[i] for i in mxs]

wmn = Counter(int(x['age']) for x in r2014.itervalues() if x['gender']=="F")
wxs = range(min(wmn.keys()), max(wmn.keys()))
wys = [wmn[i] for i in wxs]

fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111, axisbg='#EEEEEE', title="2014 Boston Marathon participants, by age and gender")
ax.grid(color='white', linestyle='solid')
b1 = ax.bar(mxs, mys, color='lightblue', alpha=1, edgecolor="white", label="Men")
b2 = ax.bar(wxs, wys, color='pink', alpha=0.6, edgecolor="white", label="Women")
ax.legend()

mpld3.display()


Out[18]:

In [27]:
df = pd.read_csv("results/2014/results.csv")

In [29]:
df.head(10)


Out[29]:
10k name division 25k gender age official bib genderdiv ctz 35k overall pace state 30k 5k half 20k country city
0 17.37 Yamamoto, Hiroyuki 8 47.67 M 47 85.25 W1 8 NaN 71.40 8 3.27 NaN 59.18 8.02 39.72 37.65 JPN Fukuoka ...
1 32.58 Jeptoo, Rita 1 82.43 F 33 138.95 F1 1 NaN 116.37 21 5.30 NaN 99.33 16.22 69.47 65.83 KEN Eldoret ...
2 16.62 Van Dyk, Ernst F. 1 45.80 M 41 80.60 W2 1 NaN 67.42 1 3.08 NaN 56.45 7.75 38.03 36.10 RSA Paarl ...
3 32.57 Dibaba, Mare 3 82.43 F 24 140.58 F2 3 NaN 116.37 27 5.37 NaN 99.33 16.20 69.47 65.83 ETH Shoa ...
4 17.12 Hokinoue, Kota 2 46.37 M 40 81.23 W3 2 NaN 67.83 2 3.10 NaN 57.03 8.02 38.60 36.58 JPN Nogata Fukuoka ...
5 32.58 Sumgong, Jemima Jelagat 4 82.45 F 29 140.68 F3 4 NaN 116.37 28 5.37 NaN 99.33 16.22 69.47 65.83 KEN Nandi ...
6 17.65 Hug, Marcel E. 4 47.67 M 28 84.65 W4 4 NaN 70.23 4 3.23 NaN 58.60 8.38 39.72 37.65 SUI Neuenkirch ...
7 30.48 Geneti, Markos 5 76.95 M 29 129.83 5 5 NaN 107.47 5 4.97 NaN 92.52 15.17 64.85 61.62 ETH Addis Ababa ...
8 17.12 Soejima, Masazumi 3 46.37 M 43 81.23 W6 3 NaN 67.83 3 3.10 NaN 57.03 8.00 38.60 36.60 JPN Isahaya ...
9 30.48 Hall, Ryan 20 77.68 M 31 137.83 6 20 NaN 112.27 20 5.27 CA 94.78 15.15 65.23 61.78 USA Redding ...

10 rows × 21 columns


In [25]:
df.describe()


Out[25]:
division age official genderdiv overall pace
count 31984.000000 31984.000000 31984.000000 31984.000000 31984.000000 31984.000000
mean 1932.563032 42.407079 242.997314 8051.044741 15939.587825 9.275658
std 1715.228694 11.316496 52.300431 4754.005626 9232.978224 1.992486
min 1.000000 18.000000 80.600000 1.000000 1.000000 3.080000
25% 610.000000 33.000000 205.527500 3972.000000 7943.750000 7.850000
50% 1425.000000 42.000000 232.370000 7970.000000 15939.500000 8.870000
75% 2611.000000 50.000000 273.235000 11968.000000 23935.250000 10.430000
max 6979.000000 81.000000 538.880000 17575.000000 31931.000000 20.570000

8 rows × 6 columns


In [99]:
df['official'].groupby(pd.cut(df['age'], range(15,90,5))).aggregate(np.average).plot(kind="bar", title="Average time by age group")


Out[99]:
<matplotlib.axes.AxesSubplot at 0x12da2f390>

In [101]:
df['official'].groupby(pd.cut(df['age'], range(15,90,5))).aggregate(len).plot(kind="bar", title="# of runners by age group")


Out[101]:
<matplotlib.axes.AxesSubplot at 0x12d08da10>

In [180]:
#Q: What's the distribution of finish times *within* an age group... small multiples
agegroups = range(15,90,5)
agebins = pd.cut(df['age'], agegroups,
                 labels=['{}-{}'.format(age,age+5) for age in agegroups][:-1])
times = range(80, 530, 50)
timebins = pd.cut(df['official'], times,
                  labels=['{}-{}'.format(t, t+50) for t in times][:-1])
age_and_time = df['official'].groupby([agebins, timebins]).aggregate(len)
#age_and_time = age_and_time.to_frame()
age_and_time.plot(kind="bar")
age_and_time.index.get_level_values(0)
#age_and_time["(15, 20]"]
#x = [age_and_time[ax] for ax in age_and_time.axes[0].levels[0]]
#x[0].values


Out[180]:
Index([u'15-20', u'15-20', u'15-20', u'15-20', u'15-20', u'15-20', u'15-20', u'20-25', u'20-25', u'20-25', u'20-25', u'20-25', u'20-25', u'20-25', u'20-25', u'25-30', u'25-30', u'25-30', u'25-30', u'25-30', u'25-30', u'25-30', u'25-30', u'30-35', u'30-35', u'30-35', u'30-35', u'30-35', u'30-35', u'30-35', u'30-35', u'35-40', u'35-40', u'35-40', u'35-40', u'35-40', u'35-40', u'35-40', u'35-40', u'40-45', u'40-45', u'40-45', u'40-45', u'40-45', u'40-45', u'40-45', u'40-45', u'45-50', u'45-50', u'45-50', u'45-50', u'45-50', u'45-50', u'45-50', u'45-50', u'50-55', u'50-55', u'50-55', u'50-55', u'50-55', u'50-55', u'50-55', u'55-60', u'55-60', u'55-60', u'55-60', u'55-60', u'55-60', u'55-60', u'60-65', u'60-65', u'60-65', u'60-65', u'60-65', u'60-65', u'60-65', u'65-70', u'65-70', u'65-70', u'65-70', u'65-70', u'65-70', u'70-75', u'70-75', u'70-75', u'70-75', u'70-75', u'70-75', u'75-80', u'75-80', u'75-80', u'75-80', u'75-80', u'80-85', u'80-85'], dtype='object')

In [161]:
pd.cut?

In [157]:
years = []
for year in range(2001, 2015):
    y = pd.read_csv("results/{}/results.csv".format(year), na_values="-")[["official", "gender", "age"]]
    y["year"] = str(year)
    years.append(y)
alltimes = pd.concat(years, ignore_index=True).dropna()

In [138]:
plt.figure()
alltimes.boxplot(column='official', by='year')


Out[138]:
<matplotlib.axes.AxesSubplot at 0x11e43ee50>
<matplotlib.figure.Figure at 0x116cf8290>

In [136]:
seaborn.set(context = 'notebook', style = 'darkgrid')
# not sure how to do this non-globally?
seaborn.set_context(rc={"figure.figsize": (15, 10)})

f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014")
seaborn.boxplot(pd.Series(alltimes.loc[:, "official"], name="Time in minutes"), groupby=alltimes.year)


Out[136]:
<matplotlib.axes.AxesSubplot at 0x121df3390>

In [131]:
f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014")
seaborn.violinplot(pd.Series(alltimes.loc[:, "official"], name="time in minutes"), groupby=alltimes.year, ax=ax1)


Out[131]:
<matplotlib.axes.AxesSubplot at 0x119b6c750>

In [132]:
f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014 by gender")
seaborn.violinplot(pd.Series(alltimes.loc[:, "official"], name="time in minutes"), groupby=[alltimes.year, alltimes.gender], ax=ax1)


Out[132]:
<matplotlib.axes.AxesSubplot at 0x119e18090>

In [144]:
# wow that is a hideous, useless plot. Looks like women finish a predictable amount worse than men every year?
# I wonder how, across all years, age groups do. (That one might benefit from a gender split, more than the above)
# Also TODO: a map of the states & countries of Boston Marathon participants

alltimes
agegroups = range(15,90,5)
agebins = pd.cut(alltimes['age'], agegroups,
                 labels=['{}-{}'.format(age,age+5) for age in agegroups][:-1])

f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014 by age group")
seaborn.violinplot(pd.Series(alltimes.loc[:, "official"], name="time in minutes"), groupby=agebins, ax=ax1)


Out[144]:
<matplotlib.axes.AxesSubplot at 0x119f12a10>

In [154]:
# wow that is a hideous, useless plot. Looks like women finish a predictable amount worse than men every year?
# I wonder how, across all years, age groups do. (That one might benefit from a gender split, more than the above)
# Also TODO: a map of the states & countries of Boston Marathon participants

alltimes
agegroups = range(15,90,5)
agebins = pd.cut(alltimes['age'], agegroups,
                 labels=['{}-{}'.format(age,age+5) for age in agegroups][:-1])

f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014 by age group")
seaborn.boxplot(pd.Series(alltimes.loc[:, "official"], name="time in minutes"), groupby=agebins, ax=ax1)


Out[154]:
<matplotlib.axes.AxesSubplot at 0x120f0af10>

# wow that is a hideous, useless plot. Looks like women finish a predictable amount worse than men every year? # I wonder how, across all years, age groups do. (That one might benefit from a gender split, more than the above) # Also TODO: a map of the states & countries of Boston Marathon participants alltimes agegroups = range(15,90,5) agebins = pd.cut(alltimes['age'], agegroups, labels=['{}-{}'.format(age,age+5) for age in agegroups][:-1]) f, ax1 = plt.subplots(1) ax1.set_title("Boston Marathon times 2001-2014 by age group") seaborn.violinplot(pd.Series(alltimes.loc[:, "official"], name="time in minutes"), groupby=[alltimes.gender, agebins], ax=ax1) g = alltimes.groupby([agebins, alltimes.gender]) g.head()


In [167]:
years = []
for year in range(2001, 2015):
    y = pd.read_csv("results/{}/results.csv".format(year), na_values="-")[["state"]]
    years.append(y)
states = pd.concat(years, ignore_index=True).dropna()
g = states.groupby("state") #.aggregate(len)
h = g.count()

In [170]:
import json
json.dumps(h.to_dict()['state'])


Out[170]:
'{"WA": 4882, "BC": 2914, "VA": 6611, "DE": 739, "DC": 1595, "WI": 4591, "WV": 474, "HI": 543, "CO": 5111, "FL": 6676, "FM": 2, "WY": 249, "NH": 4469, "SK": 350, "NJ": 5860, "PQ": 1344, "NL": 96, "NM": 720, "TX": 9662, "LA": 734, "NB": 679, "NC": 4737, "ND": 296, "NE": 837, "NF": 113, "YT": 23, "TN": 2407, "NY": 15299, "PA": 9399, "PE": 194, "NS": 1102, "NT": 13, "CA": 19467, "NV": 850, "AA": 12, "PR": 374, "GU": 7, "AB": 2338, "AE": 75, "PW": 1, "ON": 11940, "VI": 27, "AK": 554, "OH": 8111, "AL": 1028, "AP": 19, "AS": 1, "AR": 567, "VT": 1443, "IL": 10158, "GA": 3682, "IN": 3009, "IA": 1693, "OK": 898, "AZ": 2686, "ID": 873, "CT": 5082, "ME": 2222, "MD": 4941, "MA": 58667, "MB": 437, "UT": 3240, "MO": 2298, "MN": 5102, "MI": 7184, "RI": 1968, "KS": 1491, "MT": 563, "QC": 1527, "MS": 395, "SC": 1532, "KY": 1399, "OR": 3160, "SD": 269}'

In [172]:
dict(sorted(h.to_dict().iteritems()))


Out[172]:
{'state': {'AA': 12,
  'AB': 2338,
  'AE': 75,
  'AK': 554,
  'AL': 1028,
  'AP': 19,
  'AR': 567,
  'AS': 1,
  'AZ': 2686,
  'BC': 2914,
  'CA': 19467,
  'CO': 5111,
  'CT': 5082,
  'DC': 1595,
  'DE': 739,
  'FL': 6676,
  'FM': 2,
  'GA': 3682,
  'GU': 7,
  'HI': 543,
  'IA': 1693,
  'ID': 873,
  'IL': 10158,
  'IN': 3009,
  'KS': 1491,
  'KY': 1399,
  'LA': 734,
  'MA': 58667,
  'MB': 437,
  'MD': 4941,
  'ME': 2222,
  'MI': 7184,
  'MN': 5102,
  'MO': 2298,
  'MS': 395,
  'MT': 563,
  'NB': 679,
  'NC': 4737,
  'ND': 296,
  'NE': 837,
  'NF': 113,
  'NH': 4469,
  'NJ': 5860,
  'NL': 96,
  'NM': 720,
  'NS': 1102,
  'NT': 13,
  'NV': 850,
  'NY': 15299,
  'OH': 8111,
  'OK': 898,
  'ON': 11940,
  'OR': 3160,
  'PA': 9399,
  'PE': 194,
  'PQ': 1344,
  'PR': 374,
  'PW': 1,
  'QC': 1527,
  'RI': 1968,
  'SC': 1532,
  'SD': 269,
  'SK': 350,
  'TN': 2407,
  'TX': 9662,
  'UT': 3240,
  'VA': 6611,
  'VI': 27,
  'VT': 1443,
  'WA': 4882,
  'WI': 4591,
  'WV': 474,
  'WY': 249,
  'YT': 23}}

In [173]:
years = []
for year in range(2001, 2015):
    y = pd.read_csv("results/{}/results.csv".format(year), na_values="-")[["country"]]
    years.append(y)
states = pd.concat(years, ignore_index=True).dropna()
g = states.groupby("country") #.aggregate(len)
h = g.count()

In [174]:
json.dumps(h.to_dict()['country'])


Out[174]:
'{"LIE": 1, "EGY": 3, "LIB": 2, "QAT": 4, "PAR": 2, "BOL": 3, "SIN": 98, "PAN": 17, "PRK": 1, "TAN": 2, "UAE": 34, "HKG": 211, "HAI": 2, "TPE": 19, "SVK": 29, "CHI": 172, "MAS": 14, "CHN": 79, "URU": 16, "JAM": 8, "SUI": 474, "ZIM": 5, "FIN": 99, "THA": 3, "PHI": 13, "MAR": 8, "AHO": 4, "LAT": 7, "KAZ": 1, "GUA": 57, "BEL": 220, "CRC": 142, "KSA": 7, "DEN": 312, "BER": 191, "CMR": 5, "GER": 1778, "ROM": 7, "SCG": 1, "ROU": 2, "TCA": 1, "TRI": 11, "VGB": 3, "BLR": 2, "GRE": 103, "ANG": 1, "MON": 1, "IND": 17, "INA": 1, "NOR": 86, "CZE": 30, "ESA": 17, "DOM": 31, "LUX": 19, "ISR": 44, "NED": 286, "PER": 68, "ISL": 154, "ETH": 63, "COL": 151, "NEP": 1, "SER": 1, "ECU": 89, "FRA": 707, "LTU": 8, "TWN": 7, "AUS": 639, "GBR": 1837, "AUT": 165, "VEN": 145, "KEN": 145, "TUR": 12, "ITA": 1168, "BRN": 1, "TUN": 1, "RUS": 92, "MEX": 1414, "BRA": 481, "CAY": 21, "BAR": 3, "NGR": 1, "USA": 240937, "SWE": 222, "UKR": 9, "CAN": 23070, "KOR": 1489, "BAH": 20, "CYP": 1, "POR": 93, "CRO": 6, "POL": 109, "EST": 11, "ESP": 384, "SLO": 33, "IRL": 705, "MLT": 1, "NZL": 147, "ARU": 3, "JPN": 1351, "RSA": 100, "ARM": 2, "ARG": 96, "HUN": 24}'

In [ ]: