notebook.community

Edit and run



In [133]:

    
%matplotlib inline

import cPickle
import json
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt
import mpld3, mpld3.plugins
import numpy as np
import pandas as pd
import seaborn

# Notebook-wide display defaults, set once so that later figures inherit them.
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
# Default matplotlib figure size, given as (width, height).
plt.rcParams['figure.figsize'] = (15, 5)



In [12]:

    
# Load the pickled 2013 results into `runners`.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.
# The file is opened with the default mode; the `with` block guarantees the
# handle is closed once loading finishes instead of being leaked.
with open("results/2013/results.pkl") as results_file:
    runners = cPickle.load(results_file)



In [3]:

    
# Peek at a single record (looked up by the string key "1") to see its fields.
runners["1"]









    Out[3]:





{'10k': u'0:30:54',
 '20k': u'1:01:31',
 '25k': u'1:17:16',
 '30k': u'1:32:58',
 '35k': u'1:48:47',
 '40k': u'2:04:46',
 '5k': u'0:15:54',
 'age': u'30',
 'bib': u'1',
 'city': u'Kenya',
 'country': u'KEN',
 'ctz': u'',
 'division': u'5',
 'gender': u'M',
 'genderdiv': u'5',
 'half': u'1:04:54',
 'name': u'Korir, Wesley',
 'official': u'2:12:30',
 'overall': u'5',
 'pace': u'0:05:04',
 'state': u''}



In [4]:

    
def bar(xs, ys):
    """Draw a light-blue bar chart with bars of height ``ys`` at positions ``xs``.

    A new 8x5 figure is created, with a light grey axes background and a
    solid white grid. Nothing is returned.
    """
    figure = plt.figure(figsize=(8, 5))
    axes = figure.add_subplot(111, axisbg='#EEEEEE')
    axes.grid(color='white', linestyle='solid')
    axes.bar(xs, ys, fc='lightblue', alpha=0.8, edgecolor="white")



In [5]:

    
# Histogram of runner ages: one bar per year of age.
ages = Counter(int(x['age']) for x in runners.itervalues())
# range() excludes its stop value, so add 1 to keep the oldest age on the chart.
xs = range(min(ages), max(ages) + 1)
# A Counter yields 0 for ages that no runner has, so gaps become empty bars.
ys = [ages[i] for i in xs]
bar(xs, ys)
mpld3.display()









    Out[5]:



In [6]:

    
# Age distribution of the 2013 field, men and women overlaid on the same axes.
# (Counter is already imported in the notebook's first cell.)
men = Counter(int(x['age']) for x in runners.itervalues() if x['gender']=="M")
# range() excludes its stop value; +1 keeps the oldest age in the plot.
mxs = range(min(men), max(men) + 1)
mys = [men[i] for i in mxs]

wmn = Counter(int(x['age']) for x in runners.itervalues() if x['gender']=="F")
wxs = range(min(wmn), max(wmn) + 1)
wys = [wmn[i] for i in wxs]

fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111, axisbg='#EEEEEE', title="2013 Boston Marathon participants, by age and gender")
ax.grid(color='white', linestyle='solid')
# The women's bars are semi-transparent so the men's bars stay visible behind them.
b1 = ax.bar(mxs, mys, color='lightblue', alpha=1, edgecolor="white", label="Men")
b2 = ax.bar(wxs, wys, color='pink', alpha=0.6, edgecolor="white", label="Women")
ax.legend()

mpld3.display()









    Out[6]:



In [7]:

    
def minutes(time):
    """Convert an ``H:MM:SS`` time string into a number of minutes.

    Parameters
    ----------
    time : str or unicode
        Colon-separated hours, minutes and seconds, e.g. ``u'2:12:30'``.

    Returns
    -------
    float
        Total minutes, with the seconds as a fractional part
        (``'2:12:30'`` gives ``132.5``).

    Raises
    ------
    ValueError
        If one of the fields is not an integer.
    IndexError
        If fewer than three colon-separated fields are present.
    """
    # Parse the argument itself rather than a hard-coded runner record.
    parts = [int(x) for x in time.split(':')]
    # Dividing by the float 60. keeps the seconds fraction under Python 2 as well.
    return parts[0] * 60 + parts[1] + parts[2]/60.



In [15]:

    
# Load the pickled 2014 results into `r2014`.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.
# The file is opened with the default mode; the `with` block guarantees the
# handle is closed once loading finishes instead of being leaked.
with open("results/2014/results.pkl") as results_file:
    r2014 = cPickle.load(results_file)



In [16]:

    
# Number of entries in the 2014 results.
len(r2014)









    Out[16]:





31984



In [17]:

    
# Inspect one 2014 record, looked up by the string key "30592".
r2014["30592"]









    Out[17]:





{'10k': u'1:05:10',
 '20k': u'2:09:08',
 '25k': u'2:48:22',
 '30k': u'3:25:35',
 '35k': u'4:14:55',
 '40k': u'5:19:26',
 '5k': u'0:33:01',
 'age': u'37',
 'bib': u'30592',
 'city': u'Portland',
 'country': u'USA',
 'ctz': u'',
 'division': u'6476',
 'gender': u'F',
 'genderdiv': u'13327',
 'half': u'2:16:18',
 'name': u'Wood, Samantha L.',
 'official': u'5:39:06',
 'overall': u'30071',
 'pace': u'0:12:57',
 'state': u'ME'}



In [18]:

    
# Age distribution of the 2014 field, men and women overlaid on the same axes.
# (Counter is already imported in the notebook's first cell.)
men = Counter(int(x['age']) for x in r2014.itervalues() if x['gender']=="M")
# range() excludes its stop value; +1 keeps the oldest age in the plot.
mxs = range(min(men), max(men) + 1)
mys = [men[i] for i in mxs]

wmn = Counter(int(x['age']) for x in r2014.itervalues() if x['gender']=="F")
wxs = range(min(wmn), max(wmn) + 1)
wys = [wmn[i] for i in wxs]

fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111, axisbg='#EEEEEE', title="2014 Boston Marathon participants, by age and gender")
ax.grid(color='white', linestyle='solid')
# The women's bars are semi-transparent so the men's bars stay visible behind them.
b1 = ax.bar(mxs, mys, color='lightblue', alpha=1, edgecolor="white", label="Men")
b2 = ax.bar(wxs, wys, color='pink', alpha=0.6, edgecolor="white", label="Women")
ax.legend()

mpld3.display()









    Out[18]:



In [27]:

    
# Load the 2014 results from CSV into a DataFrame.
df = pd.read_csv("results/2014/results.csv")



In [29]:

    
# Show the first ten rows to check the columns and value formats.
df.head(10)









    Out[29]:






  
    
      
      10k
      name
      division
      25k
      gender
      age
      official
      bib
      genderdiv
      ctz
      35k
      overall
      pace
      state
      30k
      5k
      half
      20k
      country
      city
      
    
  
  
    
      0
       17.37
            Yamamoto, Hiroyuki
        8
       47.67
       M
       47
        85.25
       W1
        8
       NaN
        71.40
        8
       3.27
       NaN
       59.18
        8.02
       39.72
       37.65
       JPN
              Fukuoka
      ...
    
    
      1
       32.58
                  Jeptoo, Rita
        1
       82.43
       F
       33
       138.95
       F1
        1
       NaN
       116.37
       21
       5.30
       NaN
       99.33
       16.22
       69.47
       65.83
       KEN
              Eldoret
      ...
    
    
      2
       16.62
             Van Dyk, Ernst F.
        1
       45.80
       M
       41
        80.60
       W2
        1
       NaN
        67.42
        1
       3.08
       NaN
       56.45
        7.75
       38.03
       36.10
       RSA
                Paarl
      ...
    
    
      3
       32.57
                  Dibaba, Mare
        3
       82.43
       F
       24
       140.58
       F2
        3
       NaN
       116.37
       27
       5.37
       NaN
       99.33
       16.20
       69.47
       65.83
       ETH
                 Shoa
      ...
    
    
      4
       17.12
                Hokinoue, Kota
        2
       46.37
       M
       40
        81.23
       W3
        2
       NaN
        67.83
        2
       3.10
       NaN
       57.03
        8.02
       38.60
       36.58
       JPN
       Nogata Fukuoka
      ...
    
    
      5
       32.58
       Sumgong, Jemima Jelagat
        4
       82.45
       F
       29
       140.68
       F3
        4
       NaN
       116.37
       28
       5.37
       NaN
       99.33
       16.22
       69.47
       65.83
       KEN
                Nandi
      ...
    
    
      6
       17.65
                Hug, Marcel E.
        4
       47.67
       M
       28
        84.65
       W4
        4
       NaN
        70.23
        4
       3.23
       NaN
       58.60
        8.38
       39.72
       37.65
       SUI
           Neuenkirch
      ...
    
    
      7
       30.48
                Geneti, Markos
        5
       76.95
       M
       29
       129.83
        5
        5
       NaN
       107.47
        5
       4.97
       NaN
       92.52
       15.17
       64.85
       61.62
       ETH
          Addis Ababa
      ...
    
    
      8
       17.12
             Soejima, Masazumi
        3
       46.37
       M
       43
        81.23
       W6
        3
       NaN
        67.83
        3
       3.10
       NaN
       57.03
        8.00
       38.60
       36.60
       JPN
              Isahaya
      ...
    
    
      9
       30.48
                    Hall, Ryan
       20
       77.68
       M
       31
       137.83
        6
       20
       NaN
       112.27
       20
       5.27
        CA
       94.78
       15.15
       65.23
       61.78
       USA
              Redding
      ...
    
  

10 rows × 21 columns



In [25]:

    
# Summary statistics for the numeric columns.
df.describe()









    Out[25]:






  
    
      
      division
      age
      official
      genderdiv
      overall
      pace
    
  
  
    
      count
       31984.000000
       31984.000000
       31984.000000
       31984.000000
       31984.000000
       31984.000000
    
    
      mean
        1932.563032
          42.407079
         242.997314
        8051.044741
       15939.587825
           9.275658
    
    
      std
        1715.228694
          11.316496
          52.300431
        4754.005626
        9232.978224
           1.992486
    
    
      min
           1.000000
          18.000000
          80.600000
           1.000000
           1.000000
           3.080000
    
    
      25%
         610.000000
          33.000000
         205.527500
        3972.000000
        7943.750000
           7.850000
    
    
      50%
        1425.000000
          42.000000
         232.370000
        7970.000000
       15939.500000
           8.870000
    
    
      75%
        2611.000000
          50.000000
         273.235000
       11968.000000
       23935.250000
          10.430000
    
    
      max
        6979.000000
          81.000000
         538.880000
       17575.000000
       31931.000000
          20.570000
    
  

8 rows × 6 columns



In [99]:

    
# Average official finish time within each 5-year age bracket.
age_brackets = pd.cut(df['age'], range(15, 90, 5))
mean_time_by_bracket = df['official'].groupby(age_brackets).aggregate(np.average)
mean_time_by_bracket.plot(kind="bar", title="Average time by age group")









    Out[99]:





<matplotlib.axes.AxesSubplot at 0x12da2f390>



In [101]:

    
# Number of runners within each 5-year age bracket.
age_brackets = pd.cut(df['age'], range(15, 90, 5))
runners_per_bracket = df['official'].groupby(age_brackets).aggregate(len)
runners_per_bracket.plot(kind="bar", title="# of runners by age group")









    Out[101]:





<matplotlib.axes.AxesSubplot at 0x12d08da10>



In [180]:

    
# Q: how are finish times distributed *within* each age group?
# (A first step towards a small-multiples view.)
agegroups = range(15, 90, 5)
age_labels = ['{}-{}'.format(age, age + 5) for age in agegroups[:-1]]
agebins = pd.cut(df['age'], agegroups, labels=age_labels)

# NOTE(review): the last bin edge is 480, but df.describe() reports a maximum
# 'official' of 538.88, so the slowest finishers appear to fall outside every
# bin -- confirm whether that is intended.
times = range(80, 530, 50)
time_labels = ['{}-{}'.format(t, t + 50) for t in times[:-1]]
timebins = pd.cut(df['official'], times, labels=time_labels)

# Count runners for every (age bin, time bin) pair.
age_and_time = df['official'].groupby([agebins, timebins]).aggregate(len)
age_and_time.plot(kind="bar")
# Displayed as the cell output: the age-bin label of each grouped row.
age_and_time.index.get_level_values(0)









    Out[180]:





Index([u'15-20', u'15-20', u'15-20', u'15-20', u'15-20', u'15-20', u'15-20', u'20-25', u'20-25', u'20-25', u'20-25', u'20-25', u'20-25', u'20-25', u'20-25', u'25-30', u'25-30', u'25-30', u'25-30', u'25-30', u'25-30', u'25-30', u'25-30', u'30-35', u'30-35', u'30-35', u'30-35', u'30-35', u'30-35', u'30-35', u'30-35', u'35-40', u'35-40', u'35-40', u'35-40', u'35-40', u'35-40', u'35-40', u'35-40', u'40-45', u'40-45', u'40-45', u'40-45', u'40-45', u'40-45', u'40-45', u'40-45', u'45-50', u'45-50', u'45-50', u'45-50', u'45-50', u'45-50', u'45-50', u'45-50', u'50-55', u'50-55', u'50-55', u'50-55', u'50-55', u'50-55', u'50-55', u'55-60', u'55-60', u'55-60', u'55-60', u'55-60', u'55-60', u'55-60', u'60-65', u'60-65', u'60-65', u'60-65', u'60-65', u'60-65', u'60-65', u'65-70', u'65-70', u'65-70', u'65-70', u'65-70', u'65-70', u'70-75', u'70-75', u'70-75', u'70-75', u'70-75', u'70-75', u'75-80', u'75-80', u'75-80', u'75-80', u'75-80', u'80-85', u'80-85'], dtype='object')



In [161]:

    
# IPython help lookup for pd.cut (the trailing `?` is IPython syntax, not Python).
# NOTE(review): looks like a leftover development aid; it can probably be removed.
pd.cut?



In [157]:

    
# Build `alltimes`: finish time, gender and age for every year 2001-2014,
# with each row tagged by its race year (stored as a string).
years = []
for year in range(2001, 2015):
    csv_path = "results/{}/results.csv".format(year)
    # "-" in the CSV is read as a missing value.
    frame = pd.read_csv(csv_path, na_values="-")
    frame = frame[["official", "gender", "age"]]
    frame["year"] = str(year)
    years.append(frame)
# Stack the yearly frames, then drop rows with a missing value in any column.
alltimes = pd.concat(years, ignore_index=True)
alltimes = alltimes.dropna()



In [138]:

    
# Box plot of official finish times, one box per year.
# No plt.figure() call is needed: DataFrame.boxplot(by=...) creates its own
# figure, so a separate call would only leave an empty extra figure behind.
alltimes.boxplot(column='official', by='year')









    Out[138]:





<matplotlib.axes.AxesSubplot at 0x11e43ee50>






    





<matplotlib.figure.Figure at 0x116cf8290>



In [136]:

    
# Switch to seaborn styling and enlarge figures. Both calls change global
# settings, so they also affect every figure drawn after this cell.
seaborn.set(context = 'notebook', style = 'darkgrid')
# not sure how to do this non-globally?
seaborn.set_context(rc={"figure.figsize": (15, 10)})

# Box plot of finish times, one box per year.
f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014")
# Pass the axes explicitly so the plot is guaranteed to land on the titled
# axes instead of relying on whichever axes happens to be current.
seaborn.boxplot(pd.Series(alltimes.loc[:, "official"], name="Time in minutes"), groupby=alltimes.year, ax=ax1)









    Out[136]:





<matplotlib.axes.AxesSubplot at 0x121df3390>



In [131]:

    
# Violin plot of finish times, one violin per year.
finish_times = pd.Series(alltimes.loc[:, "official"], name="time in minutes")
f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014")
seaborn.violinplot(finish_times, groupby=alltimes.year, ax=ax1)









    Out[131]:





<matplotlib.axes.AxesSubplot at 0x119b6c750>



In [132]:

    
# Violin plot of finish times, one violin per (year, gender) pair.
finish_times = pd.Series(alltimes.loc[:, "official"], name="time in minutes")
year_and_gender = [alltimes.year, alltimes.gender]
f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014 by gender")
seaborn.violinplot(finish_times, groupby=year_and_gender, ax=ax1)









    Out[132]:





<matplotlib.axes.AxesSubplot at 0x119e18090>



In [144]:

    
# wow that is a hideous, useless plot. Looks like women finish a predictable amount worse than men every year?
# I wonder how, across all years, age groups do. (That one might benefit from a gender split, more than the above)
# Also TODO: a map of the states & countries of Boston Marathon participants

# Violin plot of finish times by 5-year age group, pooled over all years.
# (A stray bare `alltimes` expression, which had no effect, was removed.)
agegroups = range(15,90,5)
# One "lo-hi" label per bin; the final edge only closes the last bin.
agebins = pd.cut(alltimes['age'], agegroups,
                 labels=['{}-{}'.format(age,age+5) for age in agegroups][:-1])

f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014 by age group")
seaborn.violinplot(pd.Series(alltimes.loc[:, "official"], name="time in minutes"), groupby=agebins, ax=ax1)









    Out[144]:





<matplotlib.axes.AxesSubplot at 0x119f12a10>



In [154]:

    
# wow that is a hideous, useless plot. Looks like women finish a predictable amount worse than men every year?
# I wonder how, across all years, age groups do. (That one might benefit from a gender split, more than the above)
# Also TODO: a map of the states & countries of Boston Marathon participants

# Box plot of finish times by 5-year age group, pooled over all years.
# (A stray bare `alltimes` expression, which had no effect, was removed.)
agegroups = range(15,90,5)
# One "lo-hi" label per bin; the final edge only closes the last bin.
agebins = pd.cut(alltimes['age'], agegroups,
                 labels=['{}-{}'.format(age,age+5) for age in agegroups][:-1])

f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014 by age group")
seaborn.boxplot(pd.Series(alltimes.loc[:, "official"], name="time in minutes"), groupby=agebins, ax=ax1)









    Out[154]:





<matplotlib.axes.AxesSubplot at 0x120f0af10>

# wow that is a hideous, useless plot. Looks like women finish a predictable amount worse than men every year?
# I wonder how, across all years, age groups do. (That one might benefit from a gender split, more than the above)
# Also TODO: a map of the states & countries of Boston Marathon participants

agegroups = range(15, 90, 5)
age_labels = ['{}-{}'.format(age, age + 5) for age in agegroups[:-1]]
agebins = pd.cut(alltimes['age'], agegroups, labels=age_labels)

# Violin plot of finish times, one violin per (gender, age group) pair.
finish_times = pd.Series(alltimes.loc[:, "official"], name="time in minutes")
f, ax1 = plt.subplots(1)
ax1.set_title("Boston Marathon times 2001-2014 by age group")
seaborn.violinplot(finish_times, groupby=[alltimes.gender, agebins], ax=ax1)

# Group the rows by (age group, gender) and show the first rows of each group.
g = alltimes.groupby([agebins, alltimes.gender])
g.head()



In [167]:

    
# Count rows per value of the 'state' column across all years 2001-2014.
years = []
for year in range(2001, 2015):
    csv_path = "results/{}/results.csv".format(year)
    # "-" in the CSV is read as a missing value.
    frame = pd.read_csv(csv_path, na_values="-")
    years.append(frame[["state"]])
# Rows without a state are dropped before counting.
states = pd.concat(years, ignore_index=True).dropna()
g = states.groupby("state")
h = g.count()



In [170]:

    
# Serialise the per-state counts as a JSON object string.
# `json` is imported together with the other modules in the notebook's first
# cell rather than here, so all dependencies are declared in one place.
json.dumps(h.to_dict()['state'])









    Out[170]:





'{"WA": 4882, "BC": 2914, "VA": 6611, "DE": 739, "DC": 1595, "WI": 4591, "WV": 474, "HI": 543, "CO": 5111, "FL": 6676, "FM": 2, "WY": 249, "NH": 4469, "SK": 350, "NJ": 5860, "PQ": 1344, "NL": 96, "NM": 720, "TX": 9662, "LA": 734, "NB": 679, "NC": 4737, "ND": 296, "NE": 837, "NF": 113, "YT": 23, "TN": 2407, "NY": 15299, "PA": 9399, "PE": 194, "NS": 1102, "NT": 13, "CA": 19467, "NV": 850, "AA": 12, "PR": 374, "GU": 7, "AB": 2338, "AE": 75, "PW": 1, "ON": 11940, "VI": 27, "AK": 554, "OH": 8111, "AL": 1028, "AP": 19, "AS": 1, "AR": 567, "VT": 1443, "IL": 10158, "GA": 3682, "IN": 3009, "IA": 1693, "OK": 898, "AZ": 2686, "ID": 873, "CT": 5082, "ME": 2222, "MD": 4941, "MA": 58667, "MB": 437, "UT": 3240, "MO": 2298, "MN": 5102, "MI": 7184, "RI": 1968, "KS": 1491, "MT": 563, "QC": 1527, "MS": 395, "SC": 1532, "KY": 1399, "OR": 3160, "SD": 269}'



In [172]:

    
# Show the counts as a plain dict.
# NOTE(review): dict() discards the order produced by sorted(), so this looks
# equivalent to h.to_dict(); any ordering seen in the output presumably comes
# from how the notebook displays dicts -- confirm before relying on it.
dict(sorted(h.to_dict().iteritems()))









    Out[172]:





{'state': {'AA': 12,
  'AB': 2338,
  'AE': 75,
  'AK': 554,
  'AL': 1028,
  'AP': 19,
  'AR': 567,
  'AS': 1,
  'AZ': 2686,
  'BC': 2914,
  'CA': 19467,
  'CO': 5111,
  'CT': 5082,
  'DC': 1595,
  'DE': 739,
  'FL': 6676,
  'FM': 2,
  'GA': 3682,
  'GU': 7,
  'HI': 543,
  'IA': 1693,
  'ID': 873,
  'IL': 10158,
  'IN': 3009,
  'KS': 1491,
  'KY': 1399,
  'LA': 734,
  'MA': 58667,
  'MB': 437,
  'MD': 4941,
  'ME': 2222,
  'MI': 7184,
  'MN': 5102,
  'MO': 2298,
  'MS': 395,
  'MT': 563,
  'NB': 679,
  'NC': 4737,
  'ND': 296,
  'NE': 837,
  'NF': 113,
  'NH': 4469,
  'NJ': 5860,
  'NL': 96,
  'NM': 720,
  'NS': 1102,
  'NT': 13,
  'NV': 850,
  'NY': 15299,
  'OH': 8111,
  'OK': 898,
  'ON': 11940,
  'OR': 3160,
  'PA': 9399,
  'PE': 194,
  'PQ': 1344,
  'PR': 374,
  'PW': 1,
  'QC': 1527,
  'RI': 1968,
  'SC': 1532,
  'SD': 269,
  'SK': 350,
  'TN': 2407,
  'TX': 9662,
  'UT': 3240,
  'VA': 6611,
  'VI': 27,
  'VT': 1443,
  'WA': 4882,
  'WI': 4591,
  'WV': 474,
  'WY': 249,
  'YT': 23}}



In [173]:

    
# Count rows per value of the 'country' column across all years 2001-2014.
# NOTE(review): the result is stored under the names `states`, `g` and `h`,
# replacing the per-state values from the earlier cell -- consider distinct names.
years = []
for year in range(2001, 2015):
    csv_path = "results/{}/results.csv".format(year)
    # "-" in the CSV is read as a missing value.
    frame = pd.read_csv(csv_path, na_values="-")
    years.append(frame[["country"]])
# Rows without a country are dropped before counting.
states = pd.concat(years, ignore_index=True).dropna()
g = states.groupby("country")
h = g.count()



In [174]:

    
# Serialise the per-country counts as a JSON object string.
json.dumps(h.to_dict()['country'])









    Out[174]:





'{"LIE": 1, "EGY": 3, "LIB": 2, "QAT": 4, "PAR": 2, "BOL": 3, "SIN": 98, "PAN": 17, "PRK": 1, "TAN": 2, "UAE": 34, "HKG": 211, "HAI": 2, "TPE": 19, "SVK": 29, "CHI": 172, "MAS": 14, "CHN": 79, "URU": 16, "JAM": 8, "SUI": 474, "ZIM": 5, "FIN": 99, "THA": 3, "PHI": 13, "MAR": 8, "AHO": 4, "LAT": 7, "KAZ": 1, "GUA": 57, "BEL": 220, "CRC": 142, "KSA": 7, "DEN": 312, "BER": 191, "CMR": 5, "GER": 1778, "ROM": 7, "SCG": 1, "ROU": 2, "TCA": 1, "TRI": 11, "VGB": 3, "BLR": 2, "GRE": 103, "ANG": 1, "MON": 1, "IND": 17, "INA": 1, "NOR": 86, "CZE": 30, "ESA": 17, "DOM": 31, "LUX": 19, "ISR": 44, "NED": 286, "PER": 68, "ISL": 154, "ETH": 63, "COL": 151, "NEP": 1, "SER": 1, "ECU": 89, "FRA": 707, "LTU": 8, "TWN": 7, "AUS": 639, "GBR": 1837, "AUT": 165, "VEN": 145, "KEN": 145, "TUR": 12, "ITA": 1168, "BRN": 1, "TUN": 1, "RUS": 92, "MEX": 1414, "BRA": 481, "CAY": 21, "BAR": 3, "NGR": 1, "USA": 240937, "SWE": 222, "UKR": 9, "CAN": 23070, "KOR": 1489, "BAH": 20, "CYP": 1, "POR": 93, "CRO": 6, "POL": 109, "EST": 11, "ESP": 384, "SLO": 33, "IRL": 705, "MLT": 1, "NZL": 147, "ARU": 3, "JPN": 1351, "RSA": 100, "ARM": 2, "ARG": 96, "HUN": 24}'



In [ ]:

	10k	name	division	25k	gender	age	official	bib	genderdiv	ctz	35k	overall	pace	state	30k	5k	half	20k	country	city
0	17.37	Yamamoto, Hiroyuki	8	47.67	M	47	85.25	W1	8	NaN	71.40	8	3.27	NaN	59.18	8.02	39.72	37.65	JPN	Fukuoka	...
1	32.58	Jeptoo, Rita	1	82.43	F	33	138.95	F1	1	NaN	116.37	21	5.30	NaN	99.33	16.22	69.47	65.83	KEN	Eldoret	...
2	16.62	Van Dyk, Ernst F.	1	45.80	M	41	80.60	W2	1	NaN	67.42	1	3.08	NaN	56.45	7.75	38.03	36.10	RSA	Paarl	...
3	32.57	Dibaba, Mare	3	82.43	F	24	140.58	F2	3	NaN	116.37	27	5.37	NaN	99.33	16.20	69.47	65.83	ETH	Shoa	...
4	17.12	Hokinoue, Kota	2	46.37	M	40	81.23	W3	2	NaN	67.83	2	3.10	NaN	57.03	8.02	38.60	36.58	JPN	Nogata Fukuoka	...
5	32.58	Sumgong, Jemima Jelagat	4	82.45	F	29	140.68	F3	4	NaN	116.37	28	5.37	NaN	99.33	16.22	69.47	65.83	KEN	Nandi	...
6	17.65	Hug, Marcel E.	4	47.67	M	28	84.65	W4	4	NaN	70.23	4	3.23	NaN	58.60	8.38	39.72	37.65	SUI	Neuenkirch	...
7	30.48	Geneti, Markos	5	76.95	M	29	129.83	5	5	NaN	107.47	5	4.97	NaN	92.52	15.17	64.85	61.62	ETH	Addis Ababa	...
8	17.12	Soejima, Masazumi	3	46.37	M	43	81.23	W6	3	NaN	67.83	3	3.10	NaN	57.03	8.00	38.60	36.60	JPN	Isahaya	...
9	30.48	Hall, Ryan	20	77.68	M	31	137.83	6	20	NaN	112.27	20	5.27	CA	94.78	15.15	65.23	61.78	USA	Redding	...

	division	age	official	genderdiv	overall	pace
count	31984.000000	31984.000000	31984.000000	31984.000000	31984.000000	31984.000000
mean	1932.563032	42.407079	242.997314	8051.044741	15939.587825	9.275658
std	1715.228694	11.316496	52.300431	4754.005626	9232.978224	1.992486
min	1.000000	18.000000	80.600000	1.000000	1.000000	3.080000
25%	610.000000	33.000000	205.527500	3972.000000	7943.750000	7.850000
50%	1425.000000	42.000000	232.370000	7970.000000	15939.500000	8.870000
75%	2611.000000	50.000000	273.235000	11968.000000	23935.250000	10.430000
max	6979.000000	81.000000	538.880000	17575.000000	31931.000000	20.570000