In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Useful snippet from the solutions (sets a default figure size):
# from pylab import rcParams
# rcParams['figure.figsize'] = 10, 5
In [2]:
# Load the two CSV inputs; paths are relative to the notebook's working directory.
athletes, countries = map(
    pd.read_csv,
    ('data/athletes.csv', 'data/countries.csv'),
)
In [3]:
# Print (rows, columns) for each frame as a quick sanity check on the load.
for frame in (athletes, countries):
    print(frame.shape)
In [5]:
# Preview the first rows of the countries table.
countries.head()
Out[5]:
In [8]:
# Summary of the athletes table: columns, dtypes and non-null counts.
athletes.info()
In [4]:
# Preview the first rows of the athletes table.
athletes.head()
Out[4]:
In [12]:
# Convert the date-of-birth column to datetimes so the `.dt` accessor works on it.
# NOTE(review): if the CSV stores two-digit years, some dates may be parsed into
# the wrong century — confirm against the raw file.
athletes['dob'] = pd.to_datetime(athletes['dob'])
In [25]:
# Approximate age: 2016 minus the birth year. Day and month are ignored,
# so the value can be off by one year; that is accurate enough here.
athletes['age'] = athletes['dob'].dt.year.rsub(2016)
In [27]:
# Mean of the approximate age, per sex.
athletes.groupby('sex')['age'].mean()
Out[27]:
In [31]:
# The 20 most common full dates of birth (value_counts orders by frequency, descending).
athletes['dob'].value_counts().head(20)
Out[31]:
In [36]:
# Build a 'day/month' label (no zero padding, no year) for each date of birth
# so that birthdays can be compared across birth years.
athletes['birthday'] = athletes['dob'].map(lambda d: f'{d.day}/{d.month}')
In [38]:
# The 20 most common day/month birthdays.
athletes['birthday'].value_counts().head(20)
Out[38]:
In [39]:
# Per-athlete medal total: gold + silver + bronze.
athletes['total_medals'] = athletes['gold'].add(athletes['silver']).add(athletes['bronze'])
In [46]:
# Sum the medal columns per nationality and attach the totals to the countries table.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is deprecated and raises on current pandas.
# The left join keeps every row of `countries`; rows whose `code` has no matching
# nationality get NaN in the medal columns.
# NOTE: this reassigns `countries`, so re-running the cell without reloading the
# CSV merges the medal columns a second time.
medal_columns = ['gold', 'silver', 'bronze', 'total_medals']
medals_by_nationality = athletes.groupby('nationality')[medal_columns].sum()
countries = countries.merge(
    medals_by_nationality,
    how='left',
    left_on='code',
    right_index=True,
)
In [51]:
# Keep only countries with more than 100 medals in total, ordered from most to fewest.
over_100 = countries['total_medals'] > 100
countries_100_medals = countries.loc[over_100].sort_values('total_medals', ascending=False)
In [52]:
# Display the countries that passed the 100-medal filter.
countries_100_medals
Out[52]:
In [80]:
# Bar chart of total medals for the countries above the 100-medal threshold.
# x and y are passed by keyword because current seaborn no longer accepts them
# positionally; the axes is created explicitly and handed to seaborn.
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(x='country', y='total_medals', data=countries_100_medals, ax=ax)
ax.set_ylabel('total medals')
ax.set_title('Countries with over 100 medals');
In [104]:
# Male weightlifters only. `.copy()` gives an independent frame, so columns added
# to `weightlifting` later do not touch `athletes`.
is_male = athletes['sex'] == 'male'
is_weightlifter = athletes['sport'] == 'weightlifting'
weightlifting = athletes.loc[is_male & is_weightlifter].copy()
In [88]:
# Scatter of weight against height for the weightlifters (regression line disabled).
# x, y and data are passed by keyword, and the figure size uses `height=`, which
# replaced the old `size=` parameter in seaborn.
# lmplot returns a FacetGrid rather than an Axes, hence the name `grid`.
grid = sns.lmplot(x='weight', y='height', data=weightlifting, fit_reg=False, height=10)
# Allow up to 20 major ticks on the x axis so weight clusters are easier to read off.
grid.ax.xaxis.set_major_locator(plt.MaxNLocator(20));
In [98]:
def weightlifting_class(df, upper_bounds=(57, 64, 72, 80, 88, 96, 112)):
    """Map an athlete row to a numeric weight class based on ``df.weight``.

    Parameters
    ----------
    df : row-like object with a numeric ``weight`` attribute, e.g. a row
        passed in by ``DataFrame.apply(..., axis=1)``.
    upper_bounds : ascending sequence of exclusive upper weight limits.
        Class ``i`` (1-based) holds weights below ``upper_bounds[i - 1]``;
        a weight at or above the last bound gets class ``len(upper_bounds) + 1``.

    Returns
    -------
    The class number as an int, or NaN when the weight is missing. Previously a
    missing weight failed every ``<`` comparison and was put in the heaviest
    class. Note that NaN results make the resulting column float-typed.
    """
    from bisect import bisect_right

    weight = df.weight
    if pd.isna(weight):
        return float('nan')
    # bisect_right counts the bounds that are <= weight, which is exactly the
    # number of `weight < bound` tests that fail in an if/elif ladder.
    return bisect_right(upper_bounds, weight) + 1
In [105]:
# Compute the weight class row by row and store it in a new 'class' column.
weightlifting['class'] = weightlifting.apply(weightlifting_class, axis='columns')
In [106]:
# Display the weightlifting frame, now including the 'class' column.
weightlifting
Out[106]:
In [122]:
# Overlaid height histograms for male and female athletes on one shared axes.
# `histplot` replaces the deprecated `distplot`; it draws plain counts with no
# KDE curve by default, matching the old `kde=False`.
fig, ax = plt.subplots(figsize=(16, 10))
for sex in ('male', 'female'):
    # Missing heights are dropped before plotting.
    heights = athletes.loc[athletes['sex'] == sex, 'height'].dropna()
    sns.histplot(heights, label=sex, ax=ax)
ax.set_xlabel('height')
ax.set_title('Height distribution')
ax.legend();
In [128]:
# Box plot comparing the height distribution of the two sexes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=athletes, x='sex', y='height', ax=ax)
ax.set_title("Athletes' height by sex");
In [ ]: