Exercises


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# From solutions this is useful:
# from pylab import rcParams
# rcParams['figure.figsize'] = 10, 5

In [2]:
athletes = pd.read_csv('data/athletes.csv')
countries = pd.read_csv('data/countries.csv')

In [3]:
print(athletes.shape)
print(countries.shape)


(11538, 11)
(201, 2)

In [5]:
countries.head()


Out[5]:
country code
0 Afghanistan AFG
1 Albania ALB
2 Algeria ALG
3 American Samoa* ASA
4 Andorra AND

In [8]:
athletes.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11538 entries, 0 to 11537
Data columns (total 11 columns):
id             11538 non-null int64
name           11538 non-null object
nationality    11538 non-null object
sex            11538 non-null object
dob            11537 non-null object
height         11208 non-null float64
weight         10879 non-null float64
sport          11538 non-null object
gold           11538 non-null int64
silver         11538 non-null int64
bronze         11538 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 991.6+ KB

In [4]:
athletes.head()


Out[4]:
id name nationality sex dob height weight sport gold silver bronze
0 736041664 A Jesus Garcia ESP male 10/17/69 1.72 64.0 athletics 0 0 0
1 532037425 A Lam Shin KOR female 9/23/86 1.68 56.0 fencing 0 0 0
2 435962603 Aaron Brown CAN male 5/27/92 1.98 79.0 athletics 0 0 1
3 521041435 Aaron Cook MDA male 1/2/91 1.83 80.0 taekwondo 0 0 0
4 33922579 Aaron Gate NZL male 11/26/90 1.81 71.0 cycling 0 0 0

1. What was the average age in male and female athletes?


In [12]:
athletes.dob = pd.to_datetime(athletes.dob)

In [25]:
# age done roughly, we should also check day and month but that's ok for my purposes:
athletes['age'] = 2016 - athletes.dob.dt.year

In [27]:
athletes.groupby('sex').age.mean()


Out[27]:
sex
female    25.908549
male      26.540114
Name: age, dtype: float64

2. What are the most common Dates of Birth?

To clarify - day, month, year


In [31]:
athletes.dob.value_counts()[:20]


Out[31]:
1988-03-05    9
1990-12-20    9
1993-02-18    9
1988-04-03    8
1991-06-19    8
1989-12-14    8
1993-07-30    8
1990-05-02    8
1993-03-03    8
1989-03-01    8
1988-04-29    8
1989-06-09    8
1990-03-14    7
1994-10-25    7
1994-08-19    7
1991-08-18    7
1991-02-08    7
1991-04-16    7
1990-07-11    7
1993-03-29    7
Name: dob, dtype: int64

3. How about the most common birthdays?

To clarify - day, month


In [36]:
athletes['birthday'] = athletes.dob.apply(lambda x: str(x.day) + '/' + str(x.month))

In [38]:
athletes.birthday.value_counts()[:20]


Out[38]:
1/1     58
5/2     51
10/2    48
20/1    47
19/9    47
10/1    47
5/4     46
5/3     45
29/4    44
30/4    44
13/1    44
8/3     44
14/2    43
2/5     43
30/9    43
12/7    43
7/1     42
15/5    42
11/3    42
26/5    42
Name: birthday, dtype: int64

4. What are the Countries with more than 100 medals?


In [39]:
athletes['total_medals'] = athletes.gold + athletes.silver + athletes.bronze

In [46]:
countries = countries.merge(athletes.groupby('nationality')['gold', 'silver', 'bronze', 'total_medals'].sum(),
                            how='left',
                            left_on='code',
                            right_index=True)

In [51]:
countries_100_medals = countries[countries.total_medals > 100].sort_values(by='total_medals', ascending=False)

In [52]:
countries_100_medals


Out[52]:
country code gold silver bronze total_medals
191 United States USA 139.0 54.0 71.0 264.0
70 Germany GER 49.0 44.0 67.0 160.0
190 United Kingdom GBR 64.0 55.0 26.0 145.0
150 Russia RUS 52.0 29.0 34.0 115.0
40 China CHN 46.0 30.0 37.0 113.0

5. Create a bar or pie chart for the results of the previous exercise.


In [80]:
fig = plt.figure(figsize=(16, 6))
ax = sns.barplot(countries_100_medals.country, countries_100_medals.total_medals)
ax.set_ylabel('total medals')
ax.set_title('Countries with over 100 medals');


6. Male weightlifting competitions are divided into 8 weight classes. Can you estimate these weight classes by looking at the data? Hint: Create a scatter plot with Body weight on the x-axis and choose height as y.


In [104]:
weightlifting = athletes[(athletes.sex == 'male') & (athletes.sport == 'weightlifting')].copy()

In [88]:
ax = sns.lmplot('weight', 'height', weightlifting, fit_reg=False, size = 10)
ax.ax.xaxis.set_major_locator(plt.MaxNLocator(20));



In [98]:
def weightlifting_class(df):
    if df.weight < 57:
        cl = 1
    elif df.weight < 64:
        cl = 2
    elif df.weight < 72:
        cl = 3
    elif df.weight < 80:
        cl = 4
    elif df.weight < 88:
        cl = 5
    elif df.weight < 96:
        cl = 6
    elif df.weight < 112:
        cl = 7
    else:
        cl = 8
    return cl

In [105]:
weightlifting['class'] = weightlifting.apply(weightlifting_class, axis=1)

In [106]:
weightlifting


Out[106]:
id name nationality sex dob height weight sport gold silver bronze age birthday total_medals class
120 242818896 Adrian Edward Zielinski POL male 1989-03-28 1.70 94.0 weightlifting 0 0 0 27.0 28/3 0 6
186 162169475 Ahmed Mohamed EGY male 1988-04-27 1.85 144.0 weightlifting 0 0 0 28.0 27/4 0 8
189 101574216 Ahmed Saad EGY male 1986-11-01 1.60 62.0 weightlifting 0 0 0 30.0 1/11 0 2
403 745034260 Alexandr Spac MDA male 1989-11-21 1.64 77.0 weightlifting 0 0 0 27.0 21/11 0 4
405 928138606 Alexandr Zaichikov KAZ male 1992-08-17 1.80 105.0 weightlifting 0 0 1 24.0 17/8 1 7
437 434694198 Alexej Prochorow GER male 1990-03-30 1.91 138.0 weightlifting 0 0 0 26.0 30/3 0 8
461 800154411 Ali Hashemi IRI male 1991-11-01 1.78 93.0 weightlifting 0 0 0 25.0 1/11 0 6
476 502227727 Aliaksandr Bersanau BLR male 1992-09-01 1.76 94.0 weightlifting 0 0 0 24.0 1/9 0 6
481 185532375 Aliaksei Mzhachyk BLR male 1996-06-30 1.90 136.0 weightlifting 0 0 0 20.0 30/6 0 8
546 949130459 Almir Velagic GER male 1981-08-22 1.83 149.0 weightlifting 0 0 0 35.0 22/8 0 8
598 555654573 Amar Music CRO male 1987-03-21 1.70 85.0 weightlifting 0 0 0 29.0 21/3 0 5
733 995131091 Andranik Karapetyan ARM male 1995-12-15 1.80 77.0 weightlifting 0 0 0 21.0 15/12 0 4
809 669378368 Andres Eduardo Mata Perez ESP male 1992-11-11 1.74 76.0 weightlifting 0 0 0 24.0 11/11 0 4
810 349842092 Andres Mauricio Caicedo Piedrahita COL male 1997-08-15 1.74 76.0 weightlifting 0 0 0 19.0 15/8 0 4
1050 75967084 Anton Sudesh Peiris Kurukulasooriyage SRI male 1985-02-03 1.65 62.0 weightlifting 0 0 0 31.0 3/2 0 2
1067 268280424 Antonis Martasidis CYP male 1992-06-14 1.78 85.0 weightlifting 0 0 0 24.0 14/6 0 5
1086 450196345 Arakel Mirzoyan ARM male 1989-10-21 1.70 84.0 weightlifting 0 0 0 27.0 21/10 0 5
1118 388943 Arkadiusz Michalski POL male 1990-01-07 1.80 105.0 weightlifting 0 0 0 26.0 7/1 0 7
1123 467565690 Arli Chontey KAZ male 1992-07-01 1.50 56.0 weightlifting 0 0 0 24.0 1/7 0 1
1168 512734462 Arturs Plesnieks LAT male 1992-01-21 1.80 104.0 weightlifting 0 0 0 24.0 21/1 0 7
1256 525803886 Aurimas Didzbalis LTU male 1991-06-13 1.72 92.0 weightlifting 0 0 1 25.0 13/6 1 6
1342 999333930 Bartlomiej Wojciech Bonk POL male 1984-10-11 1.81 104.0 weightlifting 0 0 0 32.0 11/10 0 7
1387 57339699 Behdad Salimikordasiabi IRI male 1989-12-08 1.92 170.0 weightlifting 0 0 0 27.0 8/12 0 8
1420 301130953 Benjamin Didier Hennequin FRA male 1984-08-24 1.74 84.0 weightlifting 0 0 0 32.0 24/8 0 5
1449 78415215 Bernardin Ledoux Kingue Matam FRA male 1990-05-20 1.63 69.0 weightlifting 0 0 0 26.0 20/5 0 3
1597 630721644 Bredni Roque Mendoza MEX male 1987-11-11 1.60 69.0 weightlifting 0 0 0 29.0 11/11 0 3
1630 345441615 Briken Calja ALB male 1990-02-19 1.70 69.0 weightlifting 0 0 0 26.0 19/2 0 3
1883 388718541 Chagnaadorj Usukhbayar MGL male 1997-05-06 1.55 56.0 weightlifting 0 0 0 19.0 6/5 0 1
1948 851008982 Chatuphum Chinnawong THA male 1993-07-19 1.67 77.0 weightlifting 0 0 0 23.0 19/7 0 4
1964 717003685 Chi-Chung Tan TPE male 1990-02-24 1.55 56.0 weightlifting 0 0 0 26.0 24/2 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9566 857233175 Sathish Kumar Sivalingam IND male 1992-06-23 1.75 77.0 weightlifting 0 0 0 24.0 23/6 0 4
9679 890081492 Serghei Cechir MDA male 1990-10-15 1.72 69.0 weightlifting 0 0 0 26.0 15/10 0 3
9795 316407203 Shih-Chieh Chen TPE male 1989-11-27 1.90 152.0 weightlifting 0 0 0 27.0 27/11 0 8
9877 618197194 Simon Martirosyan ARM male 1997-02-17 1.81 105.0 weightlifting 0 1 0 19.0 17/2 1 7
9908 520685692 Simplice Ribouem AUS male 1982-12-05 1.73 93.0 weightlifting 0 0 0 34.0 5/12 0 6
9912 303411588 Sinphet Kruaithong THA male 1995-08-22 1.59 55.0 weightlifting 0 0 1 21.0 22/8 1 1
9956 476545490 Sohrab Moradi IRI male 1988-09-22 1.70 94.0 weightlifting 1 0 0 28.0 22/9 1 6
9982 698806625 Sonny Webster GBR male 1994-03-10 1.78 94.0 weightlifting 0 0 0 22.0 10/3 0 6
10232 605417556 Tairat Bunsuk THA male 1993-01-11 1.61 69.0 weightlifting 0 0 0 23.0 11/1 0 3
10278 14783411 Tan Tai Hoang VIE male 1990-03-30 1.65 84.0 weightlifting 0 0 0 26.0 30/3 0 5
10290 807845971 Tanumafili Malietoa Jungblut ASA male 1990-06-10 1.88 93.0 weightlifting 0 0 0 26.0 10/6 0 6
10295 725478923 Tao Tian CHN male 1994-04-08 1.72 85.0 weightlifting 0 1 0 22.0 8/4 1 5
10404 470687922 Theodoros Iakovidis GRE male 1991-02-12 1.81 85.0 weightlifting 0 0 0 25.0 12/2 0 5
10571 169080993 Tom Richard Goegebuer BEL male 1975-03-27 1.64 56.0 weightlifting 0 0 0 41.0 27/3 0 1
10630 579249475 Triyatno INA male 1987-12-20 1.61 69.0 weightlifting 0 0 0 29.0 20/12 0 3
10708 446977875 Vadzim Straltsou BLR male 1986-04-30 1.70 94.0 weightlifting 0 1 0 30.0 30/4 1 6
10713 180683514 Vaipava Nevo Ioane SAM male 1988-04-14 1.52 62.0 weightlifting 0 0 0 28.0 14/4 0 2
10935 109518008 Volodymyr Hoza UKR male 1996-04-15 1.83 94.0 weightlifting 0 0 0 20.0 15/4 0 6
10945 929418601 Walid Bidani ALG male 1994-06-11 1.85 123.0 weightlifting 0 0 0 22.0 11/6 0 8
10972 114920742 Welisson Rosa da Silva BRA male 1983-11-22 1.60 85.0 weightlifting 0 0 0 33.0 22/11 0 5
11035 618949367 Witoon Mingmoon THA male 1996-02-10 1.57 56.0 weightlifting 0 0 0 20.0 10/2 0 1
11070 780086362 Xiaojun Lyu CHN male 1984-07-27 1.72 77.0 weightlifting 0 1 0 32.0 27/7 1 4
11262 652136434 Yoelmis Hernandez Paumier CUB male 1986-04-25 1.68 84.0 weightlifting 0 0 0 30.0 25/4 0 5
11266 266326638 Yoichi Itokazu JPN male 1991-05-24 1.60 62.0 weightlifting 0 0 0 25.0 24/5 0 2
11276 608871682 Yong Gwang Kwon PRK male 1996-01-14 1.65 68.0 weightlifting 0 0 0 20.0 14/1 0 3
11300 326797163 Yosuke Nakayama JPN male 1987-03-20 1.61 62.0 weightlifting 0 0 0 29.0 20/3 0 2
11389 109766656 Yun Chol Om PRK male 1991-11-18 1.51 56.0 weightlifting 0 1 0 25.0 18/11 1 1
11473 548518238 Zhe Yang CHN male 1991-07-14 1.87 105.0 weightlifting 0 0 0 25.0 14/7 0 7
11487 334126873 Zhiyong Shi CHN male 1993-10-10 1.68 69.0 weightlifting 1 0 0 23.0 10/10 1 3
11536 900065925 le Quoc Toan Tran VIE male 1989-04-05 1.60 56.0 weightlifting 0 0 0 27.0 5/4 0 1

154 rows × 15 columns

7. Generate a histogram of male and female height distribution among all participants.


In [122]:
fig = plt.figure(figsize=(16, 10))
ax = sns.distplot(athletes[athletes.sex == 'male'].height.dropna(), kde=False, label='male')
ax = sns.distplot(athletes[athletes.sex == 'female'].height.dropna(), kde=False, label='female')
ax.set_xlabel('height')
ax.set_title('Height distribution')
ax.legend();


8. Using the Seaborn package create a box plot for male and female height distribution among all participants.


In [128]:
fig = plt.figure(figsize=(10, 10))
ax = sns.boxplot(x='sex', y='height', data=athletes)
ax.set_title('Athletes\' height by sex');


9. Optional: What else would you try?


In [ ]: