In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
%matplotlib inline

In [2]:
# Load the height/weight/gender table from the project-relative data folder.
# The shape is printed and the column index is left as the cell's last
# expression so both show up in the cell output.
csv_path = 'data/heights_weights_genders.csv'
df = pd.read_csv(csv_path)
print(df.shape)
df.columns


(10000, 3)
Out[2]:
Index(['Gender', 'Height', 'Weight'], dtype='object')

In [3]:
# Scatter of Weight against Height, one marker layer per Gender value.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(8,4))

# ax.plot with linestyle='' draws markers only, i.e. a scatter plot whose
# layers each carry their own legend label.
for category, group in df.groupby('Gender'):
    ax.plot(
        group['Height'],
        group['Weight'],
        marker='o',
        linestyle='',
        label=category,
        markersize=10,
        markeredgewidth=0,
        alpha=.5,
    )

# Axis labels use the column names so the figure can be read on its own.
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
ax.set_title('Weight vs. Height by Gender')

# Build the legend once, after every group has been drawn, instead of
# rebuilding it on each loop iteration. The trailing semicolon keeps the
# Legend repr out of the cell output.
ax.legend(loc='upper left');



In [4]:
# Preview the first five rows of the loaded frame.
df.head(5)


Out[4]:
Gender Height Weight
0 Male 73.847017 241.893563
1 Male 68.781904 162.310473
2 Male 74.110105 212.740856
3 Male 71.730978 220.042470
4 Male 69.881796 206.349801

In [5]:
# Correlation between Height and Weight for each gender and for the pooled data.
#
# The coefficient is computed column-to-column with Series.corr rather than via
# DataFrame.corr(): the frame still contains the string-valued Gender column,
# and DataFrame.corr() only works on it when pandas silently drops non-numeric
# columns, which newer pandas versions no longer do by default.
subsets = [
    ("Male", df[df['Gender'] == 'Male']),
    ("Female", df[df['Gender'] == 'Female']),
    ("Whole dataset", df),
]
for label, subset in subsets:
    print(label + " correlation:", subset['Height'].corr(subset['Weight']))

# NOTE(review): in the recorded output all three coefficients are positive; the
# pooled value is merely larger than the per-gender ones. Simpson's paradox
# usually refers to a trend that reverses when groups are combined, so this
# label looks questionable - confirm the intended interpretation.
print("Simpson's paradox!?")


Male correlation: 0.862978848616
Female correlation: 0.849608591419
Whole dataset correlation: 0.924756298741
Simpson's paradox!?