notebook.community

Edit and run



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
%matplotlib inline



In [2]:

    
# Read the height/weight/gender table from a path relative to the notebook.
source_csv = 'data/heights_weights_genders.csv'
df = pd.read_csv(source_csv)

# Print the (rows, columns) tuple, then echo the column labels as the cell's output.
print(df.shape)
df.columns









    



(10000, 3)






    Out[2]:





Index(['Gender', 'Height', 'Weight'], dtype='object')



In [3]:

    
# Scatter of weight against height, one marker series per value of 'Gender'.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(8,4))
for category, group in df.groupby('Gender'):
    # linestyle='' makes ax.plot draw markers only; label feeds the legend below.
    ax.plot(group['Height'], group['Weight'], marker='o', linestyle='', label=category, markersize=10, markeredgewidth=0, alpha=.5)

# Label the figure so it stands alone when the notebook is skimmed.
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
ax.set_title('Weight vs. height by gender')

# Build the legend once, after every group has been plotted, instead of
# recreating it on each loop iteration. The trailing ';' suppresses the
# Legend repr that would otherwise be echoed as cell output.
ax.legend(loc='upper left');



In [4]:

    
# Preview the first five rows of the loaded table.
df.head(5)



In [5]:

    
# Height/weight correlation within each gender and over the pooled data.
#
# The two numeric columns are selected explicitly before calling .corr():
# the frame also holds the string-valued 'Gender' column, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0 (numeric_only defaults to False).
numeric_cols = ['Height', 'Weight']
males = df.loc[df['Gender'] == 'Male', numeric_cols]
females = df.loc[df['Gender'] == 'Female', numeric_cols]

print("Male correlation:", males.corr()['Height']['Weight'])
print("Female correlation:", females.corr()['Height']['Weight'])
print("Whole dataset correlation:", df[numeric_cols].corr()['Height']['Weight'])
# NOTE(review): in the recorded output all three coefficients are positive, so
# there is no sign reversal between the groups and the pooled data; the pooled
# value is simply larger. Confirm whether the question below is still wanted.
print("Simpson's paradox!?")









    



Male correlation: 0.862978848616
Female correlation: 0.849608591419
Whole dataset correlation: 0.924756298741
Simpson's paradox!?

	Gender	Height	Weight
0	Male	73.847017	241.893563
1	Male	68.781904	162.310473
2	Male	74.110105	212.740856
3	Male	71.730978	220.042470
4	Male	69.881796	206.349801