Using `heights_weights_genders.csv`, analyze how the height–weight correlation differs between women and men.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the full height/weight/gender table into a single DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook (or a DATA_DIR constant) so others can re-run it.
df = pd.read_csv(
    filepath_or_buffer='/home/sean/git/algorithms/class4/homework/data/heights_weights_genders.csv'
)

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for each gender.
df.groupby('Gender').describe()


Out[5]:
Height Weight
Gender
Female count 5000.000000 5000.000000
mean 63.708774 135.860093
std 2.696284 19.022468
min 54.263133 64.700127
25% 61.894441 122.934096
50% 63.730924 136.117583
75% 65.563565 148.810926
max 73.389586 202.237214
Male count 5000.000000 5000.000000
mean 69.026346 187.020621
std 2.863362 19.781155
min 58.406905 112.902939
25% 67.174679 173.887767
50% 69.027709 187.033546
75% 70.988744 200.357802
max 78.998742 269.989699

In [6]:
# Rows of `df` whose Gender is exactly 'Male'.
male_df = df[df['Gender'].eq('Male')]

In [7]:
# Rows of `df` whose Gender is exactly 'Female'.
female_df = df[df['Gender'].eq('Female')]

In [30]:
# Removed leftover interactive help lookup (`ax.scatter?`): `ax` is not created
# until the plotting cell below, so the lookup fails with NameError when the
# notebook is run top to bottom on a fresh kernel. See the matplotlib
# `Axes.scatter` documentation for the available keyword arguments.

In [45]:
# Scatter plot of weight against height for each gender, overlaid with a
# straight-line least-squares fit per gender.
fig, ax = plt.subplots()

# With deg=1, np.polyfit returns the coefficients highest power first:
# index 0 is the slope and index 1 is the intercept.
fitm = np.polyfit(male_df['Height'], male_df['Weight'], deg=1)
fitf = np.polyfit(female_df['Height'], female_df['Weight'], deg=1)

# Each regression line must use the slope AND intercept of its own fit
# (previously both lines referenced an undefined `fit[0]`).
ax.plot(female_df['Height'], fitf[0] * female_df['Height'] + fitf[1],
        color='magenta', alpha=0.5, label='Female fit')
ax.scatter(female_df['Height'], female_df['Weight'],
           alpha=0.1, c='pink', linewidths=0, label='Female')
ax.plot(male_df['Height'], fitm[0] * male_df['Height'] + fitm[1],
        color='cyan', alpha=0.5, label='Male fit')
ax.scatter(male_df['Height'], male_df['Weight'],
           alpha=0.1, c='blue', linewidths=0, label='Male')

# Make the figure readable on its own when the notebook is skimmed.
ax.set(xlabel='Height', ylabel='Weight', title='Weight vs. height by gender')
ax.legend();  # trailing ';' suppresses the object repr in the cell output


Out[45]:
<matplotlib.collections.PathCollection at 0x7f110ed08e80>

In [36]:
# Pearson correlation between height and weight over the whole data set.
# The numeric columns are selected explicitly because `df` also holds the
# string-valued Gender column, which DataFrame.corr() rejects in pandas >= 2.0.
df[['Height', 'Weight']].corr()


Out[36]:
Height Weight
Height 1.000000 0.924756
Weight 0.924756 1.000000

In [37]:
# Pearson correlation between height and weight for the male subset.
# The numeric columns are selected explicitly because `male_df` still carries
# the string-valued Gender column, which DataFrame.corr() rejects in pandas >= 2.0.
male_df[['Height', 'Weight']].corr()


Out[37]:
Height Weight
Height 1.000000 0.862979
Weight 0.862979 1.000000

In [38]:
# Pearson correlation between height and weight for the female subset.
# The numeric columns are selected explicitly because `female_df` still carries
# the string-valued Gender column, which DataFrame.corr() rejects in pandas >= 2.0.
female_df[['Height', 'Weight']].corr()


Out[38]:
Height Weight
Height 1.000000 0.849609
Weight 0.849609 1.000000

In [ ]: