Using `heights_weights_genders.csv`, analyze how the correlation between height and weight differs between women and men.



In [1]:

    
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline



In [3]:

    
# Location of the dataset. Set the HEIGHTS_WEIGHTS_CSV environment variable to run the
# notebook on another machine; when it is unset, the original hard-coded path is used.
DATA_PATH = os.environ.get(
    'HEIGHTS_WEIGHTS_CSV',
    '/home/sean/git/algorithms/class4/homework/data/heights_weights_genders.csv',
)
df = pd.read_csv(DATA_PATH)



In [5]:

    
# Summary statistics (count, mean, std, quartiles, ...) of every numeric column, per gender.
df.groupby('Gender').describe()









    Out[5]:






  
    
      
      
      Height
      Weight
    
    
      Gender
      
      
      
    
  
  
    
      Female
      count
      5000.000000
      5000.000000
    
    
      mean
      63.708774
      135.860093
    
    
      std
      2.696284
      19.022468
    
    
      min
      54.263133
      64.700127
    
    
      25%
      61.894441
      122.934096
    
    
      50%
      63.730924
      136.117583
    
    
      75%
      65.563565
      148.810926
    
    
      max
      73.389586
      202.237214
    
    
      Male
      count
      5000.000000
      5000.000000
    
    
      mean
      69.026346
      187.020621
    
    
      std
      2.863362
      19.781155
    
    
      min
      58.406905
      112.902939
    
    
      25%
      67.174679
      173.887767
    
    
      50%
      69.027709
      187.033546
    
    
      75%
      70.988744
      200.357802
    
    
      max
      78.998742
      269.989699



In [6]:

    
# Keep only the rows whose Gender column equals 'Male'.
male_df = df.loc[df['Gender'].eq('Male')]



In [7]:

    
# Keep only the rows whose Gender column equals 'Female'.
female_df = df.loc[df['Gender'].eq('Female')]



In [30]:

    
ax.scatter?



In [45]:

    
# Scatter weight against height for each gender and overlay a separate
# least-squares line for each group.
fig, ax = plt.subplots()

# np.polyfit with deg=1 returns [slope, intercept] (highest power first).
fitm = np.polyfit(male_df['Height'], male_df['Weight'], deg=1)
fitf = np.polyfit(female_df['Height'], female_df['Weight'], deg=1)

# Each line must use the slope of its own fit. The original used an undefined
# name `fit` for both slopes, which only ran because of leftover kernel state.
ax.plot(female_df['Height'], fitf[0] * female_df['Height'] + fitf[1],
        color='magenta', alpha=0.5, label='Female fit')
ax.scatter(female_df['Height'], female_df['Weight'], alpha=0.1, c='pink', linewidths=0)
ax.plot(male_df['Height'], fitm[0] * male_df['Height'] + fitm[1],
        color='cyan', alpha=0.5, label='Male fit')

# Label the figure so it can be read on its own.
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
ax.set_title('Weight vs. height by gender, with linear fits')
ax.legend()

ax.scatter(male_df['Height'], male_df['Weight'], alpha=0.1, c='blue', linewidths=0)









    Out[45]:





<matplotlib.collections.PathCollection at 0x7f110ed08e80>



In [36]:

    
# Height/weight correlation over the whole dataset. Select the numeric columns
# explicitly: the frame also has the string column 'Gender', and pandas >= 2.0
# raises on non-numeric columns instead of silently dropping them.
df[['Height', 'Weight']].corr()



In [37]:

    
# Height/weight correlation among men. Select the numeric columns explicitly:
# the frame still carries the string column 'Gender', and pandas >= 2.0 raises
# on non-numeric columns instead of silently dropping them.
male_df[['Height', 'Weight']].corr()



In [38]:

    
# Height/weight correlation among women. Select the numeric columns explicitly:
# the frame still carries the string column 'Gender', and pandas >= 2.0 raises
# on non-numeric columns instead of silently dropping them.
female_df[['Height', 'Weight']].corr()



In [ ]:

		Height	Weight
Gender
Female	count	5000.000000	5000.000000
	mean	63.708774	135.860093
	std	2.696284	19.022468
	min	54.263133	64.700127
	25%	61.894441	122.934096
	50%	63.730924	136.117583
	75%	65.563565	148.810926
	max	73.389586	202.237214
Male	count	5000.000000	5000.000000
	mean	69.026346	187.020621
	std	2.863362	19.781155
	min	58.406905	112.902939
	25%	67.174679	173.887767
	50%	69.027709	187.033546
	75%	70.988744	200.357802
	max	78.998742	269.989699