In [9]:
import numpy as np
import pandas as pd
from scipy.stats import norm
In [10]:
!ls ../../data/LLCP2018.XPT
In [11]:
filename = '../../data/LLCP2018.XPT'
df = pd.read_sas(filename)
df.head()
Out[11]:
In [12]:
df['SEX1'].value_counts()
Out[12]:
In [13]:
male = df['SEX1'] == 1
male.sum()
Out[13]:
In [14]:
female = df['SEX1'] == 2
female.sum()
Out[14]:
In [15]:
df['HTM4'].describe()
Out[15]:
In [37]:
height_male = df.loc[male, 'HTM4']
height_male += np.random.normal(0, 2, male.sum())
height_male.mean(), height_male.std()
Out[37]:
In [43]:
height_male.std() / height_male.mean()
Out[43]:
In [41]:
height_female = df.loc[female, 'HTM4']
height_female += np.random.normal(0, 2, female.sum())
height_female.mean(), height_female.std()
Out[41]:
In [44]:
height_female.std() / height_female.mean()
Out[44]:
In [39]:
def estimate_std(series, num_sigmas):
ps = norm.cdf([-num_sigmas/2, num_sigmas/2])
ipr = series.quantile(ps)
std = np.diff(ipr) / num_sigmas
return std
In [45]:
index = np.linspace(1, 6, 51)
std_series_male = pd.Series(1.0, index=index)
for num_sigmas in index:
std_series_male[num_sigmas] = estimate_std(height_male, num_sigmas)
std_series_male.plot()
Out[45]:
In [42]:
index = np.linspace(1, 5, 51)
std_series_female = pd.Series(1.0, index=index)
for num_sigmas in index:
std_series_female[num_sigmas] = estimate_std(height_female, num_sigmas)
std_series_female.plot()
Out[42]:
In [20]:
ipr_female = height_female.quantile(ps)
ipr_female
Out[20]:
In [21]:
np.diff(ipr_male) / 3
Out[21]:
In [22]:
np.diff(ipr_female) / 3
Out[22]:
In [23]:
from empiricaldist import Cdf
cdf_male = Cdf.from_seq(height_male)
cdf_male.plot()
cdf_female = Cdf.from_seq(height_female)
cdf_female.plot()
In [ ]: