In [16]:
%matplotlib inline
import nsfg
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import thinkplot
import thinkstats2
from math import sqrt
In [3]:
# Load the pregnancy records via the course's nsfg helper module
# (presumably the NSFG female-pregnancy file -- confirm against nsfg.py).
df = nsfg.ReadFemPreg()
# Show summary statistics for the loaded columns as the cell output.
df.describe()
Out[3]:
In [30]:
# Keep only the rows with outcome == 1 (presumably live births, per the
# variable name -- confirm against the NSFG codebook).
live = df[df.outcome == 1]
# Histogram of pregnancy length, one panel per group (first birth vs. not).
# Reuse `live` for both the data and the grouping mask so the two share the
# same index instead of re-filtering `df` and relying on implicit alignment.
live.prglngth.hist(by=(live.birthord == 1))
Out[30]:
In [60]:
# Overlay pregnancy-length histograms for first births and all other births.
bins = xrange(26, 45)
shared_style = dict(alpha=0.5, rwidth=0.45)
first_weeks = live[live.birthord == 1].prglngth.values
other_weeks = live[live.birthord != 1].prglngth.values
# Opposite alignments place the two series next to each other in each bin.
plt.hist(first_weeks, bins, label='first', align='left', **shared_style)
plt.hist(other_weeks, bins, label='other', align='right', **shared_style)
plt.legend(loc='upper right')
plt.show()
In [41]:
# Pregnancy lengths for every record whose outcome code is not 1.
df.loc[df.outcome != 1, 'prglngth'].values
Out[41]:
In [65]:
first_hist = thinkstats2.Hist(live[live.birthord == 1].prglngth)
other_hist = thinkstats2.Hist(live[live.birthord != 1].prglngth)
print live[live.birthord == 1].prglngth.value_counts(), first_hist
In [67]:
# Side-by-side bar chart of the two pregnancy-length histograms.
width = 0.45
thinkplot.PrePlot(2)
thinkplot.Hist(first_hist, align='right', width=width)
thinkplot.Hist(other_hist, align='left', width=width)
# NOTE(review): thinkplot.Show/Config only acts on a fixed set of axis options
# (xlabel, ylabel, xlim, ...); `bins` is not one of them and was ignored.
# The intended 26-45 week window is therefore applied through `xlim`.
thinkplot.Show(xlabel='Weeks', ylabel='frequency', xlim=[26, 45])
In [74]:
first = live[live.birthord == 1]
other = live[live.birthord != 1]
print live.prglngth.mean(), live.prglngth.var(), live.prglngth.std()
print first.prglngth.mean(), first.prglngth.var(), first.prglngth.std()
print other.prglngth.mean(), other.prglngth.var(), other.prglngth.std()
In [84]:
first_hist = thinkstats2.Hist(first.totalwgt_lb.round(), label='first')
other_hist = thinkstats2.Hist(other.totalwgt_lb.round(), label='other')
width = 0.45
thinkplot.PrePlot(2)
thinkplot.Hist(first_hist, align='right', width=width)
thinkplot.Hist(other_hist, align='left', width=width)
thinkplot.Show(xlabel='Pounds', ylabel='frequency')
print first.totalwgt_lb.mean(), other.totalwgt_lb.mean()
print first.totalwgt_lb.std(), other.totalwgt_lb.std()
In [86]:
def cohen_d(group1, group2):
    """Return Cohen's d effect size between two groups.

    The difference in means is divided by a pooled standard deviation,
    where the pooled variance is the size-weighted average of the two
    group variances.

    Args:
        group1: object supporting len(), .mean() and .var()
            (e.g. a pandas Series).
        group2: same kind of object as group1.

    Returns:
        (group1.mean() - group2.mean()) / pooled standard deviation.
    """
    diff = group1.mean() - group2.mean()
    n1, n2 = len(group1), len(group2)
    var1, var2 = group1.var(), group2.var()
    # float() forces true division even under Python 2 with integer inputs.
    pooled_var = (n1 * var1 + n2 * var2) / float(n1 + n2)
    return diff / sqrt(pooled_var)
In [89]:
# Effect size of pregnancy length: other births relative to first births.
cohen_d(group1=other.prglngth, group2=first.prglngth)
Out[89]:
In [4]:
# Summary statistics of the full frame again (same call as the load cell).
df.describe()
Out[4]:
In [8]:
# Drop rows missing either variable so the correlation cells below are not
# fed NaNs.
df2 = df.dropna(subset=['agepreg', 'totalwgt_lb'])
# Take the columns from the filtered frame; reading them from `df` would
# discard the dropna above.
ages, weights = df2.agepreg, df2.totalwgt_lb
In [17]:
# Hexbin plot of weights against the natural log of ages.
thinkplot.HexBin(np.log(ages), weights)
# The x axis shows log-transformed ages, so label it as such.
thinkplot.Show(xlabel='log(Ages)', ylabel='Weights')
In [19]:
# thinkstats2.Corr between log-transformed ages and weights.
log_ages = np.log(ages)
thinkstats2.Corr(log_ages, weights)
Out[19]:
In [18]:
# thinkstats2.SpearmanCorr between log-transformed ages and weights.
log_ages = np.log(ages)
thinkstats2.SpearmanCorr(log_ages, weights)
Out[18]:
In [ ]: