In [1]:
from numpy import *
In [2]:
a = array([1,2,3,4])
In [14]:
b = transpose(vstack([a, a]))
b
Out[14]:
In [15]:
reshape(b, (1,8))
Out[15]:
In [32]:
def interleave(a, b):
    """Return a 1-D array that alternates the elements of `a` and `b`.

    Elements of `a` fill the even positions (0, 2, 4, ...) of the result
    and elements of `b` fill the odd positions (1, 3, 5, ...).

    a, b: 1-D array-likes (lists or NumPy arrays)

    returns: NumPy array of length len(a) + len(b) whose dtype is the
        common dtype of the two inputs, so integer inputs stay integers.

    raises: ValueError (from the NumPy slice assignment) when an input
        cannot be broadcast into its alternating slots, e.g. when `b` is
        longer than `a`.
    """
    a = asarray(a)
    b = asarray(b)
    # Allocate with the inputs' common dtype; a bare empty() defaults to
    # float64 and would silently convert integer data.
    c = empty(a.shape[0] + b.shape[0], dtype=result_type(a, b))
    c[::2] = a
    c[1::2] = b
    return c
In [33]:
interleave(a, a)
Out[33]:
In [34]:
roll(a, 1)
Out[34]:
In [36]:
a[1:] - a[:-1]
Out[36]:
In [37]:
# Shift `a` one position to the right; roll() wraps the last element
# around to the front.
b = roll(a, 1)
# Overwrite the wrapped-around element with 0.
b[0] = 0
# Element i is a[i] - a[i-1] for i >= 1; element 0 is a[0] - 0.
a - b
Out[37]:
In [41]:
import brfss
import thinkstats2
import numpy as np
df = brfss.ReadBrfss(nrows=None)
In [139]:
# Keep only the rows where both htm3 and wtkg2 are present.
df = df.dropna(subset=['htm3', 'wtkg2'])
cdf = thinkstats2.Cdf(df.htm3)
# NOTE(review): presumably the fraction of htm3 values in (145, 200] --
# confirm against thinkstats2.Cdf.__getitem__.
# Single-argument print() gives the same output on Python 2 and 3.
print(cdf[200] - cdf[145])
In [140]:
# Bin edges 135, 140, ..., 205 (np.arange excludes the stop value 210).
bins = np.arange(135, 210, 5)
print(bins)
# For every htm3 value, the index of the bin it falls into.
indices = np.digitize(df.htm3, bins)
# Group the rows of df by that bin index.
groups = df.groupby(indices)
groups
Out[140]:
In [141]:
# For each bin: bin index, mean htm3 and mean wtkg2 of its rows.
for i, group in groups:
    # One pre-formatted string prints identically on Python 2 and 3.
    print('%s %s %s' % (i, group.htm3.mean(), group.wtkg2.mean()))
In [143]:
# For each bin: bin index and number of rows in it.
for i, group in groups:
    # One pre-formatted string prints identically on Python 2 and 3.
    print('%s %s' % (i, len(group)))
In [144]:
# One Cdf of wtkg2 per group; the [1:-1] slice discards the first and the
# last group.
# NOTE(review): presumably those are the out-of-range bins produced by
# np.digitize -- confirm against the group sizes.
cdfs = [thinkstats2.Cdf(group.wtkg2) for i, group in groups][1:-1]
print(len(cdfs))
In [145]:
import thinkplot

# x-values for the plot: mean htm3 of each group, sliced with [1:-1]
# exactly like `cdfs` so the two lists line up.  `heights` was not
# defined anywhere in the notebook, so this cell raised NameError on a
# fresh kernel.
heights = [group.htm3.mean() for i, group in groups][1:-1]

thinkplot.PrePlot(3)
for percent in [75, 50, 25]:
    # The given percentile of wtkg2 within each group.
    ys = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(heights, ys, label=label)
thinkplot.Show()
In [158]:
import first
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
In [159]:
live.agepreg
Out[159]:
In [160]:
live.totalwgt_lb
Out[160]:
In [161]:
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)
In [163]:
thinkstats2.Corr(live.agepreg, live.totalwgt_lb)
Out[163]:
In [164]:
thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)
Out[164]:
In [3]:
import first
import chap01ex_soln
import thinkstats2
import thinkplot
import pandas
In [4]:
# Unpack into `firsts`, not `first`: binding the result to `first` would
# replace the imported `first` module with a DataFrame, so re-running this
# cell would fail on first.MakeFrames.
live, firsts, others = first.MakeFrames()
In [5]:
resp = chap01ex_soln.ReadFemResp()
resp.index = resp.caseid
In [157]:
# Build one lookup table of variable metadata from both Stata dictionary
# files, indexed by variable name.
vars1 = thinkstats2.ReadStataDct('2002FemPreg.dct').variables
vars2 = thinkstats2.ReadStataDct('2002FemResp.dct').variables
# pandas.concat stacks the two tables just like DataFrame.append did;
# DataFrame.append itself was removed in pandas 2.0.
all_vars = pandas.concat([vars1, vars2])
all_vars.index = all_vars.name
all_vars.loc['birthwgt_lb'].desc
Out[157]:
In [163]:
all_vars.loc['race'].desc[0]
Out[163]:
In [106]:
len(live), len(live.columns)
Out[106]:
In [107]:
len(resp), len(resp.columns)
Out[107]:
In [382]:
import linear
# Keep only the rows with prglngth above 30.
# Note: `live` is rebound in place here, so re-running this cell filters
# and resamples the already-resampled frame.
live = live[live.prglngth>30]
# NOTE(review): presumably draws a weighted resample of the rows --
# confirm in linear.ResampleRowsWeighted.
live = linear.ResampleRowsWeighted(live)
# Attach the respondent columns to every row by matching live.caseid
# against the index of resp (set to resp.caseid earlier); column names
# that exist in both frames get the suffix '_r' on the resp side.
join = live.join(resp, on='caseid', rsuffix='_r')
In [216]:
# %timeit join = pandas.merge(live, resp, left_on='caseid', right_index=True, sort=False)
In [383]:
len(join), len(join.columns), 3087+244
Out[383]:
In [384]:
def QuickLeastSquares(xs, ys):
    """Fit the line ys ~ inter + slope * xs by ordinary least squares.

    xs, ys: equal-length 1-D sequences of numbers (lists, NumPy arrays or
        pandas Series); values are paired by position.

    returns: tuple (inter, slope, mse) -- the intercept, the slope, and
        the mean squared residual of the fitted line.

    raises: ValueError (from np.dot) if the inputs differ in length.

    If every x is identical the variance of xs is zero and the returned
    slope is not finite.
    """
    # Coerce to arrays so plain Python lists work as well as arrays/Series.
    xs = np.asarray(xs)
    ys = np.asarray(ys)

    # float() keeps the divisions below true divisions on Python 2.
    n = float(len(xs))

    meanx = xs.mean()
    dxs = xs - meanx
    varx = np.dot(dxs, dxs) / n

    meany = ys.mean()
    dys = ys - meany
    cov = np.dot(dxs, dys) / n

    slope = cov / varx
    inter = meany - slope * meanx

    # Residuals of the fitted line and their mean square.
    res = ys - (inter + slope * xs)
    mse = np.dot(res, res) / n
    return inter, slope, mse
In [385]:
join.screentime = pandas.to_datetime(join.screentime)
In [386]:
# statsmodels' formula API is needed by this cell.  Without this import a
# fresh top-to-bottom run reaches this point before `smf` exists, every
# iteration fails with NameError inside the try block, and `t` silently
# ends up empty.
import statsmodels.formula.api as smf

# Collects (R^2, column name, nobs) for each column that can be added as a
# second explanatory variable next to agepreg.
t = []
for name in join.columns:
    try:
        # Skip columns whose variance is (almost) zero.
        if join[name].var() < 1e-7:
            continue

        formula = 'totalwgt_lb ~ agepreg + ' + name
        model = smf.ols(formula, data=join)
        # Skip models that end up using fewer than half of the rows.
        if model.nobs < len(join)/2:
            continue

        results = model.fit()
        metric = results.rsquared
    except Exception:
        # Best effort: columns that cannot be modelled are skipped.
        # Catching Exception instead of a bare except lets
        # KeyboardInterrupt / SystemExit stop the scan.
        continue
    if not np.isnan(metric):
        t.append((metric, name, results.nobs))
In [387]:
len(t), len(join.columns)
Out[387]:
In [388]:
import numpy as np
import re

# Tuples compare on their first element, so this puts the largest R^2 first.
t.sort(reverse=True)
for rsquared, name, n in t[:30]:
    # Strip a trailing '_r' (the suffix the join adds to overlapping
    # respondent columns) to get the variable name to look up.
    key = re.sub('_[r]$', '', name)
    try:
        # Look up by `key`; the previous lookup by `name` left `key`
        # unused, so suffixed columns never found their description.
        desc = all_vars.loc[key].desc
        if isinstance(desc, pandas.Series):
            # Several rows share this index label; take the first one by
            # position.
            desc = desc.iloc[0]
        # One pre-formatted string prints identically on Python 2 and 3.
        print('%s %s %s %s' % (name, n, rsquared, desc))
    except KeyError:
        # No entry for this variable: print it without a description.
        print('%s %s %s' % (name, n, rsquared))
In [389]:
boys = live[live.babysex==1]
girls = live[live.babysex==2]
In [390]:
boys.totalwgt_lb.mean()
Out[390]:
In [391]:
boys.totalwgt_lb.mean() - girls.totalwgt_lb.mean()
Out[391]:
In [507]:
import statsmodels.formula.api as smf
formula = 'totalwgt_lb ~ agepreg + C(race) + babysex==1 + nbrnaliv>1 + paydu==1 + totincr'
results = smf.ols(formula, data=join).fit()
results.summary()
Out[507]:
In [319]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
formula = 'totalwgt_lb ~ prglngth>30 + babysex==1 + C(race_r) + nbrnaliv>1 + agepreg'
results = smf.ols(formula, data=join).fit()
results.summary()
Out[319]:
In [ ]:
In [472]:
join[join.agepreg<18].babysex.value_counts()
Out[472]:
In [494]:
# 0/1 indicator columns derived from the joined frame.
# babysex == 1 is the same condition used above to select `boys`.
join['boy'] = (join.babysex==1).astype(int)
join['isyoung'] = (join.agepreg<18).astype(int)
join['isold'] = (join.agepreg>35).astype(int)
join['intact'] = (join.intact18==1).astype(int)
# Difference between agefstsx and menarche.
join['wait'] = (join.agefstsx - join.menarche)
# Squared agepreg, used as a quadratic term in the models below.
join['agepreg2'] = join.agepreg**2
In [505]:
model = smf.logit('boy ~ agepreg + agepreg2 + agefstsx + wait + paydu==1 + prglngth', data=join)
results = model.fit()
results.summary()
Out[505]:
In [513]:
import first
live, firsts, others = first.MakeFrames()
live['boy'] = (live.babysex==1).astype(int)
live['agepreg2'] = live.agepreg**2
model = smf.logit('boy ~ agepreg + agepreg2', data=live)
results = model.fit()
results.summary()
Out[513]:
In [436]:
# NOTE(review): this cell originally used `join.male` as the outcome and
# listed an empty string '' as a third predictor.  No `male` column is
# created anywhere in this notebook (the 0/1 indicator built above is
# `boy`) and '' is not a column label, so the cell could not run.  The
# outcome now uses `boy` and the empty label is dropped -- confirm which
# third predictor, if any, was intended.
logit_mod = sm.Logit(join.boy, join[['totincr', 'agepreg']])
logit_res = logit_mod.fit()
logit_res.summary()
#dir(logit_res)
Out[436]:
In [ ]: