In [1]:
from  numpy import *

In [2]:
a = array([1,2,3,4])

In [14]:
b = transpose(vstack([a, a]))
b


Out[14]:
array([[1, 1],
       [2, 2],
       [3, 3],
       [4, 4]])

In [15]:
reshape(b, (1,8))


Out[15]:
array([[1, 1, 2, 2, 3, 3, 4, 4]])

In [32]:
def interleave(a, b):
    c = empty(a.shape[0] + b.shape[0])
    print a.shape, b.shape, c.shape
    c[::2] = a
    c[1::2] = b
    return c

In [33]:
interleave(a, a)


(4,) (4,) (8,)
Out[33]:
array([ 1.,  1.,  2.,  2.,  3.,  3.,  4.,  4.])

In [34]:
roll(a, 1)


Out[34]:
array([4, 1, 2, 3])

In [36]:
a[1:] - a[:-1]


Out[36]:
array([1, 1, 1])

In [37]:
b = roll(a, 1)
b[0] = 0
a - b


Out[37]:
array([1, 1, 1, 1])

In [41]:
import brfss
import thinkstats2
import numpy as np
df = brfss.ReadBrfss(nrows=None)

In [139]:
df = df.dropna(subset=['htm3', 'wtkg2'])
cdf = thinkstats2.Cdf(df.htm3)
print cdf[200] - cdf[145]


0.994272822814

In [140]:
bins = np.arange(135, 210, 5)
print(bins)
indices = np.digitize(df.htm3, bins)
groups = df.groupby(indices)
groups


[135 140 145 150 155 160 165 170 175 180 185 190 195 200 205]
Out[140]:
<pandas.core.groupby.DataFrameGroupBy object at 0x496add0>

In [141]:
for i, group in groups:
    print i, group.htm3.mean(), group.wtkg2.mean()


0 120.124590164 62.885704918
1 135.785087719 63.0217105263
2 141.433962264 59.052851153
3 146.337187789 60.5686540241
4 151.528439682 64.5687589957
5 156.350341994 67.8319274054
6 161.674210452 71.5338478969
7 166.569727467 75.4599640966
8 171.409542325 80.0723382746
9 176.54908086 86.2047739197
10 181.575961428 92.1623611526
11 186.324699905 97.9511067391
12 191.785971223 104.072843011
13 196.642656162 107.373590321
14 201.738271605 108.304419753
15 210.58778626 99.0778625954

In [143]:
for i, group in groups:
    print i, len(group)


0 305
1 228
2 477
3 2162
4 18759
5 45761
6 70610
7 72138
8 61725
9 49938
10 43555
11 20077
12 7784
13 1777
14 405
15 131

In [144]:
cdfs = [thinkstats2.Cdf(group.wtkg2) for i, group in groups][1:-1]
print len(cdfs)


14

In [145]:
import thinkplot
thinkplot.PrePlot(3)
for percent in [75, 50, 25]:
    ys = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(heights, ys, label=label)
thinkplot.Show()


<matplotlib.figure.Figure at 0x47e5450>

In [158]:
import first
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])

In [159]:
live.agepreg


Out[159]:
0     33.16
1     39.25
2     14.33
3     17.83
4     18.33
5     27.00
6     28.83
7     30.16
8     28.08
9     32.33
10    25.75
11    23.00
12    24.58
15    28.33
16    30.33
...
13566    32.66
13569    18.00
13570    24.41
13571    27.83
13572    33.16
13573    22.25
13574    24.41
13576    31.66
13578    24.00
13579    25.91
13581    30.66
13584    26.91
13588    17.91
13591    21.58
13592    21.58
Name: agepreg, Length: 9038, dtype: float64

In [160]:
live.totalwgt_lb


Out[160]:
0     8.8125
1     7.8750
2     9.1250
3     7.0000
4     6.1875
5     8.5625
6     9.5625
7     8.3750
8     7.5625
9     6.6250
10    7.8125
11    7.0000
12    4.0000
15    7.6875
16    7.5000
...
13566    7.5000
13569    5.8125
13570    6.6875
13571    6.0000
13572    5.8125
13573    6.5625
13574    6.1250
13576    6.4375
13578    6.0000
13579    7.0000
13581    6.3750
13584    6.3750
13588    6.1875
13591    7.5000
13592    7.5000
Name: totalwgt_lb, Length: 9038, dtype: float64

In [161]:
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)



In [163]:
thinkstats2.Corr(live.agepreg, live.totalwgt_lb)


Out[163]:
0.068833970354109056

In [164]:
thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)


Out[164]:
0.09480324493300854

In [3]:
import first
import chap01ex_soln
import thinkstats2
import thinkplot
import pandas

In [4]:
live, first, others = first.MakeFrames()

In [5]:
resp = chap01ex_soln.ReadFemResp()
resp.index = resp.caseid

In [157]:
vars1 = thinkstats2.ReadStataDct('2002FemPreg.dct').variables
vars2 = thinkstats2.ReadStataDct('2002FemResp.dct').variables

all_vars = vars1.append(vars2)
all_vars.index = all_vars.name
all_vars.loc['birthwgt_lb'].desc


Out[157]:
'BD-3 BIRTHWEIGHT IN POUNDS - 1ST BABY FROM THIS PREGNANCY'

In [163]:
all_vars.loc['race'].desc[0]


Out[163]:
'RACE'

In [106]:
len(live), len(live.columns)


Out[106]:
(9148, 244)

In [107]:
len(resp), len(resp.columns)


Out[107]:
(7643, 3087)

In [382]:
import linear
live = live[live.prglngth>30]
live = linear.ResampleRowsWeighted(live)
join = live.join(resp, on='caseid', rsuffix='_r')

In [216]:
# %timeit join = pandas.merge(live, resp, left_on='caseid', right_index=True, sort=False)

In [383]:
len(join), len(join.columns), 3087+244


Out[383]:
(8884, 3331, 3331)

In [384]:
def QuickLeastSquares(xs, ys):
    n = float(len(xs))

    meanx = xs.mean()
    dxs = xs - meanx
    varx = np.dot(dxs, dxs) / n

    meany = ys.mean()
    dys = ys - meany

    cov = np.dot(dxs, dys) / n
    slope = cov / varx
    inter = meany - slope * meanx

    res = ys - (inter + slope * xs)
    mse = np.dot(res, res) / n
    return inter, slope, mse

In [385]:
join.screentime = pandas.to_datetime(join.screentime)

In [386]:
t = []
for name in join.columns:
    try:
        if join[name].var() < 1e-7:
            continue
            
        formula = 'totalwgt_lb ~ agepreg + ' + name
        model = smf.ols(formula, data=join)
        if model.nobs < len(join)/2:
            continue
            
        results = model.fit()
        metric = results.rsquared
    except:
        continue
        
    if not np.isnan(metric):
        t.append((metric, name, results.nobs))

In [387]:
len(t), len(join.columns)


Out[387]:
(882, 3331)

In [388]:
import numpy as np
import re

t.sort(reverse=True)
for mse, name, n in t[:30]:
    key = re.sub('_[r]$', '', name)
    try:
        desc = all_vars.loc[name].desc
        if isinstance(desc, pandas.Series):
            desc = desc[0]
        print name, n, mse, desc
    except KeyError:
        print name, n, mse


totalwgt_lb 8772.0 1.0
wantrp09_i 8772.0 0.97359596999 WANTRP09IMPUTATION FLAG
outcom09_i 8772.0 0.97359596999 OUTCOM09 IMPUTATION FLAG
oldwr09_i 8772.0 0.97359596999 OLDWR09 IMPUTATION FLAG
lbw1_i 8772.0 0.97359596999 LBW1 IMPUTATION FLAG
birthwgt_lb 8772.0 0.946178656244 BD-3 BIRTHWEIGHT IN POUNDS - 1ST BABY FROM THIS PREGNANCY
lbw1 8772.0 0.269930546161 LOW BIRTHWEIGHT - BABY 1
prglngth 8772.0 0.132829613743 DURATION OF COMPLETED PREGNANCY IN WEEKS
wksgest 8730.0 0.125332213206 GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN WEEKS)
agecon 8772.0 0.10161993152 AGE AT TIME OF CONCEPTION
mosgest 8772.0 0.0290869952502 GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN MONTHS)
babysex 8772.0 0.0258522351717 BD-2 SEX OF 1ST LIVEBORN BABY FROM THIS PREGNANCY
havedeg 4486.0 0.0177127753546 AF-10 WHETHER R HAS ANY COLLEGE OR UNIVERSITY DEGREES
fstmthp11 4471.0 0.0153211741586 EF-4 METHOD USED AT FIRST SEX WITH LAST PARTNER IN PAST 12 MONS -LAST MENTION
pubassis_r 8772.0 0.0148715291902
race_r 8772.0 0.0148256821308
race 8772.0 0.0148256821308 RACE
pubassis 8772.0 0.0147868712388 WHETHER R RECEIVED PUBLIC ASSISTANCE IN 2001
basewgt_r 8772.0 0.0146841660434
basewgt 8772.0 0.0146841660434 BASE WEIGHT
foodstmp 8772.0 0.0146783638487 JI-6 IN 2001 RECEIVED FOOD STAMPS
bfeedwks 7583.0 0.014421962084 DURATION OF BREASTFEEDING IN WEEKS
rmarout03 5788.0 0.013870056935 INFORMAL MARITAL STATUS WHEN PREGNANCY ENDED - 3RD
anynurse 7685.0 0.0138642482171 BH-1 WHETHER R BREASTFED THIS CHILD AT ALL - 1ST FROM THIS PREG
finalwgt_r 8772.0 0.0136692711511
finalwgt 8772.0 0.0136692711511 FINAL POST-STRATIFIED AND ADJUSTED WEIGHT
adj_mod_basewgt_r 8772.0 0.013391634905
adj_mod_basewgt 8772.0 0.013391634905 ADJUSTED MODIFIED BASE WEIGHT
bc12pay141 4494.0 0.0133708447233 FA-6 WAY BILL WAS PAID-ALL SERVICES IN 1 VISIT, 1ST METHOD
bc12plcx14 4494.0 0.013293130539 FA-5 WHERE R REC'D SERVICES OBTAINED IN 1 VISIT IN LAST 12 MONTHS

In [389]:
boys = live[live.babysex==1]
girls = live[live.babysex==2]

In [390]:
boys.totalwgt_lb.mean()


Out[390]:
7.5857812851044706

In [391]:
boys.totalwgt_lb.mean() - girls.totalwgt_lb.mean()


Out[391]:
0.33394432606721036

In [507]:
import statsmodels.formula.api as smf
formula = 'totalwgt_lb ~ agepreg + C(race) + babysex==1 + nbrnaliv>1 + paydu==1 + totincr'
results = smf.ols(formula, data=join).fit()
results.summary()


Out[507]:
OLS Regression Results
Dep. Variable: totalwgt_lb R-squared: 0.058
Model: OLS Adj. R-squared: 0.057
Method: Least Squares F-statistic: 76.70
Date: Wed, 25 Jun 2014 Prob (F-statistic): 2.14e-108
Time: 14:59:51 Log-Likelihood: -13977.
No. Observations: 8772 AIC: 2.797e+04
Df Residuals: 8764 BIC: 2.803e+04
Df Model: 7
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 6.5289 0.066 98.859 0.000 6.399 6.658
C(race)[T.2] 0.3613 0.036 10.120 0.000 0.291 0.431
C(race)[T.3] 0.2854 0.057 4.984 0.000 0.173 0.398
babysex == 1[T.True] 0.3384 0.025 13.297 0.000 0.289 0.388
nbrnaliv > 1[T.True] -1.1490 0.104 -11.090 0.000 -1.352 -0.946
paydu == 1[T.True] 0.1115 0.031 3.650 0.000 0.052 0.171
agepreg 0.0108 0.003 4.319 0.000 0.006 0.016
totincr 0.0110 0.004 2.790 0.005 0.003 0.019
Omnibus: 357.878 Durbin-Watson: 1.992
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1057.180
Skew: -0.127 Prob(JB): 2.73e-230
Kurtosis: 4.682 Cond. No. 225.

In [319]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
formula = 'totalwgt_lb ~ prglngth>30 + babysex==1 + C(race_r) + nbrnaliv>1 + agepreg'
results = smf.ols(formula, data=join).fit()
results.summary()


Out[319]:
OLS Regression Results
Dep. Variable: totalwgt_lb R-squared: 0.056
Model: OLS Adj. R-squared: 0.055
Method: Least Squares F-statistic: 103.2
Date: Tue, 24 Jun 2014 Prob (F-statistic): 4.10e-106
Time: 13:14:18 Log-Likelihood: -14316.
No. Observations: 8781 AIC: 2.864e+04
Df Residuals: 8775 BIC: 2.869e+04
Df Model: 5
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 6.6326 0.064 104.338 0.000 6.508 6.757
babysex == 1[T.True] 0.2949 0.026 11.177 0.000 0.243 0.347
C(race_r)[T.2] 0.4054 0.031 13.076 0.000 0.345 0.466
C(race_r)[T.3] 0.2762 0.052 5.356 0.000 0.175 0.377
nbrnaliv > 1[T.True] -1.3990 0.108 -12.941 0.000 -1.611 -1.187
agepreg 0.0125 0.002 5.178 0.000 0.008 0.017
Omnibus: 400.419 Durbin-Watson: 1.600
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1387.956
Skew: -0.052 Prob(JB): 4.07e-302
Kurtosis: 4.945 Cond. No. 210.

In [ ]:


In [472]:
join[join.agepreg<18].babysex.value_counts()


Out[472]:
1    357
2    295
dtype: int64

In [494]:
join['boy'] = (join.babysex==1).astype(int)
join['isyoung'] = (join.agepreg<18).astype(int)
join['isold'] = (join.agepreg>35).astype(int)
join['intact'] = (join.intact18==1).astype(int)
join['wait'] = (join.agefstsx - join.menarche)
join['agepreg2'] = join.agepreg**2

In [505]:
model = smf.logit('boy ~ agepreg + agepreg2 + agefstsx + wait + paydu==1 + prglngth', data=join)
results = model.fit()
results.summary()


Optimization terminated successfully.
         Current function value: 0.690493
         Iterations 5
Out[505]:
Logit Regression Results
Dep. Variable: boy No. Observations: 8884
Model: Logit Df Residuals: 8877
Method: MLE Df Model: 6
Date: Wed, 25 Jun 2014 Pseudo R-squ.: 0.003741
Time: 14:29:52 Log-Likelihood: -6134.3
converged: True LL-Null: -6157.4
LLR p-value: 2.863e-08
coef std err z P>|z| [95.0% Conf. Int.]
Intercept -0.0593 0.590 -0.101 0.920 -1.215 1.097
paydu == 1[T.True] 0.0729 0.046 1.596 0.111 -0.017 0.162
agepreg -0.1098 0.032 -3.401 0.001 -0.173 -0.047
agepreg2 0.0019 0.001 3.120 0.002 0.001 0.003
agefstsx 0.0162 0.004 4.164 0.000 0.009 0.024
wait -0.0108 0.003 -3.191 0.001 -0.017 -0.004
prglngth 0.0338 0.011 3.060 0.002 0.012 0.055

In [513]:
import first
live, firsts, others = first.MakeFrames()
live['boy'] = (live.babysex==1).astype(int)
live['agepreg2'] = live.agepreg**2
model = smf.logit('boy ~ agepreg + agepreg2', data=live)
results = model.fit()
results.summary()


Optimization terminated successfully.
         Current function value: 0.692962
         Iterations 3
Out[513]:
Logit Regression Results
Dep. Variable: boy No. Observations: 9148
Model: Logit Df Residuals: 9145
Method: MLE Df Model: 2
Date: Wed, 25 Jun 2014 Pseudo R-squ.: 0.0001122
Time: 15:04:00 Log-Likelihood: -6339.2
converged: True LL-Null: -6339.9
LLR p-value: 0.4909
coef std err z P>|z| [95.0% Conf. Int.]
Intercept 0.4111 0.389 1.058 0.290 -0.351 1.173
agepreg -0.0323 0.031 -1.057 0.290 -0.092 0.028
agepreg2 0.0006 0.001 1.117 0.264 -0.000 0.002

In [436]:
logit_mod = sm.Logit(join.male, join[['totincr', 'agepreg', '']])
logit_res = logit_mod.fit()
logit_res.summary()
#dir(logit_res)


Optimization terminated successfully.
         Current function value: 0.693047
         Iterations 3
Out[436]:
Logit Regression Results
Dep. Variable: male No. Observations: 8884
Model: Logit Df Residuals: 8882
Method: MLE Df Model: 1
Date: Tue, 24 Jun 2014 Pseudo R-squ.: 5.681e-05
Time: 16:42:23 Log-Likelihood: -6157.0
converged: True LL-Null: -6157.4
LLR p-value: 0.4029
coef std err z P>|z| [95.0% Conf. Int.]
totincr 0.0066 0.006 1.141 0.254 -0.005 0.018
agepreg -0.0017 0.002 -0.800 0.424 -0.006 0.003

In [ ]: