notebook.community

Edit and run



In [1]:

    
from  numpy import *



In [2]:

    
a = array([1,2,3,4])



In [14]:

    
b = transpose(vstack([a, a]))
b









    Out[14]:





array([[1, 1],
       [2, 2],
       [3, 3],
       [4, 4]])



In [15]:

    
reshape(b, (1,8))









    Out[15]:





array([[1, 1, 2, 2, 3, 3, 4, 4]])



In [32]:

    
def interleave(a, b):
    c = empty(a.shape[0] + b.shape[0])
    print a.shape, b.shape, c.shape
    c[::2] = a
    c[1::2] = b
    return c



In [33]:

    
interleave(a, a)









    



(4,) (4,) (8,)






    Out[33]:





array([ 1.,  1.,  2.,  2.,  3.,  3.,  4.,  4.])



In [34]:

    
roll(a, 1)









    Out[34]:





array([4, 1, 2, 3])



In [36]:

    
a[1:] - a[:-1]









    Out[36]:





array([1, 1, 1])



In [37]:

    
b = roll(a, 1)
b[0] = 0
a - b









    Out[37]:





array([1, 1, 1, 1])



In [41]:

    
import brfss
import thinkstats2
import numpy as np
df = brfss.ReadBrfss(nrows=None)



In [139]:

    
df = df.dropna(subset=['htm3', 'wtkg2'])
cdf = thinkstats2.Cdf(df.htm3)
print cdf[200] - cdf[145]









    



0.994272822814



In [140]:

    
bins = np.arange(135, 210, 5)
print(bins)
indices = np.digitize(df.htm3, bins)
groups = df.groupby(indices)
groups









    



[135 140 145 150 155 160 165 170 175 180 185 190 195 200 205]






    Out[140]:





<pandas.core.groupby.DataFrameGroupBy object at 0x496add0>



In [141]:

    
for i, group in groups:
    print i, group.htm3.mean(), group.wtkg2.mean()









    



0 120.124590164 62.885704918
1 135.785087719 63.0217105263
2 141.433962264 59.052851153
3 146.337187789 60.5686540241
4 151.528439682 64.5687589957
5 156.350341994 67.8319274054
6 161.674210452 71.5338478969
7 166.569727467 75.4599640966
8 171.409542325 80.0723382746
9 176.54908086 86.2047739197
10 181.575961428 92.1623611526
11 186.324699905 97.9511067391
12 191.785971223 104.072843011
13 196.642656162 107.373590321
14 201.738271605 108.304419753
15 210.58778626 99.0778625954



In [143]:

    
for i, group in groups:
    print i, len(group)



In [144]:

    
cdfs = [thinkstats2.Cdf(group.wtkg2) for i, group in groups][1:-1]
print len(cdfs)



In [145]:

    
import thinkplot
thinkplot.PrePlot(3)
for percent in [75, 50, 25]:
    ys = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(heights, ys, label=label)
thinkplot.Show()









    












    





<matplotlib.figure.Figure at 0x47e5450>



In [158]:

    
import first
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])



In [159]:

    
live.agepreg









    Out[159]:





0     33.16
1     39.25
2     14.33
3     17.83
4     18.33
5     27.00
6     28.83
7     30.16
8     28.08
9     32.33
10    25.75
11    23.00
12    24.58
15    28.33
16    30.33
...
13566    32.66
13569    18.00
13570    24.41
13571    27.83
13572    33.16
13573    22.25
13574    24.41
13576    31.66
13578    24.00
13579    25.91
13581    30.66
13584    26.91
13588    17.91
13591    21.58
13592    21.58
Name: agepreg, Length: 9038, dtype: float64



In [160]:

    
live.totalwgt_lb









    Out[160]:





0     8.8125
1     7.8750
2     9.1250
3     7.0000
4     6.1875
5     8.5625
6     9.5625
7     8.3750
8     7.5625
9     6.6250
10    7.8125
11    7.0000
12    4.0000
15    7.6875
16    7.5000
...
13566    7.5000
13569    5.8125
13570    6.6875
13571    6.0000
13572    5.8125
13573    6.5625
13574    6.1250
13576    6.4375
13578    6.0000
13579    7.0000
13581    6.3750
13584    6.3750
13588    6.1875
13591    7.5000
13592    7.5000
Name: totalwgt_lb, Length: 9038, dtype: float64



In [161]:

    
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)



In [163]:

    
thinkstats2.Corr(live.agepreg, live.totalwgt_lb)









    Out[163]:





0.068833970354109056



In [164]:

    
thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)









    Out[164]:





0.09480324493300854



In [3]:

    
import first
import chap01ex_soln
import thinkstats2
import thinkplot
import pandas



In [4]:

    
live, first, others = first.MakeFrames()



In [5]:

    
resp = chap01ex_soln.ReadFemResp()
resp.index = resp.caseid



In [157]:

    
vars1 = thinkstats2.ReadStataDct('2002FemPreg.dct').variables
vars2 = thinkstats2.ReadStataDct('2002FemResp.dct').variables

all_vars = vars1.append(vars2)
all_vars.index = all_vars.name
all_vars.loc['birthwgt_lb'].desc









    Out[157]:





'BD-3 BIRTHWEIGHT IN POUNDS - 1ST BABY FROM THIS PREGNANCY'



In [163]:

    
all_vars.loc['race'].desc[0]









    Out[163]:





'RACE'



In [106]:

    
len(live), len(live.columns)









    Out[106]:





(9148, 244)



In [107]:

    
len(resp), len(resp.columns)









    Out[107]:





(7643, 3087)



In [382]:

    
import linear
live = live[live.prglngth>30]
live = linear.ResampleRowsWeighted(live)
join = live.join(resp, on='caseid', rsuffix='_r')



In [216]:

    
# %timeit join = pandas.merge(live, resp, left_on='caseid', right_index=True, sort=False)



In [383]:

    
len(join), len(join.columns), 3087+244









    Out[383]:





(8884, 3331, 3331)



In [384]:

    
def QuickLeastSquares(xs, ys):
    n = float(len(xs))

    meanx = xs.mean()
    dxs = xs - meanx
    varx = np.dot(dxs, dxs) / n

    meany = ys.mean()
    dys = ys - meany

    cov = np.dot(dxs, dys) / n
    slope = cov / varx
    inter = meany - slope * meanx

    res = ys - (inter + slope * xs)
    mse = np.dot(res, res) / n
    return inter, slope, mse



In [385]:

    
join.screentime = pandas.to_datetime(join.screentime)



In [386]:

    
t = []
for name in join.columns:
    try:
        if join[name].var() < 1e-7:
            continue
            
        formula = 'totalwgt_lb ~ agepreg + ' + name
        model = smf.ols(formula, data=join)
        if model.nobs < len(join)/2:
            continue
            
        results = model.fit()
        metric = results.rsquared
    except:
        continue
        
    if not np.isnan(metric):
        t.append((metric, name, results.nobs))



In [387]:

    
len(t), len(join.columns)









    Out[387]:





(882, 3331)



In [388]:

    
import numpy as np
import re

t.sort(reverse=True)
for mse, name, n in t[:30]:
    key = re.sub('_[r]$', '', name)
    try:
        desc = all_vars.loc[name].desc
        if isinstance(desc, pandas.Series):
            desc = desc[0]
        print name, n, mse, desc
    except KeyError:
        print name, n, mse









    



totalwgt_lb 8772.0 1.0
wantrp09_i 8772.0 0.97359596999 WANTRP09IMPUTATION FLAG
outcom09_i 8772.0 0.97359596999 OUTCOM09 IMPUTATION FLAG
oldwr09_i 8772.0 0.97359596999 OLDWR09 IMPUTATION FLAG
lbw1_i 8772.0 0.97359596999 LBW1 IMPUTATION FLAG
birthwgt_lb 8772.0 0.946178656244 BD-3 BIRTHWEIGHT IN POUNDS - 1ST BABY FROM THIS PREGNANCY
lbw1 8772.0 0.269930546161 LOW BIRTHWEIGHT - BABY 1
prglngth 8772.0 0.132829613743 DURATION OF COMPLETED PREGNANCY IN WEEKS
wksgest 8730.0 0.125332213206 GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN WEEKS)
agecon 8772.0 0.10161993152 AGE AT TIME OF CONCEPTION
mosgest 8772.0 0.0290869952502 GESTATIONAL LENGTH OF COMPLETED PREGNANCY (IN MONTHS)
babysex 8772.0 0.0258522351717 BD-2 SEX OF 1ST LIVEBORN BABY FROM THIS PREGNANCY
havedeg 4486.0 0.0177127753546 AF-10 WHETHER R HAS ANY COLLEGE OR UNIVERSITY DEGREES
fstmthp11 4471.0 0.0153211741586 EF-4 METHOD USED AT FIRST SEX WITH LAST PARTNER IN PAST 12 MONS -LAST MENTION
pubassis_r 8772.0 0.0148715291902
race_r 8772.0 0.0148256821308
race 8772.0 0.0148256821308 RACE
pubassis 8772.0 0.0147868712388 WHETHER R RECEIVED PUBLIC ASSISTANCE IN 2001
basewgt_r 8772.0 0.0146841660434
basewgt 8772.0 0.0146841660434 BASE WEIGHT
foodstmp 8772.0 0.0146783638487 JI-6 IN 2001 RECEIVED FOOD STAMPS
bfeedwks 7583.0 0.014421962084 DURATION OF BREASTFEEDING IN WEEKS
rmarout03 5788.0 0.013870056935 INFORMAL MARITAL STATUS WHEN PREGNANCY ENDED - 3RD
anynurse 7685.0 0.0138642482171 BH-1 WHETHER R BREASTFED THIS CHILD AT ALL - 1ST FROM THIS PREG
finalwgt_r 8772.0 0.0136692711511
finalwgt 8772.0 0.0136692711511 FINAL POST-STRATIFIED AND ADJUSTED WEIGHT
adj_mod_basewgt_r 8772.0 0.013391634905
adj_mod_basewgt 8772.0 0.013391634905 ADJUSTED MODIFIED BASE WEIGHT
bc12pay141 4494.0 0.0133708447233 FA-6 WAY BILL WAS PAID-ALL SERVICES IN 1 VISIT, 1ST METHOD
bc12plcx14 4494.0 0.013293130539 FA-5 WHERE R REC'D SERVICES OBTAINED IN 1 VISIT IN LAST 12 MONTHS



In [389]:

    
boys = live[live.babysex==1]
girls = live[live.babysex==2]



In [390]:

    
boys.totalwgt_lb.mean()









    Out[390]:





7.5857812851044706



In [391]:

    
boys.totalwgt_lb.mean() - girls.totalwgt_lb.mean()









    Out[391]:





0.33394432606721036



In [507]:

    
import statsmodels.formula.api as smf
formula = 'totalwgt_lb ~ agepreg + C(race) + babysex==1 + nbrnaliv>1 + paydu==1 + totincr'
results = smf.ols(formula, data=join).fit()
results.summary()









    Out[507]:





OLS Regression Results

  Dep. Variable:        totalwgt_lb      R-squared:             0.058 


  Model:                    OLS          Adj. R-squared:        0.057 


  Method:              Least Squares     F-statistic:           76.70 


  Date:              Wed, 25 Jun 2014    Prob (F-statistic):  2.14e-108


  Time:                  14:59:51        Log-Likelihood:      -13977. 


  No. Observations:         8772         AIC:                2.797e+04


  Df Residuals:             8764         BIC:                2.803e+04


  Df Model:                    7                                      




                          coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept                 6.5289      0.066     98.859   0.000      6.399     6.658


  C(race)[T.2]              0.3613      0.036     10.120   0.000      0.291     0.431


  C(race)[T.3]              0.2854      0.057      4.984   0.000      0.173     0.398


  babysex == 1[T.True]      0.3384      0.025     13.297   0.000      0.289     0.388


  nbrnaliv > 1[T.True]     -1.1490      0.104    -11.090   0.000     -1.352    -0.946


  paydu == 1[T.True]        0.1115      0.031      3.650   0.000      0.052     0.171


  agepreg                   0.0108      0.003      4.319   0.000      0.006     0.016


  totincr                   0.0110      0.004      2.790   0.005      0.003     0.019




  Omnibus:        357.878    Durbin-Watson:         1.992 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   1057.180 


  Skew:           -0.127     Prob(JB):           2.73e-230


  Kurtosis:        4.682     Cond. No.               225.



In [319]:

    
import statsmodels.formula.api as smf
import statsmodels.api as sm
formula = 'totalwgt_lb ~ prglngth>30 + babysex==1 + C(race_r) + nbrnaliv>1 + agepreg'
results = smf.ols(formula, data=join).fit()
results.summary()









    Out[319]:





OLS Regression Results

  Dep. Variable:        totalwgt_lb      R-squared:             0.056 


  Model:                    OLS          Adj. R-squared:        0.055 


  Method:              Least Squares     F-statistic:           103.2 


  Date:              Tue, 24 Jun 2014    Prob (F-statistic):  4.10e-106


  Time:                  13:14:18        Log-Likelihood:      -14316. 


  No. Observations:         8781         AIC:                2.864e+04


  Df Residuals:             8775         BIC:                2.869e+04


  Df Model:                    5                                      




                          coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept                 6.6326      0.064    104.338   0.000      6.508     6.757


  babysex == 1[T.True]      0.2949      0.026     11.177   0.000      0.243     0.347


  C(race_r)[T.2]            0.4054      0.031     13.076   0.000      0.345     0.466


  C(race_r)[T.3]            0.2762      0.052      5.356   0.000      0.175     0.377


  nbrnaliv > 1[T.True]     -1.3990      0.108    -12.941   0.000     -1.611    -1.187


  agepreg                   0.0125      0.002      5.178   0.000      0.008     0.017




  Omnibus:        400.419    Durbin-Watson:         1.600 


  Prob(Omnibus):   0.000     Jarque-Bera (JB):   1387.956 


  Skew:           -0.052     Prob(JB):           4.07e-302


  Kurtosis:        4.945     Cond. No.               210.



In [ ]:



In [472]:

    
join[join.agepreg<18].babysex.value_counts()









    Out[472]:





1    357
2    295
dtype: int64



In [494]:

    
join['boy'] = (join.babysex==1).astype(int)
join['isyoung'] = (join.agepreg<18).astype(int)
join['isold'] = (join.agepreg>35).astype(int)
join['intact'] = (join.intact18==1).astype(int)
join['wait'] = (join.agefstsx - join.menarche)
join['agepreg2'] = join.agepreg**2



In [505]:

    
model = smf.logit('boy ~ agepreg + agepreg2 + agefstsx + wait + paydu==1 + prglngth', data=join)
results = model.fit()
results.summary()









    



Optimization terminated successfully.
         Current function value: 0.690493
         Iterations 5






    Out[505]:





Logit Regression Results

  Dep. Variable:         boy          No. Observations:      8884  


  Model:                Logit         Df Residuals:          8877  


  Method:                MLE          Df Model:                 6  


  Date:           Wed, 25 Jun 2014    Pseudo R-squ.:      0.003741 


  Time:               14:29:52        Log-Likelihood:      -6134.3 


  converged:            True          LL-Null:             -6157.4 


                                    LLR p-value:        2.863e-08




                        coef      std err       z       P>|z|  [95.0% Conf. Int.] 


  Intercept              -0.0593      0.590     -0.101   0.920     -1.215     1.097


  paydu == 1[T.True]      0.0729      0.046      1.596   0.111     -0.017     0.162


  agepreg                -0.1098      0.032     -3.401   0.001     -0.173    -0.047


  agepreg2                0.0019      0.001      3.120   0.002      0.001     0.003


  agefstsx                0.0162      0.004      4.164   0.000      0.009     0.024


  wait                   -0.0108      0.003     -3.191   0.001     -0.017    -0.004


  prglngth                0.0338      0.011      3.060   0.002      0.012     0.055



In [513]:

    
import first
live, firsts, others = first.MakeFrames()
live['boy'] = (live.babysex==1).astype(int)
live['agepreg2'] = live.agepreg**2
model = smf.logit('boy ~ agepreg + agepreg2', data=live)
results = model.fit()
results.summary()









    



Optimization terminated successfully.
         Current function value: 0.692962
         Iterations 3






    Out[513]:





Logit Regression Results

  Dep. Variable:         boy          No. Observations:      9148  


  Model:                Logit         Df Residuals:          9145  


  Method:                MLE          Df Model:                 2  


  Date:           Wed, 25 Jun 2014    Pseudo R-squ.:      0.0001122


  Time:               15:04:00        Log-Likelihood:      -6339.2 


  converged:            True          LL-Null:             -6339.9 


                                    LLR p-value:         0.4909  




               coef      std err       z       P>|z|  [95.0% Conf. Int.] 


  Intercept      0.4111      0.389      1.058   0.290     -0.351     1.173


  agepreg       -0.0323      0.031     -1.057   0.290     -0.092     0.028


  agepreg2       0.0006      0.001      1.117   0.264     -0.000     0.002



In [436]:

    
logit_mod = sm.Logit(join.male, join[['totincr', 'agepreg', '']])
logit_res = logit_mod.fit()
logit_res.summary()
#dir(logit_res)









    



Optimization terminated successfully.
         Current function value: 0.693047
         Iterations 3






    Out[436]:





Logit Regression Results

  Dep. Variable:        male          No. Observations:      8884  


  Model:                Logit         Df Residuals:          8882  


  Method:                MLE          Df Model:                 1  


  Date:           Tue, 24 Jun 2014    Pseudo R-squ.:      5.681e-05


  Time:               16:42:23        Log-Likelihood:      -6157.0 


  converged:            True          LL-Null:             -6157.4 


                                    LLR p-value:         0.4029  




             coef      std err       z       P>|z|  [95.0% Conf. Int.] 


  totincr      0.0066      0.006      1.141   0.254     -0.005     0.018


  agepreg     -0.0017      0.002     -0.800   0.424     -0.006     0.003



In [ ]:

Dep. Variable:	totalwgt_lb	R-squared:	0.058
Model:	OLS	Adj. R-squared:	0.057
Method:	Least Squares	F-statistic:	76.70
Date:	Wed, 25 Jun 2014	Prob (F-statistic):	2.14e-108
Time:	14:59:51	Log-Likelihood:	-13977.
No. Observations:	8772	AIC:	2.797e+04
Df Residuals:	8764	BIC:	2.803e+04
Df Model:	7

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	6.5289	0.066	98.859	0.000	6.399 6.658
C(race)[T.2]	0.3613	0.036	10.120	0.000	0.291 0.431
C(race)[T.3]	0.2854	0.057	4.984	0.000	0.173 0.398
babysex == 1[T.True]	0.3384	0.025	13.297	0.000	0.289 0.388
nbrnaliv > 1[T.True]	-1.1490	0.104	-11.090	0.000	-1.352 -0.946
paydu == 1[T.True]	0.1115	0.031	3.650	0.000	0.052 0.171
agepreg	0.0108	0.003	4.319	0.000	0.006 0.016
totincr	0.0110	0.004	2.790	0.005	0.003 0.019

Omnibus:	357.878	Durbin-Watson:	1.992
Prob(Omnibus):	0.000	Jarque-Bera (JB):	1057.180
Skew:	-0.127	Prob(JB):	2.73e-230
Kurtosis:	4.682	Cond. No.	225.

Omnibus:	400.419	Durbin-Watson:	1.600
Prob(Omnibus):	0.000	Jarque-Bera (JB):	1387.956
Skew:	-0.052	Prob(JB):	4.07e-302
Kurtosis:	4.945	Cond. No.	210.

Dep. Variable:	boy	No. Observations:	8884
Model:	Logit	Df Residuals:	8877
Method:	MLE	Df Model:	6
Date:	Wed, 25 Jun 2014	Pseudo R-squ.:	0.003741
Time:	14:29:52	Log-Likelihood:	-6134.3
converged:	True	LL-Null:	-6157.4
		LLR p-value:	2.863e-08

	coef	std err	z	P>\|z\|	[95.0% Conf. Int.]
Intercept	-0.0593	0.590	-0.101	0.920	-1.215 1.097
paydu == 1[T.True]	0.0729	0.046	1.596	0.111	-0.017 0.162
agepreg	-0.1098	0.032	-3.401	0.001	-0.173 -0.047
agepreg2	0.0019	0.001	3.120	0.002	0.001 0.003
agefstsx	0.0162	0.004	4.164	0.000	0.009 0.024
wait	-0.0108	0.003	-3.191	0.001	-0.017 -0.004
prglngth	0.0338	0.011	3.060	0.002	0.012 0.055

Dep. Variable:	boy	No. Observations:	9148
Model:	Logit	Df Residuals:	9145
Method:	MLE	Df Model:	2
Date:	Wed, 25 Jun 2014	Pseudo R-squ.:	0.0001122
Time:	15:04:00	Log-Likelihood:	-6339.2
converged:	True	LL-Null:	-6339.9
		LLR p-value:	0.4909

Dep. Variable:	male	No. Observations:	8884
Model:	Logit	Df Residuals:	8882
Method:	MLE	Df Model:	1
Date:	Tue, 24 Jun 2014	Pseudo R-squ.:	5.681e-05
Time:	16:42:23	Log-Likelihood:	-6157.0
converged:	True	LL-Null:	-6157.4
		LLR p-value:	0.4029