validate1982data



In [40]:
from __future__ import print_function, division

import survival
import thinkstats2
import thinkplot

import gzip
import pandas

%matplotlib inline

In [41]:
# Binary-mode handle on the gzip-compressed data file; a later cell iterates
# over it to peek at the raw records.
filename = '1982NSFGData.dat.gz'
fp = gzip.open(filename, mode='rb')

In [42]:
# Column ruler: the digits 0-9 repeated 8 times (80 characters), printed
# above the records to help locate fixed-width fields by eye.
s = '0123456789'
print(s*8)

# Show the first two raw records.
# A private handle is opened here instead of iterating the shared `fp`:
# a file iterator remembers its position, so re-running this cell against
# `fp` would print the *next* two records rather than the first two.
# The `with` block also guarantees the handle is closed afterwards.
with gzip.open(filename, 'rb') as preview:
    for i, line in enumerate(preview):
        print(line)
        if i > 0:
            break


01234567890123456789012345678901234567890123456789012345678901234567890123456789
1011010046606902421111 1      1      1      1        4    1191         0021                               000000000                                                           1111111111111129898192 9696                    1200109335 0939012094051                                122       012  1012    2 390520    999133342         0111222222222212220120012001200120012001200120012001201                                                    2 2     103032    2330303      122               21122222222201      15  2  11111201      1503     9898229              2 211     07876150951610985                                                                                                                          101  027351112                                      12103  3 13                                        22122122217207052   303      13099702071233NY20060911699175681105009256    005945196904642                                                 00                         0059452  050023011  000042531162211610261111071519        001    6         1101201212222222222122211208 08                       22 12222212122002111112    2  1        073    42 115  00000000001  6    000000     00                     030030000000000000000    0000000000       073    12                              9924     0000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000                                                               0000400

1011010087506731731121 1      1      1      1        4    1321         01                                 0100000012                                                          1111111111111129898172 9696                    12012096351                                             142       002  1012    2 160332    0362    10607099660091111222222212220000300030000120012001200120012001201                                                    2 2     101011    1290202      122               222222222222                                     19898182              11211     14354084491510954     01       09441925                  999999                                                                141  0610967 01  383351091101024523                             12806  3 13              070327  142              12222222221520455112 0        13099702071233NY70013778104114011103003265    001614669459374 010001010000100111111000000000101000009620030962019696                     0016147  0310230111 0000827311522115139011111408170409447801109445         1100200211112222222122211208 08                       18 22        0000             2      01061225842 135  01010100002 16    000000     01           018096203501002001000101000010011111100000000018    061220114024   225899999999            9883     0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000          009995233324081096230953                             0000800


In [43]:
filename = '1982NSFGData.dat.gz'

# Number of records the file is expected to contain.
expected_rows = 7969

# One entry per variable: (name, first column, last column), with columns
# counted from 1 and the last column included.  Keeping the name next to
# its column range prevents the two lists from drifting out of step.
fields = [
    ('finalwgt', 976, 982),
    ('ageint', 1001, 1002),
    ('mar2p', 1268, 1271),
    ('cmmarrhx', 1037, 1040),
    ('cmintvw', 841, 844),
    ('cmbirth', 12, 15),
]
names = [field[0] for field in fields]
# read_fwf wants zero-based, half-open (start, stop) pairs, hence the -1
# applied to the first column only.
colspecs = [(field[1] - 1, field[2]) for field in fields]

df = pandas.read_fwf(filename,
                     colspecs=colspecs,
                     names=names,
                     header=None,
                     nrows=expected_rows,
                     compression='gzip')

# nrows only caps how many rows are read, so a truncated file would
# otherwise load without complaint; fail loudly instead.
assert len(df) == expected_rows, (
    'expected %d rows, got %d' % (expected_rows, len(df)))

len(df)


Out[43]:
7969

In [44]:
# Number of rows for each distinct ageint value, ordered by the value itself
df['ageint'].value_counts().sort_index()


Out[44]:
15    269
16    300
17    370
18    480
19    469
20    342
21    282
22    280
23    303
24    283
25    278
26    319
27    289
28    315
29    300
30    293
31    289
32    246
33    257
34    263
35    236
36    202
37    163
38    166
39    181
40    181
41    145
42    161
43    147
44    160
dtype: int64

In [45]:
# cmbirth values above 9000 mark an unknown month of birth; 21 rows expected
df.loc[df['cmbirth'] > 9000].shape


Out[45]:
(21, 6)

In [46]:
# Remove the 9000 offset from the cmbirth values flagged as unknown month
# of birth; all other rows are left untouched.
df.loc[df['cmbirth'] > 9000, 'cmbirth'] -= 9000

In [47]:
# Project helper from the local `survival` module; its return value is not
# used here, so it is relied on to modify `df` in place.  The shape outputs
# around this cell show the frame going from 6 to 10 columns.
# NOTE(review): presumably this is where `agemarry` (used further down but
# never read from the file) is derived -- confirm in survival.py.
survival.CleanData(df)

In [48]:
# Shape of the subset of rows where mar2p is missing
df.loc[df['mar2p'].isnull()].shape


Out[48]:
(3148, 10)

In [49]:
# Shape of the subset of rows where cmmarrhx is missing
df.loc[df['cmmarrhx'].isnull()].shape


Out[49]:
(3318, 10)

In [50]:
# evrmarry is True exactly where cmmarrhx has a value
df['evrmarry'] = df['cmmarrhx'].notnull()

In [51]:
# Consistency check: no row should have evrmarry set while cmmarrhx is
# missing, so the subset is expected to have zero rows.
df[df['evrmarry'] & df['cmmarrhx'].isnull()].shape


Out[51]:
(0, 11)

In [52]:
# Same ageint tabulation as before the cleaning step, repeated so the two
# outputs can be compared.
df['ageint'].value_counts().sort_index()


Out[52]:
15    269
16    300
17    370
18    480
19    469
20    342
21    282
22    280
23    303
24    283
25    278
26    319
27    289
28    315
29    300
30    293
31    289
32    246
33    257
34    263
35    236
36    202
37    163
38    166
39    181
40    181
41    145
42    161
43    147
44    160
dtype: int64

In [53]:
# Plot the CDF of agemarry, then show how many non-missing values it has
cdf = thinkstats2.Cdf(df['agemarry'])
thinkplot.Cdf(cdf)
len(df['agemarry'].dropna())


Out[53]:
4651

In [9]:


In [ ]: