validate1988data



In [1]:
from __future__ import print_function, division

import survival
import thinkstats2
import thinkplot

import gzip
import pandas
import numpy as np

%matplotlib inline

In [2]:
filename = '1988FemRespData.dat'
fin = open(filename, 'r')
line = fin.read(3553)
print(line)


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-2-2fcebc07c2d8> in <module>()
      1 filename = '1988FemRespData.dat'
----> 2 fin = open(filename, 'r')
      3 line = fin.read(3553)
      4 print(line)

IOError: [Errno 2] No such file or directory: '1988FemRespData.dat'

In [48]:
def add_newlines():
    filename = '1988FemRespData.dat'
    fin = open(filename, 'r')
    fout = open('1988FemRespDataLines.dat', 'w')

    for i in range(8450):
        line = fin.read(3553)
        fout.write(line + '\n')
    
    fout.close()

In [3]:
filename = '1988FemRespDataLines.dat.gz'
fp = gzip.open(filename, 'r')

s = '0123456789'
print(s*8)

for i, line in enumerate(fp):
    print(line)
    if i > 0:
        break


01234567890123456789012345678901234567890123456789012345678901234567890123456789
0000100   201061       6 00723  0232821111 1      1      1      1        161   00000  0000011 01044   2                                     1   213318    11  0000000000 2                                                                                 000022   2    0000000000   0000000000   0000000000 2      111111111111111195060207241200000                                        00000          00000000000000000000000000000000      00000  00000   00000  00000   00000  00000   00000  00000                                                 00000                                                                                                                                                                                                                                                            01061222   00000                          00000                          00000                          00000                          00000                                  1               00000 2            2       1021     969602    2    01103536                               00000                                                                               00000                                        00000                                                        00000                               2  0000000000 2222 2 21000003000000 2   51000003000000  51000200040506000000152                                            222222  1                        2048180125 440004 2 000200000000000000000000000000002  5    00000   00000    00000   00000    0000000000000000000000000   0000000000000000000000000   00000000000000000000 00000    00000                                  00000      00000                                                                            00000 1010000000000070020600163017452210104810000055061200                                                                                                                                                                                                                  1                                11222222210606011999999999999999999999999999999                             01061      628279931161      2600000113181100000000000000000                                           00002                                     30   00        42     5     6 9696   0201                                     33333                        2  1  51 5100020004050600000015         00                                  1          62114701312         200        2812       2      2 2053249105343590534359057514407137921000000000000000000000000000 00000 00000000000000000000000  0000000000000000 0  000000 0000000000000000 0000000000000000000000000  0  00 00000000000000000000000000000000 00000000  1262234       0       0 1547411 1625872       0       0 1563075       0 1394346 1455117       0 1530365       0       0 1530459       0 1384579 1584067       0       0 1390994       0 1449819 1492664       0 1616948       0       0 1275054       0 1302225 1289356       0       0 1483580 1440527       0       0 1325360       0 1740542 1504652       0 1466335       0 1373721       0 1494901       0       0 1492668       0 1530022       0 1449765       0 1661727 1370854       0       0       0 1463082       0 1522949       0 1538348       0 1457197       0 1327488       0 1491723       0 1621816       0 1478120       0       0 1485925       0 1413752       0 1503169       0 1532063       0 1370681 1200511       0       0 1307784 1440308       0       0 1430987       0 1619480 1500545       0

0000200   201059       2600614  0053732222 1      1      1      1        112   00000  000002 200843   2                                     2307198217    222 0100000102 1                                                                                 0201212  2    0000000000   0000000000   0000000000 2      1121121112112112  060102432 00784                                        00000         302010530105400000000000000000000122   00000  00000   00000  00000   00000  00000   00000  00000 222  1 2                                        00000        1   22222222222222220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000050005000000000000000000000006000600060006000600001059221   00000                          00000                          00000                          00000                          00000                               211 010000000000002910541                                  02    1                    221                    00000   2       2222222222221                                                      190826   101000000000000000000000700020000000000785                                                        00000                           13222  0000000000 2221 2 21010000000500 1   52               100020004050600000015100005                                       222222  30000010062    96222     1014336121 110030 2 010000000000000000000000000000002  3  0 00000   00000    00000   00000    0000000000000000000000000   0000000000000000000000000   00000000000000000000 01014   100964030                           2   00000      00681    101010030 2 010000000000000000000000000000002  12 2   12               1908261 010000000000070020900170014537620102710000040031200                                                                                                                                                                                                                  400000005000000          453762032222221222100501100                        00                               01059      237370553103      190000020417120202010100000102000793  8882669962751408007841492100793  4  00011995995        222222222222222222200002333300 0078414322422 5     3 01019960002    14     33 221221 000000020000000033333                        2  1  52  1000200040506000000153 275    00                          0096403024         62121601211         1002917316737011416   21491001 2023575002357500235750028185303670223000000000000000000000000000 00000 00000000000000000000000  0000000000000000 0  000000 0000000000000000 0000000000000000000000000  0  00 00000000000000000000000000001000 000000001       0  697085 1108855       0       0  553996       0  799192  777176       0  672263       0       0  669647  642954       0  472556       0       0  955138       0  540935  735102       0  835328       0       0  836293  732392       0       0  690691  520363       0 1202312       0       0  516871  556760       0       0  752986  845990       0  561170       0  767332       0  547626       0  518231       0       0  595288       0 1506157       0  422700       0  544607       0  643258       0 1045080       0  763247       0  626630  538157  896762       0  562327  584254       0  820451       0  496369       0  863794       0       0  599524       0  620449       0  705749       0  977984  598224       0  576808       0       0  655070  485218       0       0  868572  542184       0


In [15]:
filename = '1988FemRespDataLines.dat.gz'
names = ['finalwgt', 'ageint', 'currentcm', 'firstcm', 'cmintvw', 'cmbirth']
colspecs = [(2568-1, 2574),
            (36-1, 37),
            (1521-1, 1525),
            (1538-1, 1542),
            (12-1, 16),
            (26-1, 30),
            ]
df = pandas.read_fwf(filename,
                     colspecs=colspecs, 
                     names=names,
                     header=None,
                     compression='gzip')

In [16]:
len(df)  # should be 8450


Out[16]:
8450

In [17]:
df.ageint.value_counts().sort_index()


Out[17]:
14      3
15    210
16    240
17    263
18    258
19    260
20    259
21    238
22    280
23    240
24    292
25    308
26    322
27    330
28    292
29    355
30    342
31    335
32    370
33    328
34    314
35    296
36    305
37    314
38    251
39    270
40    263
41    257
42    247
43    194
44    197
45     17
dtype: int64

In [18]:
df.currentcm.value_counts().sort_index()


Out[18]:
0      4419
685       1
705       1
722       1
723       1
726       2
728       1
729       2
730       3
731       2
732       1
734       1
735       1
736       1
738       3
...
1056     26
1057     14
1058     11
1059      6
1060      6
1061      3
1062      1
90781     1
90834     1
90978     1
90990     1
91018     1
91026     1
91038     2
99999    16
Length: 344, dtype: int64

In [19]:
df.currentcm.replace([0, 99999], np.nan, inplace=True)
df.loc[df.currentcm>90000, 'currentcm'] -= 90000

In [20]:
df.firstcm.value_counts().sort_index()


Out[20]:
0      6452
706       1
708       2
710       2
711       1
712       1
714       2
720       2
722       3
724       1
726       5
727       1
728       1
729       1
730       3
...
90886     1
90894     6
90901     1
90906     1
90907     1
90918     2
90922     1
90930     1
90942     1
90943     1
90978     2
90979     1
90990     1
91002     1
99999    19
Length: 355, dtype: int64

In [21]:
df.firstcm.replace([0, 99999], np.nan, inplace=True)
df.loc[df.firstcm>90000, 'firstcm'] -= 90000

In [25]:
df['cmmarrhx'] = df.currentcm
df.cmmarrhx.fillna(df.firstcm)
sum(df.cmmarrhx.isnull())


Out[25]:
4435

In [26]:
df.cmintvw.value_counts().sort_index()


Out[26]:
1057     545
1058    2034
1059    2288
1060    1701
1061     874
1062     682
1063     191
1064     135
dtype: int64

In [27]:
df.cmbirth.value_counts().sort_index()


Out[27]:
519    10
520    14
521    20
522    17
523    17
524    18
525    16
526    20
527    13
528    16
529    25
530    14
531    19
532    11
533    16
...
865    31
866    13
867    15
868    15
869    22
870    18
871    23
872    20
873    28
874    15
875    17
876    22
877    22
878    11
879     7
Length: 361, dtype: int64

In [29]:
survival.CleanData(df)

In [30]:
df['evrmarry'] = ~df.cmmarrhx.isnull()
df


Out[30]:
finalwgt ageint currentcm firstcm cmintvw cmbirth cmmarrhx agemarry age decade fives evrmarry
0 713792 28 NaN NaN 1061 723 NaN NaN 28.166667 6 12 False
1 367022 37 NaN NaN 1059 614 NaN NaN 37.083333 5 10 False
2 975924 21 NaN NaN 1057 796 NaN NaN 21.750000 6 13 False
3 587796 39 NaN 838 1057 581 NaN NaN 39.666667 4 9 False
4 719633 31 974 882 1062 683 974 24.250000 31.583333 5 11 True
5 730622 17 NaN NaN 1060 844 NaN NaN 18.000000 7 14 False
6 608474 39 848 809 1057 578 848 22.500000 39.916667 4 9 True
7 777787 30 NaN 924 1058 696 NaN NaN 30.166667 5 11 False
8 1030290 22 NaN NaN 1059 791 NaN NaN 22.333333 6 13 False
9 639364 18 NaN NaN 1059 836 NaN NaN 18.583333 6 13 False
10 820707 34 947 NaN 1058 642 947 25.416667 34.666667 5 10 True
11 766942 29 NaN 957 1058 699 NaN NaN 29.916667 5 11 False
12 712940 30 960 NaN 1061 696 960 22.000000 30.416667 5 11 True
13 748807 38 848 NaN 1060 595 848 21.083333 38.750000 4 9 True
14 682754 33 NaN 937 1058 652 NaN NaN 33.833333 5 10 False
15 773938 38 NaN 894 1057 592 NaN NaN 38.750000 4 9 False
16 787966 33 NaN NaN 1060 659 NaN NaN 33.416667 5 10 False
17 773938 35 942 NaN 1060 629 942 26.083333 35.916667 5 10 True
18 642272 30 977 NaN 1060 688 977 24.083333 31.000000 5 11 True
19 733060 30 918 NaN 1060 693 918 18.750000 30.583333 5 11 True
20 734133 37 835 NaN 1060 610 835 18.750000 37.500000 5 10 True
21 745297 25 1031 NaN 1060 754 1031 23.083333 25.500000 6 12 True
22 799872 35 1039 870 1060 635 1039 33.666667 35.416667 5 10 True
23 807851 20 NaN NaN 1060 811 NaN NaN 20.750000 6 13 False
24 807851 20 NaN NaN 1060 809 NaN NaN 20.916667 6 13 False
25 929923 43 824 NaN 1060 532 824 24.333333 44.000000 4 8 True
26 705780 35 846 NaN 1060 629 846 18.083333 35.916667 5 10 True
27 724044 38 945 848 1060 596 945 29.083333 38.666667 4 9 True
28 690662 33 875 NaN 1060 655 875 18.333333 33.750000 5 10 True
29 2183253 15 NaN NaN 1058 866 NaN NaN 16.000000 7 14 False
... ... ... ... ... ... ... ... ... ... ... ... ...
8420 2036322 39 NaN 825 1058 585 NaN NaN 39.416667 4 9 False
8421 2478173 30 992 NaN 1059 693 992 24.916667 30.500000 5 11 True
8422 2174687 29 NaN 946 1058 701 NaN NaN 29.750000 5 11 False
8423 2301277 40 821 NaN 1058 570 821 20.916667 40.666667 4 9 True
8424 2594495 25 NaN NaN 1059 756 NaN NaN 25.250000 6 12 False
8425 1963379 43 765 NaN 1058 536 765 19.083333 43.500000 4 8 True
8426 2276609 34 936 NaN 1058 646 936 24.166667 34.333333 5 10 True
8427 2056998 37 896 NaN 1058 604 896 24.333333 37.833333 5 10 True
8428 2592015 24 NaN NaN 1058 763 NaN NaN 24.583333 6 12 False
8429 2486216 36 961 NaN 1059 619 961 28.500000 36.666667 5 10 True
8430 2624510 32 NaN NaN 1058 668 NaN NaN 32.500000 5 11 False
8431 2460840 25 1052 NaN 1060 752 1052 25.000000 25.666667 6 12 True
8432 2384200 25 NaN NaN 1058 748 NaN NaN 25.833333 6 12 False
8433 2384200 27 NaN NaN 1058 723 NaN NaN 27.916667 6 12 False
8434 2126824 23 1044 NaN 1058 779 1044 22.083333 23.250000 6 12 True
8435 1925357 27 1044 NaN 1059 728 1044 26.333333 27.583333 6 12 True
8436 2105497 26 NaN NaN 1058 745 NaN NaN 26.083333 6 12 False
8437 2518126 23 NaN NaN 1058 776 NaN NaN 23.500000 6 12 False
8438 2384200 26 NaN NaN 1059 746 NaN NaN 26.083333 6 12 False
8439 2518126 23 NaN NaN 1059 772 NaN NaN 23.916667 6 12 False
8440 2549695 33 NaN NaN 1059 656 NaN NaN 33.583333 5 10 False
8441 2518126 24 NaN NaN 1058 764 NaN NaN 24.500000 6 12 False
8442 645391 31 929 NaN 1059 679 929 20.833333 31.666667 5 11 True
8443 2986139 26 997 NaN 1058 740 997 21.416667 26.500000 6 12 True
8444 2092079 34 978 NaN 1058 642 978 28.000000 34.666667 5 10 True
8445 2251351 26 NaN NaN 1059 740 NaN NaN 26.583333 6 12 False
8446 2251351 26 NaN NaN 1058 736 NaN NaN 26.833333 6 12 False
8447 2384200 26 NaN NaN 1058 741 NaN NaN 26.416667 6 12 False
8448 1469892 38 931 839 1063 606 931 27.083333 38.083333 5 10 True
8449 2620612 30 1014 NaN 1063 693 1014 26.750000 30.833333 5 11 True

8450 rows × 12 columns


In [31]:
cdf = thinkstats2.Cdf(df.age - df.ageint)
thinkplot.Cdf(cdf)


Out[31]:
{'xscale': 'linear', 'yscale': 'linear'}

In [32]:
cdf = thinkstats2.Cdf(df.agemarry)
thinkplot.Cdf(cdf)
len(df.agemarry.dropna())


Out[32]:
4015

In [14]:


In [ ]: