validate1995data



In [1]:
from __future__ import print_function, division

import nsfg3

In [15]:
import pandas
import numpy as np

def ReadFemResp1995(dat_file='1995FemRespData.dat.gz'):
    """Reads respondent data from NSFG Cycle 5.

    Extracts five fixed-width fields from the gzip-compressed respondent
    file, recodes the `timesmar` codes 98 and 99 as missing, adds a boolean
    `evrmarry` column and passes the frame to `nsfg3.CleanData`.

    dat_file: path to the gzip-compressed fixed-width data file

    returns: DataFrame
    """
    # (column name, first position, last position); positions are written
    # 1-based and inclusive, and converted for read_fwf below.
    # NOTE(review): positions presumably come from the Cycle 5 codebook --
    # confirm against it before changing any of them.
    fields = [('cmintvw', 12360, 12363),
              ('timesmar', 4637, 4638),
              ('cmmarrhx', 11759, 11762),
              ('cmbirth', 14, 16),
              ('finalwgt', 12350, 12359)]
    names = [name for name, _, _ in fields]
    # read_fwf takes 0-based half-open intervals, hence `first - 1`.
    colspecs = [(first - 1, last) for _, first, last in fields]

    df = pandas.read_fwf(dat_file,
                         compression='gzip',
                         colspecs=colspecs,
                         names=names)

    # Assign the result back to the column instead of calling
    # replace(..., inplace=True) on `df.timesmar`: that chained in-place form
    # operates on an intermediate Series and does not update `df` when
    # pandas Copy-on-Write is active.
    # NOTE(review): 98/99 are presumably "refused"/"don't know" codes --
    # confirm; such rows end up with evrmarry == False below.
    df['timesmar'] = df.timesmar.replace([98, 99], np.nan)

    # NaN > 0 evaluates to False, so rows with a missing timesmar are
    # flagged as never married.
    df['evrmarry'] = (df.timesmar > 0)

    # Return value is ignored, so CleanData presumably modifies df in
    # place -- verify in nsfg3.
    nsfg3.CleanData(df)
    return df

In [16]:
# Load the respondent frame that every check below inspects.
df = ReadFemResp1995()

In [17]:
# Number of respondents whose evrmarry flag is False.
(~df.evrmarry).sum()


Out[17]:
4006

In [18]:
# Rows with cmbirth in [604, 720], both ends included.
# NOTE(review): presumably century-month codes, i.e. births in the 1950s --
# confirm against the codebook.
len(df[df.cmbirth.between(604, 720)])


Out[18]:
3918

In [19]:
# Rows with cmmarrhx in [780, 840], both ends included.
# NOTE(review): presumably century-month codes for a marriage date in the
# second half of the 1960s -- confirm against the codebook.
len(df[df.cmmarrhx.between(780, 840)])


Out[19]:
192

In [20]:
# Tabulate cmintvw, ordered by value rather than by frequency.
df['cmintvw'].value_counts().sort_index()


Out[20]:
1141     514
1142    2448
1143    2466
1144    1692
1145    1381
1146    1076
1147     722
1148     172
1149      81
1150     295
Name: cmintvw, dtype: int64

In [21]:
# Tabulate finalwgt, ordered by value; the display is truncated in the
# middle because there are many distinct values.
df['finalwgt'].value_counts().sort_index()


Out[21]:
349.8761      1
356.8193      1
433.7413      1
434.1431      1
441.2415      1
463.0787      1
464.5584      1
468.2044      1
472.6256      1
482.7008      1
495.1924      1
498.6220      1
507.9773      1
511.4543      1
519.3578      1
523.6670      1
525.7335      1
555.8899      1
561.4898      1
561.6297      1
564.7645      1
567.7595      1
589.4533      1
590.1313      1
590.1364      1
617.0250      1
634.0294      1
634.8276      1
637.1154      1
640.9224      1
             ..
18588.5661    1
18702.0666    1
18890.5291    1
19985.1776    1
20077.5177    1
20161.3343    1
20457.1239    1
20628.0956    1
20731.0408    1
20864.2010    1
20916.7939    1
21558.0296    1
21886.1411    1
22304.0038    1
22568.3367    1
22718.1978    1
22812.9377    1
23231.5175    1
23497.7818    1
23693.3226    1
23758.9147    1
24146.9122    1
24653.6224    1
24916.6825    1
25049.8728    1
25568.3298    1
25840.3059    1
26067.4314    1
26562.8055    1
33549.8227    1
Name: finalwgt, dtype: int64

In [22]:
# Tabulate timesmar, ordered by value; value_counts leaves out missing
# values, so rows recoded to NaN do not appear here.
df['timesmar'].value_counts().sort_index()


Out[22]:
1    5559
2    1077
3     174
4      26
5       5
Name: timesmar, dtype: int64

In [ ]: