In [1]:

    
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np



In [3]:

    
# Read in data from health records survey as a raw table
# Public data from the UC Michigan Institute for Social Research
# Health and Retirement Survey - rsonline.isr.umich.edu
#
hrec06 = Table.read_table("./data/hrsextract06.csv")
hrec06









    Out[3]:





    
        
            hhidpn  r8agey_m  ragender  raracem  rahispan  raedyrs  h8cpl  h8atota  rayshlt  r8shlt  r8expyrs  r8cesd  r8bpavgs  r8bpavgp  r8smoken  r8mdactx  r8weightbio  r8weight  r8heightbio  r8height
        
    
    
        
            3010     70       1        1       0        12      1     914000     3       3      21.5     0      140      58       0        4        74.1623     71.6672  1.64465     1.6256  
        
    
        
            3020     67       2        1       0        16      1     914000     1       3      23.7     0      139      72       0        1        66.9048     65.317   1.61925     1.6256  
        
    
        
            10003030  50       2        1       0        16      0     12000      3       5      47.5     4      124      98       0        2        64.4101     58.9667  1.5494      1.5748  
        
    
        
            10004010  66       1        1       0        16      1     1.832e+06  1       4      7        1      130      74       0        2        101.605     102.511  1.8415      1.8542  
        
    
        
            10004040  60       2        1       0        12      1     1.832e+06  1       2      28.25    0      125      67       0        2        77.3374     77.1103  1.6383      1.651   
        
    
        
            10013010  68       1        1       0        12      0     50         1       3      16.5     2      125      67       0        5        110.223     108.862  1.7272      1.7272  
        
    
        
            10038010  70       1        1       0        16      1     2.5e+06    2       2      22.5     1      107      52       0        2        73.4819     74.8423  1.70815     1.7526  
        
    
        
            10038040  63       2        1       0        16      1     2.5e+06    1       1      27.05    1      112      70       0        2        65.3173     64.4098  1.67005     1.6764  
        
    
        
            10050010  64       2        3       0        17      0     664066     1       2      27.025   1      136      90       0        5        75.9767     68.0385  1.5875      1.6256  
        
    
        
            10059020  70       2        1       0        16      1     1.2405e+07  2       3      19.5     0      112      54       1        2        58.967      57.6059  1.6637      1.6764  
        
    

... (6332 rows omitted)

Indirection

They say "all problems in computer science can be solved with an extra level of indirection." It certainly provides some real leverage in data wrangling. Rather than write a bunch of spaghetti code, we will build a table that defines the transformation we would like to perform on the raw data in order to have something cleaner to work with. In this we can map the indecipherable identifiers into something more understandable; we can establish formatters; we can translate field encodings into clear mnemonics, and so on.

We need a tool for finding elements in the translation table; that's table_lookup. Then we can build our mapping tool, map_raw_table.



In [28]:

    
health_map = Table(["raw label", "label", "encoding", "Description"]).with_rows(
       [["hhidpn",  "id", None, "identifier"],
        ["r8agey_m", "age", None, "age in years in wave 8"],
        ["ragender", "gender", ['male','female'], "1 = male,  2 = female)"],
        ["raracem",  "race",   ['white','black','other'], "(1 = white,  2 = black,  3 = other)"],
        ["rahispan", "hispanic",  None, "(1 = yes)"],
        ["raedyrs",  "education", None, "education in years"],
        ["h8cpl",    "couple",    None, "in a couple household (1 = yes)"],
        ["r8bpavgs", "blood pressure", None,"average systolic BP"],
        ["r8bpavgp", "pulse", None, "average pulse"],
        ["r8smoken", "smoker",None, "currently smokes cigarettes"],
        ["r8mdactx", "exercise", None, "frequency of moderate exercise (1=everyday, 2=>1perweek, 3=1perweek, 4=1-3permonth\
, 5=never)"],
        ["r8weightbio", "weight", None, "objective weight in kg"],
        ["r8heightbio","height", None, "objective height in m"]])
health_map









    Out[28]:





    
        
            raw label  label  encoding  Description
        
    
    
        
            hhidpn    id             None                        identifier                         
        
    
        
            r8agey_m  age            None                        age in years in wave 8             
        
    
        
            ragender  gender         ['male', 'female']          1 = male,  2 = female)             
        
    
        
            raracem   race           ['white', 'black', 'other']  (1 = white,  2 = black,  3 = other)
        
    
        
            rahispan  hispanic       None                        (1 = yes)                          
        
    
        
            raedyrs   education      None                        education in years                 
        
    
        
            h8cpl     couple         None                        in a couple household (1 = yes)    
        
    
        
            r8bpavgs  blood pressure  None                        average systolic BP                
        
    
        
            r8bpavgp  pulse          None                        average pulse                      
        
    
        
            r8smoken  smoker         None                        currently smokes cigarettes        
        
    

... (3 rows omitted)



In [29]:

    
def table_lookup(table,key_col,key,map_col):
    row = np.where(table[key_col]==key)
    if len(row[0]) == 1:
        return table[map_col][row[0]][0]
    else:
        return -1



In [30]:

    
def map_raw_table(raw_table,map_table):
    mapped = Table()
    for raw_label in raw_table :
        if raw_label in map_table["raw label"] :
            new_label = table_lookup(map_table,'raw label',raw_label,'label')
            encoding = table_lookup(map_table,'raw label',raw_label,'encoding')
            if encoding is None :
                mapped[new_label] = raw_table[raw_label]
            else:
                mapped[new_label] = raw_table.apply(lambda x: encoding[x-1], raw_label)
    return mapped



In [31]:

    
# create a more usable table by mapping the raw to finished
health = map_raw_table(hrec06,health_map)
health









    Out[31]:





    
        
            id  age  gender  race  hispanic  education  couple  blood pressure  pulse  smoker  exercise  weight  height
        
    
    
        
            3010     70   male   white  0        12        1      140            58    0      4        74.1623  1.64465
        
    
        
            3020     67   female  white  0        16        1      139            72    0      1        66.9048  1.61925
        
    
        
            10003030  50   female  white  0        16        0      124            98    0      2        64.4101  1.5494 
        
    
        
            10004010  66   male   white  0        16        1      130            74    0      2        101.605  1.8415 
        
    
        
            10004040  60   female  white  0        12        1      125            67    0      2        77.3374  1.6383 
        
    
        
            10013010  68   male   white  0        12        0      125            67    0      5        110.223  1.7272 
        
    
        
            10038010  70   male   white  0        16        1      107            52    0      2        73.4819  1.70815
        
    
        
            10038040  63   female  white  0        16        1      112            70    0      2        65.3173  1.67005
        
    
        
            10050010  64   female  other  0        17        0      136            90    0      5        75.9767  1.5875 
        
    
        
            10059020  70   female  white  0        16        1      112            54    1      2        58.967  1.6637 
        
    

... (6332 rows omitted)

Descriptive statistics - smoking



In [32]:

    
def firstQtile(x) : return np.percentile(x,25)
def thirdQtile(x) : return np.percentile(x,25)
summary_ops = (min, firstQtile, np.median, np.mean, thirdQtile, max, sum)



In [33]:

    
# Let's try what is the effect of smoking
smokers = health.where('smoker',1)
nosmokers = health.where('smoker',0)
print(smokers.num_rows, ' smokers')
print(nosmokers.num_rows, ' non-smokers')









    



893  smokers
5449  non-smokers



In [34]:

    
smokers.stats(summary_ops)









    Out[34]:





    
        
            statistic  id  age  gender  race  hispanic  education  couple  blood pressure  pulse  smoker  exercise  weight  height
        
    
    
        
            min        1.0059e+07  30      female  black  0         0         0       68             45      1      1        36.0606  1.4224 
        
    
        
            firstQtile  4.8508e+07  55                   0         11        0       114            66      1      2        64.1833  1.6002 
        
    
        
            median     1.15627e+08  62                   0         12        1       129            74      1      3        75.2963  1.67005
        
    
        
            mean       1.98972e+08  62.0728               0.0615901  12.1299   0.62598  130.354        74.5801  1      2.98768  76.7292  1.67348
        
    
        
            thirdQtile  4.8508e+07  55                   0         11        0       114            66      1      2        64.1833  1.6002 
        
    
        
            max        5.02753e+08  87      male   white  1         17        1       217            119     1      5        134.263  1.9685 
        
    
        
            sum        1.77682e+11  55431                55        10832     559     116406         66600   893    2668     68519.2  1494.42



In [35]:

    
nosmokers.stats(summary_ops)









    Out[35]:





    
        
            statistic  id  age  gender  race  hispanic  education  couple  blood pressure  pulse  smoker  exercise  weight  height
        
    
    
        
            min        3010        31      female  black  0        0         0        79             34      0      1        32.205  1.36525
        
    
        
            firstQtile  4.5397e+07  60                   0        12        0        117            62      0      2        68.0388  1.5875 
        
    
        
            median     1.10427e+08  68                   0        12        1        129            69      0      2        79.6054  1.65735
        
    
        
            mean       1.65713e+08  67.4986               0.073408  12.9648   0.715728  130.934        69.4542  0      2.69114  81.2584  1.66315
        
    
        
            thirdQtile  4.5397e+07  60                   0        12        0        117            62      0      2        68.0388  1.5875 
        
    
        
            max        5.02759e+08  90      male   white  1        17        1        224            133     0      5        137.892  1.96215
        
    
        
            sum        9.02972e+11  367800               400      70645     3900     713462         378456  0      14664    442777  9062.52



In [36]:

    
help(smokers.hist)









    



Help on method hist in module datascience.tables:

hist(select=None, overlay=True, bins=None, counts=None, unit=None, **vargs) method of datascience.tables.Table instance
    Plots one histogram for each column in the table.
    
    Every column must be numerical.
    
    Kwargs:
        overlay (bool): If True, plots 1 chart with all the histograms
            overlaid on top of each other (instead of the default behavior
            of one histogram for each column in the table). Also adds a
            legend that matches each bar color to its column.
    
        bins (column name or list): Lower bound for each bin in the
            histogram. If None, bins will be chosen automatically.
    
        counts (column name or column): A column of counted values.
            All other columns are treated as counts of these values.
            If None, each value in each row is assigned a count of 1.
    
        vargs: Additional arguments that get passed into :func:plt.hist.
            See http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hist
            for additional arguments that can be passed into vargs. These
            include: `range`, `normed`, `cumulative`, and `orientation`,
            to name a few.
    
    >>> t = Table().with_columns([
    ...     'count',  [9, 3, 3, 1],
    ...     'points', [1, 2, 2, 10]])
    >>> t
    count | points
    9     | 1
    3     | 2
    3     | 2
    1     | 10
    >>> t.hist() # doctest: +SKIP
    <histogram of values in count>
    <histogram of values in points>
    
    >>> t = Table().with_columns([
    ...     'value',      [101, 102, 103],
    ...     'proportion', [0.25, 0.5, 0.25]])
    >>> t.hist(counts='value') # doctest: +SKIP
    <histogram of values in prop weighted by corresponding values in value>

What is the effect of smoking on weight?



In [37]:

    
smokers.hist('weight', bins=20)



In [38]:

    
nosmokers.hist('weight', bins=20)



In [39]:

    
np.mean(nosmokers['weight'])-np.mean(smokers['weight'])









    Out[39]:





4.5292127574925019

Permutation tests



In [42]:

    
# Lets draw two samples of equal size
n_sample = 200
smoker_sample = smokers.sample(n_sample)
nosmoker_sample = nosmokers.sample(n_sample)
weight = Table().with_columns([('NoSmoke', nosmoker_sample['weight']),('Smoke', smoker_sample['weight'])])
weight.hist(overlay=True,bins=30,normed=True)



In [43]:

    
weight.stats(summary_ops)









    Out[43]:





    
        
            statistic  NoSmoke  Smoke
        
    
    
        
            min        41.2769  41.2769
        
    
        
            firstQtile  70.08   64.1833
        
    
        
            median     80.059  73.0283
        
    
        
            mean       81.0979  76.7909
        
    
        
            thirdQtile  70.08   64.1833
        
    
        
            max        131.088  127.006
        
    
        
            sum        16219.6  15358.2

Is the difference observed between these samples representative of the larger population?



In [44]:

    
combined = Table().with_column('all', np.append(nosmoker_sample['weight'],smoker_sample['weight']))



In [45]:

    
combined.num_rows









    Out[45]:





400



In [46]:

    
# permutation test, split the combined into two random groups, do the comparison of those
def getdiff():
    A,B = combined.split(n_sample)
    return (np.mean(A['all'])-np.mean(B['all']))



In [47]:

    
# Do the permutation many times and form the distribution of results
num_samples = 300
diff_samples = Table().with_column('diffs', [getdiff() for i in range(num_samples)])
diff_samples.hist(bins=np.arange(-5,5,0.5), normed=True)

The 4.5 kg difference is certainly not an artifact of the sample we started with. The smokers definitely weigh less. At the same time, these are not light people in this study. Better go back and understand what was the purpose of the study that led to the selection of these six thousand individuals.

Other Factors



In [48]:

    
# A sense of the overall population represented - older
health.select(['age','education']).hist(bins=20)



In [49]:

    
# How does education correlate with age?
health.select(['age','education']).scatter('age', fit_line=True)



In [50]:

    
health.pivot_hist('race','education',normed=True)



In [51]:

    
# How are races represented in the dataset and how does hispanic overlay the three?
race = health.select(['race', 'hispanic'])   
race['count']=1
by_race = race.group('race',sum)
by_race['race frac'] = by_race['count sum']/np.sum(by_race['count sum'])
by_race['hisp frac'] = by_race['hispanic sum'] / by_race['count sum']
by_race









    Out[51]:





    
        
            race  hispanic sum  count sum  race frac  hisp frac
        
    
    
        
            black  10           766       0.120782  0.0130548
        
    
        
            other  148          279       0.0439924  0.530466 
        
    
        
            white  297          5297      0.835225  0.0560695



In [52]:

    
health.select(['height','weight']).scatter('height','weight',fit_line=True)

hhidpn	r8agey_m	ragender	raracem	raedyrs	h8cpl	h8atota	rayshlt	r8shlt	r8expyrs	r8cesd	r8bpavgs	r8bpavgp	r8smoken	r8mdactx	r8weightbio	r8weight	r8heightbio	r8height
3010	70	1	1	12	1	914000	3	3	21.5	0	140	58	0	4	74.1623	71.6672	1.64465	1.6256
3020	67	2	1	16	1	914000	1	3	23.7	0	139	72	0	1	66.9048	65.317	1.61925	1.6256
10003030	50	2	1	16	0	12000	3	5	47.5	4	124	98	0	2	64.4101	58.9667	1.5494	1.5748
10004010	66	1	1	16	1	1.832e+06	1	4	7	1	130	74	0	2	101.605	102.511	1.8415	1.8542
10004040	60	2	1	12	1	1.832e+06	1	2	28.25	0	125	67	0	2	77.3374	77.1103	1.6383	1.651
10013010	68	1	1	12	0	50	1	3	16.5	2	125	67	0	5	110.223	108.862	1.7272	1.7272
10038010	70	1	1	16	1	2.5e+06	2	2	22.5	1	107	52	0	2	73.4819	74.8423	1.70815	1.7526
10038040	63	2	1	16	1	2.5e+06	1	1	27.05	1	112	70	0	2	65.3173	64.4098	1.67005	1.6764
10050010	64	2	3	17	0	664066	1	2	27.025	1	136	90	0	5	75.9767	68.0385	1.5875	1.6256
10059020	70	2	1	16	1	1.2405e+07	2	3	19.5	0	112	54	1	2	58.967	57.6059	1.6637	1.6764

raw label	label	encoding	Description
hhidpn	id	None	identifier
r8agey_m	age	None	age in years in wave 8
ragender	gender	['male', 'female']	1 = male, 2 = female)
raracem	race	['white', 'black', 'other']	(1 = white, 2 = black, 3 = other)
rahispan	hispanic	None	(1 = yes)
raedyrs	education	None	education in years
h8cpl	couple	None	in a couple household (1 = yes)
r8bpavgs	blood pressure	None	average systolic BP
r8bpavgp	pulse	None	average pulse
r8smoken	smoker	None	currently smokes cigarettes

id	age	gender	race	education	couple	blood pressure	pulse	smoker	exercise	weight	height
3010	70	male	white	12	1	140	58	0	4	74.1623	1.64465
3020	67	female	white	16	1	139	72	0	1	66.9048	1.61925
10003030	50	female	white	16	0	124	98	0	2	64.4101	1.5494
10004010	66	male	white	16	1	130	74	0	2	101.605	1.8415
10004040	60	female	white	12	1	125	67	0	2	77.3374	1.6383
10013010	68	male	white	12	0	125	67	0	5	110.223	1.7272
10038010	70	male	white	16	1	107	52	0	2	73.4819	1.70815
10038040	63	female	white	16	1	112	70	0	2	65.3173	1.67005
10050010	64	female	other	17	0	136	90	0	5	75.9767	1.5875
10059020	70	female	white	16	1	112	54	1	2	58.967	1.6637

statistic	id	age	gender	race	hispanic	education	couple	blood pressure	pulse	smoker	exercise	weight	height
min	1.0059e+07	30	female	black	0	0	0	68	45	1	1	36.0606	1.4224
firstQtile	4.8508e+07	55			0	11	0	114	66	1	2	64.1833	1.6002
median	1.15627e+08	62			0	12	1	129	74	1	3	75.2963	1.67005
mean	1.98972e+08	62.0728			0.0615901	12.1299	0.62598	130.354	74.5801	1	2.98768	76.7292	1.67348
thirdQtile	4.8508e+07	55			0	11	0	114	66	1	2	64.1833	1.6002
max	5.02753e+08	87	male	white	1	17	1	217	119	1	5	134.263	1.9685
sum	1.77682e+11	55431			55	10832	559	116406	66600	893	2668	68519.2	1494.42

statistic	NoSmoke	Smoke
min	41.2769	41.2769
firstQtile	70.08	64.1833
median	80.059	73.0283
mean	81.0979	76.7909
thirdQtile	70.08	64.1833
max	131.088	127.006
sum	16219.6	15358.2

race	hispanic sum	count sum	race frac	hisp frac
black	10	766	0.120782	0.0130548
other	148	279	0.0439924	0.530466
white	297	5297	0.835225	0.0560695