In [1]:
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [2]:
# Read in the data from health records survey as a raw table
# Public data from the UC Michigan Institute for Social Research
# Health and Retirement Survey - rsonline.isr.umich.edu
#
hrec06 = Table.read_table("./hrsextract06.csv")

In [3]:
# the raw table - pretty yechy
hrec06


Out[3]:
hhidpn r8agey_m ragender raracem rahispan raedyrs h8cpl h8atota rayshlt r8shlt r8expyrs r8cesd r8bpavgs r8bpavgp r8smoken r8mdactx r8weightbio r8weight r8heightbio r8height
3010 70 1 1 0 12 1 914000 3 3 21.5 0 140 58 0 4 74.1623 71.6672 1.64465 1.6256
3020 67 2 1 0 16 1 914000 1 3 23.7 0 139 72 0 1 66.9048 65.317 1.61925 1.6256
10003030 50 2 1 0 16 0 12000 3 5 47.5 4 124 98 0 2 64.4101 58.9667 1.5494 1.5748
10004010 66 1 1 0 16 1 1.832e+06 1 4 7 1 130 74 0 2 101.605 102.511 1.8415 1.8542
10004040 60 2 1 0 12 1 1.832e+06 1 2 28.25 0 125 67 0 2 77.3374 77.1103 1.6383 1.651
10013010 68 1 1 0 12 0 50 1 3 16.5 2 125 67 0 5 110.223 108.862 1.7272 1.7272
10038010 70 1 1 0 16 1 2.5e+06 2 2 22.5 1 107 52 0 2 73.4819 74.8423 1.70815 1.7526
10038040 63 2 1 0 16 1 2.5e+06 1 1 27.05 1 112 70 0 2 65.3173 64.4098 1.67005 1.6764
10050010 64 2 3 0 17 0 664066 1 2 27.025 1 136 90 0 5 75.9767 68.0385 1.5875 1.6256
10059020 70 2 1 0 16 1 1.2405e+07 2 3 19.5 0 112 54 1 2 58.967 57.6059 1.6637 1.6764

... (6332 rows omitted)


In [ ]:
# Create a table that provides the mapping and decoding to a more readable form
health_map = Table.from_rows(
    [["hhidpn",  "id", None, "identifier"],
    ["r8agey_m", "age", None, "age in years in wave 8"],
    ["ragender", "gender", ['male','female'], "1 = male,  2 = female)"],
    ["raracem",  "race",   ['white','black','other'], "(1 = white,  2 = black,  3 = other)"],
    ["rahispan", "hispanic",  None, "(1 = yes)"],
    ["raedyrs",  "education", None, "education in years"],
    ["h8cpl",    "couple",    None, "in a couple household (1 = yes)"],
    ["r8bpavgs", "blood pressure", None,"average systolic BP"],
    ["r8bpavgp", "pulse", None, "average pulse"],
    ["r8smoken", "smoker",None, "currently smokes cigarettes"],
    ["r8mdactx", "exercise", None, "frequency of moderate exercise (1=everyday, 2=>1perweek, 3=1perweek, 4=1-3permonth\
, 5=never)"],
    ["r8weightbio", "weight", None, "objective weight in kg"],
    ["r8heightbio","height", None, "objective height in m"],
    ], 
    ["raw label", "label", "encoding", "Description"])
health_map

In [ ]:
def table_lookup(table,key_col,key,map_col):
    row = np.where(table[key_col]==key)
    if len(row[0]) == 1:
        return table[map_col][row[0]][0]
    else:
        return -1

In [ ]:
def map_raw_table(raw_table,map_table):
    mapped = Table()
    for raw_label in raw_table :
        if raw_label in map_table["raw label"] :
            new_label = table_lookup(map_table,'raw label',raw_label,'label')
            encoding = table_lookup(map_table,'raw label',raw_label,'encoding')
            if encoding is None :
                mapped[new_label] = raw_table[raw_label]
            else:
                mapped[new_label] = raw_table.apply(lambda x: encoding[x-1], raw_label)
    return mapped

In [ ]:
# create a more usable table by mapping the raw to finished
health = map_raw_table(hrec06,health_map)
health

In [ ]:
# Let's try what is the effect of smoking
smokers = health.where('smoker',1)
nosmokers = health.where('smoker',0)
print(smokers.num_rows, ' smokers')
print(nosmokers.num_rows, ' non-smokers')

In [ ]:
def firstQtile(x) : return np.percentile(x,25)
def thirdQtile(x) : return np.percentile(x,25)
summary_ops = (min, firstQtile, np.mean, thirdQtile, max)

In [ ]:
smokers.stats(summary_ops)

In [ ]:
nosmokers.stats(summary_ops)

It would appear that nosmokers are older, more educated, more 'coupled' and heavier - but of similar height and blood pressure


In [ ]:
# Lets draw two samples of equal size
n_sample = 200
smoker_sample = smokers.sample(n_sample)
nosmoker_sample = nosmokers.sample(n_sample)

In [ ]:
weight = Table([nosmoker_sample['weight'],smoker_sample['weight']],['NoSmoke','Smoke'])
weight.hist(overlay=True,bins=30,normed=True)

In [ ]:
bins=np.arange(39,139,5)
weight_dist = weight.bin(bins=bins, normed=True)
weight_dist['diff']=weight_dist['NoSmoke density']-weight_dist['Smoke density']
print('TVD: ',sum(np.abs(weight_dist['diff'])))
weight_dist.show()

In [ ]:
weight_dist.select(['bin','diff']).bar('bin')

In [ ]:
weight.stats(summary_ops)

In [ ]:
np.mean(weight['NoSmoke'])-np.mean(weight['Smoke'])

Is the difference observed between these samples representative of the larger population?


In [ ]:
combined = Table([np.append(nosmoker_sample['weight'],smoker_sample['weight'])],['all'])

In [ ]:
# permutation test, split the combined into two random groups, do the comparison of those
def getdiff():
    A,B = combined.split(300)
    return (np.mean(A['all'])-np.mean(B['all']))

In [ ]:
# Do the permutation many times and form the distribution of results
num_samples = 100
diff_samples = Table([[getdiff() for i in range(num_samples)]],['diffs'])

In [ ]:
diff_samples.hist(bins=20, normed=True)

In [ ]: