In [1]:
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
In [2]:
# Load the raw health-records survey extract into a datascience Table.
# Public data from the University of Michigan Institute for Social Research,
# Health and Retirement Study (HRS) - hrsonline.isr.umich.edu
# The CSV path is relative to the notebook's working directory.
hrec06 = Table.read_table("./hrsextract06.csv")
In [3]:
# Show the raw table as loaded - its columns still carry the survey's coded names
hrec06
Out[3]:
In [ ]:
# Create a table that provides the mapping and decoding to a more readable form.
# One row per raw survey column that should be kept:
#   "raw label"   - column name in the raw extract
#   "label"       - readable column name used in the mapped table
#   "encoding"    - None to copy the values unchanged, or a list of category
#                   names that map_raw_table indexes as encoding[code - 1]
#   "Description" - free-text note about the variable (not used by the code)
health_map = Table.from_rows(
[["hhidpn", "id", None, "identifier"],
["r8agey_m", "age", None, "age in years in wave 8"],
["ragender", "gender", ['male','female'], "1 = male, 2 = female)"],
["raracem", "race", ['white','black','other'], "(1 = white, 2 = black, 3 = other)"],
["rahispan", "hispanic", None, "(1 = yes)"],
["raedyrs", "education", None, "education in years"],
["h8cpl", "couple", None, "in a couple household (1 = yes)"],
["r8bpavgs", "blood pressure", None,"average systolic BP"],
["r8bpavgp", "pulse", None, "average pulse"],
["r8smoken", "smoker",None, "currently smokes cigarettes"],
["r8mdactx", "exercise", None, "frequency of moderate exercise (1=everyday, 2=>1perweek, 3=1perweek, 4=1-3permonth\
, 5=never)"],
["r8weightbio", "weight", None, "objective weight in kg"],
["r8heightbio","height", None, "objective height in m"],
],
["raw label", "label", "encoding", "Description"])
health_map
In [ ]:
def table_lookup(table, key_col, key, map_col):
    """Look up the map_col value of the single row whose key_col equals key.

    Returns the value from table[map_col] at the matching position when
    exactly one entry of table[key_col] equals key. Returns -1 when there is
    no match or more than one match.
    """
    # Positions in the key column that hold the requested key
    hits = np.where(table[key_col] == key)[0]
    if len(hits) != 1:
        return -1
    return table[map_col][hits][0]
In [ ]:
def map_raw_table(raw_table, map_table):
    """Build a new Table containing the mapped columns of raw_table.

    Only columns whose label appears in map_table's "raw label" column are
    kept. Each kept column is stored under the matching "label" entry. When
    the matching "encoding" entry is None the column is copied unchanged;
    otherwise every value v is replaced by encoding[v - 1].
    """
    result = Table()
    for source_label in raw_table:
        # Skip raw columns that the mapping table does not mention
        if source_label not in map_table["raw label"]:
            continue
        target_label = table_lookup(map_table, 'raw label', source_label, 'label')
        codes = table_lookup(map_table, 'raw label', source_label, 'encoding')
        if codes is not None:
            def decode(value):
                # Raw codes start at 1, list positions at 0
                return codes[value - 1]
            result[target_label] = raw_table.apply(decode, source_label)
        else:
            result[target_label] = raw_table[source_label]
    return result
In [ ]:
# Create the working table: only the columns listed in health_map are kept,
# stored under their readable labels, with coded categories decoded
health = map_raw_table(hrec06,health_map)
health
In [ ]:
# Explore the effect of smoking: partition on the 'smoker' column
# (value 1 -> smokers, value 0 -> nosmokers) and report the group sizes
smokers = health.where('smoker',1)
nosmokers = health.where('smoker',0)
print(smokers.num_rows, ' smokers')
print(nosmokers.num_rows, ' non-smokers')
In [ ]:
def firstQtile(x):
    """Return the first quartile (25th percentile) of the values in x."""
    return np.percentile(x, 25)


def thirdQtile(x):
    """Return the third quartile (75th percentile) of the values in x."""
    return np.percentile(x, 75)


# Summary functions in ascending order: minimum, first quartile, mean,
# third quartile, maximum. Passed to Table.stats in the cells below.
summary_ops = (min, firstQtile, np.mean, thirdQtile, max)
In [ ]:
# Apply the summary_ops functions to the smokers table
smokers.stats(summary_ops)
In [ ]:
# Apply the same summary_ops functions to the non-smokers table for comparison
nosmokers.stats(summary_ops)
It would appear that non-smokers are older, more educated, more 'coupled' and heavier, but of similar height and blood pressure.
In [ ]:
# Draw one random sample of n_sample rows from each group so both samples are the same size.
# NOTE(review): no random seed is set in the visible cells, so these samples
# (and everything derived from them) change on every run.
n_sample = 200
smoker_sample = smokers.sample(n_sample)
nosmoker_sample = nosmokers.sample(n_sample)
In [ ]:
# Put the two samples' weights side by side (one column per group)
# and overlay their normalized histograms
weight = Table([nosmoker_sample['weight'],smoker_sample['weight']],['NoSmoke','Smoke'])
weight.hist(overlay=True,bins=30,normed=True)
In [ ]:
# Bin both weight samples on the same equal-width bins; normed=True produces
# the '<label> density' columns used below.
bins=np.arange(39,139,5)
weight_dist = weight.bin(bins=bins, normed=True)
# Per-bin difference in density between the two samples (NoSmoke minus Smoke)
weight_dist['diff']=weight_dist['NoSmoke density']-weight_dist['Smoke density']
# Total variation distance is half the summed absolute difference in bin
# proportions; a density becomes a proportion when multiplied by the bin width.
print('TVD: ',0.5*(bins[1]-bins[0])*sum(np.abs(weight_dist['diff'])))
weight_dist.show()
In [ ]:
# Bar chart of the per-bin density difference (NoSmoke minus Smoke)
weight_dist.select(['bin','diff']).bar('bin')
In [ ]:
# Apply the summary_ops functions to the two weight samples
weight.stats(summary_ops)
In [ ]:
# Observed difference in mean weight between the two samples (NoSmoke minus Smoke)
np.mean(weight['NoSmoke'])-np.mean(weight['Smoke'])
Is the difference observed between these samples representative of the larger population?
In [ ]:
# Pool the weights of both samples into a single column 'all'
# (non-smoker sample first, then smoker sample)
combined = Table([np.append(nosmoker_sample['weight'],smoker_sample['weight'])],['all'])
In [ ]:
# permutation test, split the combined into two random groups, do the comparison of those
def getdiff():
    """Return the difference in mean weight between two random halves of `combined`.

    `combined` pools two samples of equal size, so the random split must also
    produce two groups of equal size for the simulated differences to be
    comparable with the observed one.
    """
    # First group gets half of the pooled rows, the second gets the remainder
    A, B = combined.split(combined.num_rows // 2)
    return (np.mean(A['all']) - np.mean(B['all']))
In [ ]:
# Do the permutation many times and form the distribution of results:
# one getdiff() value per repetition, collected in the 'diffs' column
num_samples = 100
diff_samples = Table([[getdiff() for i in range(num_samples)]],['diffs'])
In [ ]:
# Normalized histogram of the simulated differences in means
diff_samples.hist(bins=20, normed=True)
In [ ]: