In [ ]:
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
In [ ]:
# Read in data from health records survey as a raw table
# Public data from the University of Michigan Institute for Social Research
# Health and Retirement Survey - rsonline.isr.umich.edu
# NOTE(review): the URL above may be missing a leading 'h' (hrsonline) - confirm
#
hrec06 = Table.read_table("./hrsextract06.csv")
# Show the raw table as the cell output
hrec06
In [ ]:
# Map from raw survey column names to friendly labels.
# Each row is [raw label, label, encoding, Description].  An encoding of None
# means the raw values are kept unchanged; a list gives the category names
# for 1-based integer codes (see map_raw_table, which indexes encoding[x-1]).
health_map = Table(["raw label", "label", "encoding", "Description"]).with_rows([
    ["hhidpn", "id", None, "identifier"],
    ["r8agey_m", "age", None, "age in years in wave 8"],
    # Description parenthesis balanced to match the other coded columns.
    ["ragender", "gender", ['male', 'female'], "(1 = male, 2 = female)"],
    ["raracem", "race", ['white', 'black', 'other'], "(1 = white, 2 = black, 3 = other)"],
    ["rahispan", "hispanic", None, "(1 = yes)"],
    ["raedyrs", "education", None, "education in years"],
    ["h8cpl", "couple", None, "in a couple household (1 = yes)"],
    ["r8bpavgs", "blood pressure", None, "average systolic BP"],
    ["r8bpavgp", "pulse", None, "average pulse"],
    ["r8smoken", "smoker", None, "currently smokes cigarettes"],
    # Adjacent string literals replace the backslash continuation, which
    # would silently embed whitespace in the text if the line were indented.
    ["r8mdactx", "exercise", None,
     ("frequency of moderate exercise (1=everyday, 2=>1perweek, 3=1perweek, "
      "4=1-3permonth, 5=never)")],
    ["r8weightbio", "weight", None, "objective weight in kg"],
    ["r8heightbio", "height", None, "objective height in m"],
])
# Show the mapping table as the cell output
health_map
In [ ]:
def table_lookup(table,key_col,key,map_col):
    """Return the ``map_col`` value of the single row whose ``key_col`` equals ``key``.

    ``table`` is indexed by column name and each column is compared and
    indexed as a numpy array.  If no row, or more than one row, matches
    ``key``, the sentinel ``-1`` is returned instead.
    """
    matches = np.where(table[key_col] == key)[0]
    # Only an unambiguous (exactly one) match yields a value.
    if len(matches) != 1:
        return -1
    return table[map_col][matches][0]
In [ ]:
def map_raw_table(raw_table,map_table):
    """Build a new Table containing the mapped columns of ``raw_table``.

    Each label of ``raw_table`` found in the "raw label" column of
    ``map_table`` is copied under the corresponding "label".  If the map
    row's "encoding" is None the values are copied unchanged; otherwise
    every value ``x`` is replaced by ``encoding[x-1]``.  Labels with no
    entry in ``map_table`` are left out of the result.
    """
    result = Table()
    known_labels = map_table["raw label"]
    for source_label in raw_table:
        if source_label not in known_labels:
            continue
        target_label = table_lookup(map_table, 'raw label', source_label, 'label')
        categories = table_lookup(map_table, 'raw label', source_label, 'encoding')
        if categories is not None:
            # Codes are 1-based, so shift by one to index the category list.
            result[target_label] = raw_table.apply(
                lambda code: categories[code - 1], source_label)
        else:
            result[target_label] = raw_table[source_label]
    return result
In [ ]:
# Create a more usable table by mapping the raw columns to finished ones:
# columns listed in health_map are renamed (and decoded where an encoding is
# given); map_raw_table leaves out any raw column not listed there.
health = map_raw_table(hrec06,health_map)
# Show the mapped table as the cell output
health
In [ ]:
def firstQtile(x):
    """Return the first quartile (25th percentile) of ``x``."""
    return np.percentile(x, 25)


def thirdQtile(x):
    """Return the third quartile (75th percentile) of ``x``."""
    # Previously this used 25, which made it a duplicate of firstQtile.
    return np.percentile(x, 75)


# Operations applied column by column when summarising a table.
summary_ops = (min, firstQtile, np.median, np.mean, thirdQtile, max, sum)
In [ ]:
# Let's look at the effect of smoking.
# Rows whose 'smoker' value is 1 go into `smokers`, rows with 0 into `nosmokers`.
smokers = health.where('smoker',1)
nosmokers = health.where('smoker',0)
# Report the size of each group
print(smokers.num_rows, ' smokers')
print(nosmokers.num_rows, ' non-smokers')
In [ ]:
# Summarise the smokers table with the operations in summary_ops
smokers.stats(summary_ops)
In [ ]:
# Same summary for the non-smokers table, for comparison
nosmokers.stats(summary_ops)
In [ ]:
# Print the documentation of the table's hist method
help(smokers.hist)
In [ ]:
# Histogram of the smokers' 'weight' column using 20 bins
smokers.hist('weight', bins=20)
In [ ]:
# Histogram of the non-smokers' 'weight' column using 20 bins
nosmokers.hist('weight', bins=20)
In [ ]:
# Difference in mean weight over the full groups: non-smokers minus smokers
np.mean(nosmokers['weight'])-np.mean(smokers['weight'])
In [ ]:
# Let's draw two samples of equal size
# NOTE(review): whether sample() draws with or without replacement depends on
# the datascience version's default - confirm.
n_sample = 200
smoker_sample = smokers.sample(n_sample)
nosmoker_sample = nosmokers.sample(n_sample)
# Put the sampled weights side by side, one column per group
weight = Table().with_columns([('NoSmoke', nosmoker_sample['weight']),('Smoke', smoker_sample['weight'])])
# Overlay the two weight histograms on shared axes
weight.hist(overlay=True,bins=30,normed=True)
In [ ]:
# Summarise both sampled weight columns with the operations in summary_ops
weight.stats(summary_ops)
Is the difference observed between these samples representative of the larger population?
In [ ]:
# Pool the sampled weights into one column 'all': non-smokers first, then smokers
combined = Table().with_column('all', np.append(nosmoker_sample['weight'],smoker_sample['weight']))
In [ ]:
# Sanity check on the pooled size (should be the two sample sizes added together)
combined.num_rows
In [ ]:
# Permutation test step: randomly split the pooled weights into two groups
# and compare the groups' means.
def getdiff():
    """Return mean('all') of the first split group minus that of the second.

    Splits the global ``combined`` table with ``combined.split(n_sample)``.
    """
    first_group, second_group = combined.split(n_sample)
    first_mean = np.mean(first_group['all'])
    second_mean = np.mean(second_group['all'])
    return first_mean - second_mean
In [ ]:
# Do the permutation many times and form the distribution of results
num_samples = 300
diff_samples = Table().with_column('diffs', [getdiff() for _ in range(num_samples)])
# np.arange excludes its stop value, so stop at 5.5 to get bin edges that run
# symmetrically from -5 to 5; stopping at 5 dropped differences in [4.5, 5).
diff_samples.hist(bins=np.arange(-5, 5.5, 0.5), normed=True)
In [ ]:
In [ ]:
# A sense of the overall population represented - older.
# Histograms of the 'age' and 'education' columns, 20 bins each.
health.select(['age','education']).hist(bins=20)
In [ ]:
# How does education correlate with age?
# Scatter of the selected columns with 'age' on the x-axis, plus a fitted line.
health.select(['age','education']).scatter('age', fit_line=True)
In [ ]:
# Normalised histograms of 'education', pivoted on 'race'
# NOTE(review): presumably one distribution per race value - confirm against pivot_hist docs
health.pivot_hist('race','education',normed=True)
In [ ]:
# How are races represented in the dataset and how does hispanic overlay the three?
race = health.select(['race', 'hispanic'])
# Constant column of ones so that summing it per group yields a row count
race['count']=1
# Group by race, summing the other columns ('hispanic sum', 'count sum')
by_race = race.group('race',sum)
# Share of all rows that fall in each race group
by_race['race frac'] = by_race['count sum']/np.sum(by_race['count sum'])
# Within each race group: summed 'hispanic' values divided by the group's row count
by_race['hisp frac'] = by_race['hispanic sum'] / by_race['count sum']
# Show the per-race summary as the cell output
by_race
In [ ]:
# Scatter of 'weight' against 'height' with a fitted line
health.select(['height','weight']).scatter('height','weight',fit_line=True)
In [ ]: