In [1]:
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
In [3]:
# Load the raw health-records survey extract into a Table.
# Source: public data from the University of Michigan Institute for Social
# Research, Health and Retirement Survey - rsonline.isr.umich.edu
# NOTE(review): the path is relative, so the notebook must be run from the
# directory that contains ./data/hrsextract06.csv.
hrec06 = Table.read_table("./data/hrsextract06.csv")
# Display the raw table so its original column labels can be inspected.
hrec06
Out[3]:
They say "all problems in computer science can be solved with an extra level of indirection." Indirection certainly provides some real leverage in data wrangling. Rather than write a bunch of spaghetti code, we will build a table that defines the transformation we would like to perform on the raw data in order to have something cleaner to work with. In this table we can map the indecipherable identifiers to something more understandable, establish formatters, translate field encodings into clear mnemonics, and so on.
We need a tool for finding elements in the translation table; that's `table_lookup`. Then we can build our mapping tool, `map_raw_table`.
In [28]:
# Translation table that drives map_raw_table: one row per raw survey column to keep.
#   "raw label"   - column label as it appears in the raw extract
#   "label"       - readable label used for that column in the cleaned table
#   "encoding"    - None to copy the values unchanged, or a list of names that
#                   map_raw_table indexes with (raw code - 1), so raw code 1
#                   selects the first entry
#   "Description" - free-text note about the field; map_raw_table does not read it
health_map = Table(["raw label", "label", "encoding", "Description"]).with_rows(
[["hhidpn", "id", None, "identifier"],
["r8agey_m", "age", None, "age in years in wave 8"],
["ragender", "gender", ['male','female'], "1 = male, 2 = female)"],
["raracem", "race", ['white','black','other'], "(1 = white, 2 = black, 3 = other)"],
["rahispan", "hispanic", None, "(1 = yes)"],
["raedyrs", "education", None, "education in years"],
["h8cpl", "couple", None, "in a couple household (1 = yes)"],
["r8bpavgs", "blood pressure", None,"average systolic BP"],
["r8bpavgp", "pulse", None, "average pulse"],
["r8smoken", "smoker",None, "currently smokes cigarettes"],
["r8mdactx", "exercise", None, "frequency of moderate exercise (1=everyday, 2=>1perweek, 3=1perweek, 4=1-3permonth\
, 5=never)"],
["r8weightbio", "weight", None, "objective weight in kg"],
["r8heightbio","height", None, "objective height in m"]])
# Display the mapping table for reference.
health_map
Out[28]:
In [29]:
def table_lookup(table, key_col, key, map_col):
    """Look up the ``map_col`` value on the single row where ``key_col == key``.

    Parameters
    ----------
    table : mapping of column label -> array
        Anything indexable by column label whose columns support element-wise
        ``==`` and positional array indexing.
    key_col : column label searched for ``key``.
    key : value to look for in ``key_col``.
    map_col : column label whose value is returned.

    Returns
    -------
    The ``map_col`` entry of the matching row, or ``-1`` when ``key`` matches
    no row or more than one row.
    """
    matches = np.where(table[key_col] == key)
    hit_positions = matches[0]
    # Anything other than exactly one hit is treated as "not found".
    if len(hit_positions) != 1:
        return -1
    return table[map_col][hit_positions][0]
In [30]:
def map_raw_table(raw_table, map_table):
    """Build a cleaned table from ``raw_table`` as directed by ``map_table``.

    Each label produced by iterating ``raw_table`` is checked against the
    "raw label" column of ``map_table``; labels not listed there are dropped.
    A listed column is stored in the result under its "label" entry. When its
    "encoding" entry is ``None`` the values are copied unchanged; otherwise
    every value ``x`` is replaced by ``encoding[x - 1]``.

    Columns appear in the result in the order ``raw_table`` yields them.

    Returns
    -------
    A new ``Table`` holding the relabelled (and, where requested, decoded)
    columns.
    """
    result = Table()
    known_raw_labels = map_table["raw label"]
    for source_label in raw_table:
        # Skip raw columns the mapping table does not mention.
        if source_label not in known_raw_labels:
            continue
        target_label = table_lookup(map_table, 'raw label', source_label, 'label')
        code_names = table_lookup(map_table, 'raw label', source_label, 'encoding')
        if code_names is not None:
            # Raw codes are 1-based, so shift by one to index the name list.
            def decode(code):
                return code_names[code - 1]
            result[target_label] = raw_table.apply(decode, source_label)
        else:
            result[target_label] = raw_table[source_label]
    return result
In [31]:
# Create a more usable table: keep only the raw columns listed in health_map,
# relabel them, and decode the ones that have an encoding.
health = map_raw_table(hrec06,health_map)
# Display the cleaned table.
health
Out[31]:
In [32]:
def firstQtile(x):
    """Return the first quartile (25th percentile) of the values in ``x``."""
    return np.percentile(x, 25)


def thirdQtile(x):
    """Return the third quartile (75th percentile) of the values in ``x``."""
    # Previously this computed the 25th percentile, duplicating firstQtile.
    return np.percentile(x, 75)


# Summary functions, in display order, handed to Table.stats in later cells.
summary_ops = (min, firstQtile, np.median, np.mean, thirdQtile, max, sum)
In [33]:
# Let's look at the effect of smoking: split the cleaned table on the 'smoker' column.
# Assumes 'smoker' is coded 1 = smokes, 0 = does not - TODO confirm against the
# survey codebook; rows holding any other value end up in neither table.
smokers = health.where('smoker',1)
nosmokers = health.where('smoker',0)
# Report the size of each group.
print(smokers.num_rows, ' smokers')
print(nosmokers.num_rows, ' non-smokers')
In [34]:
# Summarise the smokers table with the functions listed in summary_ops.
smokers.stats(summary_ops)
Out[34]:
In [35]:
# Same summary for the non-smokers, for side-by-side comparison.
nosmokers.stats(summary_ops)
Out[35]:
In [36]:
# Print the documentation of the table's hist method to check its plotting options.
# NOTE(review): exploratory aid only; no later cell uses its output.
help(smokers.hist)
In [37]:
# Histogram of weight among smokers, using 20 bins.
smokers.hist('weight', bins=20)
In [38]:
# Histogram of weight among non-smokers, using the same 20 bins for comparison.
nosmokers.hist('weight', bins=20)
In [39]:
# Difference in mean weight, non-smokers minus smokers; a positive value means
# the non-smokers are heavier on average.
np.mean(nosmokers['weight'])-np.mean(smokers['weight'])
Out[39]:
In [42]:
# Let's draw two samples of equal size, one from each group.
# NOTE(review): no random seed is set in the cells shown, so these samples (and
# everything derived from them) change from run to run.
n_sample = 200
smoker_sample = smokers.sample(n_sample)
nosmoker_sample = nosmokers.sample(n_sample)
# Put the two weight samples side by side so they can share one histogram.
weight = Table().with_columns([('NoSmoke', nosmoker_sample['weight']),('Smoke', smoker_sample['weight'])])
# Overlay both columns on the same axes with 30 bins.
weight.hist(overlay=True,bins=30,normed=True)
In [43]:
# Summarise the two sampled weight columns with the functions in summary_ops.
weight.stats(summary_ops)
Out[43]:
Is the difference observed between these samples representative of the larger population?
In [44]:
# Pool both weight samples into one column 'all': the non-smoker sample first,
# then the smoker sample. The group labels are deliberately dropped here.
combined = Table().with_column('all', np.append(nosmoker_sample['weight'],smoker_sample['weight']))
In [45]:
# Sanity check on the pooled table's size; presumably 2 * n_sample if each
# sample returned n_sample rows - verify against the displayed value.
combined.num_rows
Out[45]:
In [46]:
# Permutation test helper: split the pooled weights into two random groups and
# compare the group means.
def getdiff():
    """Return the difference in mean 'all' value between two splits of ``combined``.

    Reads the notebook-level ``combined`` table and ``n_sample``; the two
    groups come from ``combined.split(n_sample)``. The result is the mean of
    the first group minus the mean of the second.
    """
    group_a, group_b = combined.split(n_sample)
    mean_a = np.mean(group_a['all'])
    mean_b = np.mean(group_b['all'])
    return mean_a - mean_b
In [47]:
# Do the permutation many times and form the distribution of results
# num_samples is the number of independent calls to getdiff().
num_samples = 300
diff_samples = Table().with_column('diffs', [getdiff() for i in range(num_samples)])
# Bin edges run from -5 to 4.5 in steps of 0.5 (np.arange excludes the stop value).
# NOTE(review): differences outside those edges would presumably not be drawn.
diff_samples.hist(bins=np.arange(-5,5,0.5), normed=True)
The 4.5 kg difference is certainly not an artifact of the sample we started with: the smokers definitely weigh less. At the same time, the people in this study are not light. We had better go back and understand the purpose of the study that led to the selection of these six thousand individuals.
In [48]:
# A sense of the overall population represented - older.
# Histograms of the 'age' and 'education' columns, 20 bins each.
health.select(['age','education']).hist(bins=20)
In [49]:
# How does education correlate with age?
# Scatter the selected columns against 'age' and overlay a fitted line.
health.select(['age','education']).scatter('age', fit_line=True)
In [50]:
# Compare education across races; presumably draws one 'education' histogram
# per distinct 'race' value - confirm against the rendered figure.
health.pivot_hist('race','education',normed=True)
In [51]:
# How are races represented in the dataset and how does hispanic overlay the three?
race = health.select(['race', 'hispanic'])
# Constant column of ones so that summing it per group yields the group size.
race['count']=1
# Sum every other column within each race; the lines below rely on the
# resulting columns being labelled 'count sum' and 'hispanic sum'.
by_race = race.group('race',sum)
# Share of all rows that fall in each race.
by_race['race frac'] = by_race['count sum']/np.sum(by_race['count sum'])
# Share of each race flagged hispanic; assumes 'hispanic' is 1 for yes and 0
# otherwise - TODO confirm the coding of "no" and of missing values.
by_race['hisp frac'] = by_race['hispanic sum'] / by_race['count sum']
by_race
Out[51]:
In [52]:
# Scatter weight against height and overlay a fitted line.
health.select(['height','weight']).scatter('height','weight',fit_line=True)