In [ ]:
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

In [ ]:
# Read the health-records survey extract into a raw table.
# Public data from the University of Michigan Institute for Social Research,
# Health and Retirement Study (HRS) - hrsonline.isr.umich.edu
# NOTE(review): the file name suggests the 2006 extract - confirm.
hrec06 = Table.read_table("./hrsextract06.csv")
hrec06

In [ ]:
# Codebook for the raw extract: one row per raw survey column that is kept.
#   raw label   - column name as it appears in the raw extract
#   label       - readable column name used in the mapped table
#   encoding    - None to copy values unchanged, otherwise a list of category
#                 names that map_raw_table indexes with (raw code - 1)
#   Description - free-text meaning of the column
health_map = Table(["raw label", "label", "encoding", "Description"]).with_rows(
       [["hhidpn",  "id", None, "identifier"],
        ["r8agey_m", "age", None, "age in years in wave 8"],
        # Opening parenthesis added so this matches the other coded-field descriptions.
        ["ragender", "gender", ['male','female'], "(1 = male,  2 = female)"],
        ["raracem",  "race",   ['white','black','other'], "(1 = white,  2 = black,  3 = other)"],
        ["rahispan", "hispanic",  None, "(1 = yes)"],
        ["raedyrs",  "education", None, "education in years"],
        ["h8cpl",    "couple",    None, "in a couple household (1 = yes)"],
        ["r8bpavgs", "blood pressure", None,"average systolic BP"],
        ["r8bpavgp", "pulse", None, "average pulse"],
        ["r8smoken", "smoker",None, "currently smokes cigarettes"],
        # Adjacent string literals are concatenated; the resulting text is the
        # same as the former backslash-continued literal.
        ["r8mdactx", "exercise", None,
         "frequency of moderate exercise "
         "(1=everyday, 2=>1perweek, 3=1perweek, 4=1-3permonth, 5=never)"],
        ["r8weightbio", "weight", None, "objective weight in kg"],
        ["r8heightbio","height", None, "objective height in m"]])
health_map

In [ ]:
def table_lookup(table, key_col, key, map_col):
    """Look up the value in `map_col` on the row where `key_col` equals `key`.

    Parameters:
        table   -- object indexable by column label, yielding numpy arrays
        key_col -- label of the column to search
        key     -- value to search for in `key_col`
        map_col -- label of the column to read the result from

    Returns the `map_col` entry of the matching row when exactly one row
    matches; returns -1 when there is no match or more than one match.
    """
    hits = np.where(table[key_col] == key)[0]
    # Anything other than a unique match is reported with the -1 sentinel.
    if len(hits) != 1:
        return -1
    return table[map_col][hits][0]

In [ ]:
def map_raw_table(raw_table, map_table):
    """Build a new Table from `raw_table` using the codebook in `map_table`.

    Each label yielded by iterating `raw_table` that also appears in the
    codebook's "raw label" column is carried over under the codebook's
    "label" name; labels not in the codebook are skipped. If the codebook's
    "encoding" entry is None the column is copied as-is; otherwise each raw
    value x is replaced by encoding[x-1].

    Returns the newly built Table.
    """
    result = Table()
    for source_label in raw_table:
        if source_label not in map_table["raw label"]:
            continue
        target_label = table_lookup(map_table, 'raw label', source_label, 'label')
        categories = table_lookup(map_table, 'raw label', source_label, 'encoding')
        if categories is not None:
            # Raw codes start at 1, so shift by one to index the category list.
            result[target_label] = raw_table.apply(lambda code: categories[code - 1], source_label)
        else:
            result[target_label] = raw_table[source_label]
    return result

In [ ]:
# Create a more usable table: keep the raw columns listed in health_map,
# rename them to readable labels, and decode the coded columns.
health = map_raw_table(hrec06,health_map)
health

In [ ]:
def firstQtile(x):
    """Return the first quartile (25th percentile) of x, via np.percentile."""
    return np.percentile(x, 25)


def thirdQtile(x):
    """Return the third quartile (75th percentile) of x, via np.percentile."""
    # This previously used 25, which made it a duplicate of firstQtile.
    return np.percentile(x, 75)


# Summary functions passed to Table.stats, ordered from smallest to largest
# statistic, followed by the column total.
summary_ops = (min, firstQtile, np.median, np.mean, thirdQtile, max, sum)

In [ ]:
# Look at the effect of smoking: split the table on the 'smoker' column
# (1 = rows kept in `smokers`, 0 = rows kept in `nosmokers`).
# NOTE(review): rows whose 'smoker' value is neither 0 nor 1 (e.g. missing)
# would land in neither table - confirm against the data.
smokers = health.where('smoker',1)
nosmokers = health.where('smoker',0)
print(smokers.num_rows, ' smokers')
print(nosmokers.num_rows, ' non-smokers')

In [ ]:
# Summarize the smokers table with the operations listed in summary_ops.
smokers.stats(summary_ops)

In [ ]:
# Same summary for the non-smokers, for side-by-side comparison.
nosmokers.stats(summary_ops)

In [ ]:
# Show the documentation for the table's hist method.
# NOTE(review): development aid - consider removing from the finished notebook.
help(smokers.hist)

In [ ]:
# Distribution of weight among smokers (20 bins).
smokers.hist('weight', bins=20)

In [ ]:
# Distribution of weight among non-smokers (20 bins), to compare with the smokers.
nosmokers.hist('weight', bins=20)

In [ ]:
# Difference in mean weight over the full tables: non-smokers minus smokers.
np.mean(nosmokers['weight'])-np.mean(smokers['weight'])

In [ ]:
# Let's draw two samples of equal size, one from each group, and overlay
# their weight histograms.
# NOTE(review): no random seed is set in the visible cells, so the samples
# (and everything derived from them) change on each run.
n_sample = 200
smoker_sample = smokers.sample(n_sample)
nosmoker_sample = nosmokers.sample(n_sample)
# One column of sampled weights per group.
weight = Table().with_columns([('NoSmoke', nosmoker_sample['weight']),('Smoke', smoker_sample['weight'])])
weight.hist(overlay=True,bins=30,normed=True)

In [ ]:
# Summary statistics of the two sampled weight columns.
weight.stats(summary_ops)

Is the difference observed between these samples representative of the larger population?


In [ ]:
# Pool the sampled weights into a single column 'all'
# (non-smoker sample first, then smoker sample) for the permutation test.
combined = Table().with_column('all', np.append(nosmoker_sample['weight'],smoker_sample['weight']))

In [ ]:
# Sanity check on the size of the pooled table
# (presumably 2 * n_sample - verify against the output).
combined.num_rows

In [ ]:
# Permutation test: split the pooled weights into two random groups and
# compare the groups' means.
def getdiff():
    """Return the mean difference between two random groups of `combined`.

    Uses the notebook-level names `combined` and `n_sample`: the pooled table
    is split into a first group of n_sample rows and the remaining rows, and
    the result is mean('all') of the first group minus that of the second.
    """
    first_group, second_group = combined.split(n_sample)
    mean_first = np.mean(first_group['all'])
    mean_second = np.mean(second_group['all'])
    return (mean_first - mean_second)

In [ ]:
# Repeat the random split many times and plot the distribution of the
# resulting mean differences.
num_samples = 300
diff_samples = Table().with_column('diffs', [getdiff() for _ in range(num_samples)])
# np.arange excludes its stop value, so the stop is 5.5 to get bin edges from
# -5 to +5 inclusive. With a stop of 5 the last edge was 4.5, which made the
# bins asymmetric about 0 and dropped differences in (4.5, 5].
diff_samples.hist(bins=np.arange(-5, 5.5, 0.5), normed=True)

In [ ]:


In [ ]:
# Get a sense of the overall population represented (it skews older):
# histograms of the 'age' and 'education' columns, 20 bins each.
health.select(['age','education']).hist(bins=20)

In [ ]:
# How does education correlate with age?
# Scatter plot with 'age' on the horizontal axis and a fitted line.
health.select(['age','education']).scatter('age', fit_line=True)

In [ ]:
# Compare the education distribution across the values of 'race'
# (presumably one normalized histogram per race - confirm against the plot).
health.pivot_hist('race','education',normed=True)

In [ ]:
# How are races represented in the dataset, and how does 'hispanic' overlay the three?
race = health.select(['race', 'hispanic'])   
# Constant column of ones so that summing it per group gives the group size.
race['count']=1
# Per-race sums of the other columns; the code below reads them as
# 'count sum' and 'hispanic sum'.
by_race = race.group('race',sum)
# Share of all rows belonging to each race.
by_race['race frac'] = by_race['count sum']/np.sum(by_race['count sum'])
# Within each race, the sum of 'hispanic' divided by the group size
# (a fraction if 'hispanic' is coded 0/1 - TODO confirm the coding).
by_race['hisp frac'] = by_race['hispanic sum'] / by_race['count sum']
by_race

In [ ]:
# Relationship between height and weight: scatter plot with a fitted line.
health.select(['height','weight']).scatter('height','weight',fit_line=True)

In [ ]: