notebook.community

Edit and run



In [1]:

    
# HIDDEN
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')



In [2]:

    
# Read in the data from health records survey as a raw table
# Public data from the UC Michigan Institute for Social Research
# Health and Retirement Survey - rsonline.isr.umich.edu
#
hrec06 = Table.read_table("./hrsextract06.csv")



In [3]:

    
# the raw table - pretty yechy
hrec06









    Out[3]:





    
        
            hhidpn  r8agey_m  ragender  raracem  rahispan  raedyrs  h8cpl  h8atota  rayshlt  r8shlt  r8expyrs  r8cesd  r8bpavgs  r8bpavgp  r8smoken  r8mdactx  r8weightbio  r8weight  r8heightbio  r8height
        
    
    
        
            3010     70       1        1       0        12      1     914000     3       3      21.5     0      140      58       0        4        74.1623     71.6672  1.64465     1.6256  
        
    
        
            3020     67       2        1       0        16      1     914000     1       3      23.7     0      139      72       0        1        66.9048     65.317   1.61925     1.6256  
        
    
        
            10003030  50       2        1       0        16      0     12000      3       5      47.5     4      124      98       0        2        64.4101     58.9667  1.5494      1.5748  
        
    
        
            10004010  66       1        1       0        16      1     1.832e+06  1       4      7        1      130      74       0        2        101.605     102.511  1.8415      1.8542  
        
    
        
            10004040  60       2        1       0        12      1     1.832e+06  1       2      28.25    0      125      67       0        2        77.3374     77.1103  1.6383      1.651   
        
    
        
            10013010  68       1        1       0        12      0     50         1       3      16.5     2      125      67       0        5        110.223     108.862  1.7272      1.7272  
        
    
        
            10038010  70       1        1       0        16      1     2.5e+06    2       2      22.5     1      107      52       0        2        73.4819     74.8423  1.70815     1.7526  
        
    
        
            10038040  63       2        1       0        16      1     2.5e+06    1       1      27.05    1      112      70       0        2        65.3173     64.4098  1.67005     1.6764  
        
    
        
            10050010  64       2        3       0        17      0     664066     1       2      27.025   1      136      90       0        5        75.9767     68.0385  1.5875      1.6256  
        
    
        
            10059020  70       2        1       0        16      1     1.2405e+07  2       3      19.5     0      112      54       1        2        58.967      57.6059  1.6637      1.6764  
        
    

... (6332 rows omitted)



In [ ]:

    
# Create a table that provides the mapping and decoding to a more readable form
health_map = Table.from_rows(
    [["hhidpn",  "id", None, "identifier"],
    ["r8agey_m", "age", None, "age in years in wave 8"],
    ["ragender", "gender", ['male','female'], "1 = male,  2 = female)"],
    ["raracem",  "race",   ['white','black','other'], "(1 = white,  2 = black,  3 = other)"],
    ["rahispan", "hispanic",  None, "(1 = yes)"],
    ["raedyrs",  "education", None, "education in years"],
    ["h8cpl",    "couple",    None, "in a couple household (1 = yes)"],
    ["r8bpavgs", "blood pressure", None,"average systolic BP"],
    ["r8bpavgp", "pulse", None, "average pulse"],
    ["r8smoken", "smoker",None, "currently smokes cigarettes"],
    ["r8mdactx", "exercise", None, "frequency of moderate exercise (1=everyday, 2=>1perweek, 3=1perweek, 4=1-3permonth\
, 5=never)"],
    ["r8weightbio", "weight", None, "objective weight in kg"],
    ["r8heightbio","height", None, "objective height in m"],
    ], 
    ["raw label", "label", "encoding", "Description"])
health_map



In [ ]:

    
def table_lookup(table,key_col,key,map_col):
    row = np.where(table[key_col]==key)
    if len(row[0]) == 1:
        return table[map_col][row[0]][0]
    else:
        return -1



In [ ]:

    
def map_raw_table(raw_table,map_table):
    mapped = Table()
    for raw_label in raw_table :
        if raw_label in map_table["raw label"] :
            new_label = table_lookup(map_table,'raw label',raw_label,'label')
            encoding = table_lookup(map_table,'raw label',raw_label,'encoding')
            if encoding is None :
                mapped[new_label] = raw_table[raw_label]
            else:
                mapped[new_label] = raw_table.apply(lambda x: encoding[x-1], raw_label)
    return mapped



In [ ]:

    
# create a more usable table by mapping the raw to finished
health = map_raw_table(hrec06,health_map)
health



In [ ]:

    
# Let's try what is the effect of smoking
smokers = health.where('smoker',1)
nosmokers = health.where('smoker',0)
print(smokers.num_rows, ' smokers')
print(nosmokers.num_rows, ' non-smokers')



In [ ]:

    
def firstQtile(x) : return np.percentile(x,25)
def thirdQtile(x) : return np.percentile(x,25)
summary_ops = (min, firstQtile, np.mean, thirdQtile, max)



In [ ]:

    
smokers.stats(summary_ops)



In [ ]:

    
nosmokers.stats(summary_ops)

It would appear that nosmokers are older, more educated, more 'coupled' and heavier - but of similar height and blood pressure



In [ ]:

    
# Lets draw two samples of equal size
n_sample = 200
smoker_sample = smokers.sample(n_sample)
nosmoker_sample = nosmokers.sample(n_sample)



In [ ]:

    
weight = Table([nosmoker_sample['weight'],smoker_sample['weight']],['NoSmoke','Smoke'])
weight.hist(overlay=True,bins=30,normed=True)



In [ ]:

    
bins=np.arange(39,139,5)
weight_dist = weight.bin(bins=bins, normed=True)
weight_dist['diff']=weight_dist['NoSmoke density']-weight_dist['Smoke density']
print('TVD: ',sum(np.abs(weight_dist['diff'])))
weight_dist.show()



In [ ]:

    
weight_dist.select(['bin','diff']).bar('bin')



In [ ]:

    
weight.stats(summary_ops)



In [ ]:

    
np.mean(weight['NoSmoke'])-np.mean(weight['Smoke'])

Is the difference observed between these samples representative of the larger population?



In [ ]:

    
combined = Table([np.append(nosmoker_sample['weight'],smoker_sample['weight'])],['all'])



In [ ]:

    
# permutation test, split the combined into two random groups, do the comparison of those
def getdiff():
    A,B = combined.split(300)
    return (np.mean(A['all'])-np.mean(B['all']))



In [ ]:

    
# Do the permutation many times and form the distribution of results
num_samples = 100
diff_samples = Table([[getdiff() for i in range(num_samples)]],['diffs'])



In [ ]:

    
diff_samples.hist(bins=20, normed=True)



In [ ]:

hhidpn	r8agey_m	ragender	raracem	raedyrs	h8cpl	h8atota	rayshlt	r8shlt	r8expyrs	r8cesd	r8bpavgs	r8bpavgp	r8smoken	r8mdactx	r8weightbio	r8weight	r8heightbio	r8height
3010	70	1	1	12	1	914000	3	3	21.5	0	140	58	0	4	74.1623	71.6672	1.64465	1.6256
3020	67	2	1	16	1	914000	1	3	23.7	0	139	72	0	1	66.9048	65.317	1.61925	1.6256
10003030	50	2	1	16	0	12000	3	5	47.5	4	124	98	0	2	64.4101	58.9667	1.5494	1.5748
10004010	66	1	1	16	1	1.832e+06	1	4	7	1	130	74	0	2	101.605	102.511	1.8415	1.8542
10004040	60	2	1	12	1	1.832e+06	1	2	28.25	0	125	67	0	2	77.3374	77.1103	1.6383	1.651
10013010	68	1	1	12	0	50	1	3	16.5	2	125	67	0	5	110.223	108.862	1.7272	1.7272
10038010	70	1	1	16	1	2.5e+06	2	2	22.5	1	107	52	0	2	73.4819	74.8423	1.70815	1.7526
10038040	63	2	1	16	1	2.5e+06	1	1	27.05	1	112	70	0	2	65.3173	64.4098	1.67005	1.6764
10050010	64	2	3	17	0	664066	1	2	27.025	1	136	90	0	5	75.9767	68.0385	1.5875	1.6256
10059020	70	2	1	16	1	1.2405e+07	2	3	19.5	0	112	54	1	2	58.967	57.6059	1.6637	1.6764