In [2]:
    
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn
try:
    # scikit-learn >= 0.18 (the cross_validation module was later removed)
    from sklearn.model_selection import train_test_split
except ImportError:
    # fall back for older scikit-learn releases
    from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
# plotting options
seaborn.set()
plt.rcParams['figure.figsize'] = (15, 8)
%matplotlib inline
    
In [3]:
    
# Location of the input CSV (and of the CSVs written at the end).
# The directory can be overridden with the COUNTY_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original path is
# the fallback.  os.path.join(..., '') guarantees a trailing separator because
# other cells build file names with plain `datadir + name` concatenation.
datadir = os.path.join(
    os.environ.get('COUNTY_DATA_DIR',
                   '/home/kristy/SDCard/ExtraDocuments/CountyExercisePredictions/'),
    '')
datafile = '2015CHRAnalyticData.csv'
    
In [4]:
    
# Load the raw table.  The thousands separator is specified so that numbers
# encoded as strings (e.g. "1,234") will be processed correctly.
csv_path = datadir+datafile
full_data = pd.read_csv(csv_path, thousands=",")
    
In [5]:
    
# Preview the first rows of the raw table.
full_data.head()
    
    Out[5]:
In [6]:
    
# (number of rows, number of columns) of the raw table.
full_data.shape
    
    Out[6]:
In [5]:
    
# Index each county by its full FIPS code, built as
# STATECODE * 1000 + COUNTYCODE.
state_part = full_data['STATECODE'] * 1000
full_data.index = state_part + full_data['COUNTYCODE']
    
In [6]:
    
# Number of distinct index values (compare with the row count to check
# that the FIPS index is unique).
full_data.index.nunique()
    
    Out[6]:
In [7]:
    
# Keep only the columns whose label matches the 'Value' pattern.
value_column_pattern = '.*Value'
values_only = full_data.filter(regex=value_column_pattern)
    
In [8]:
    
# Remove the redundant ' Value' in all column names.
# The result is rebound instead of using inplace=True: values_only was
# derived from full_data via filter(), and mutating such a frame in place can
# trigger SettingWithCopyWarning; rename() without inplace returns a new frame.
values_only = values_only.rename(columns=lambda label: label.replace(' Value', ''))
    
In [11]:
    
# Preview the first rows of values_only.
values_only.head()
    
    Out[11]:
In [9]:
    
# Column labels of values_only; other cells select from them by position.
all_columns = values_only.columns
    
In [13]:
    
# List every column label with its position in all_columns.
# print() is called as a function with a single pre-formatted string so the
# cell produces the same "<index> <label>" lines on Python 2 and Python 3.
for idx, col in enumerate(all_columns):
    print('%d %s' % (idx, col))
    
    
In [10]:
    
# Group the columns by topic using their positions in all_columns.
# Index.union() is used instead of the `|` operator: `|` on a pandas Index is
# deprecated as a set operation and acts as an element-wise logical OR in
# pandas 2.x, which does not work on string labels.
# NOTE(review): slices exclude their end position, so positions 4, 7, 13, 20,
# 29, 34, 37, 42-46 and 51 (and anything from 54 on) belong to no group --
# confirm that these columns are meant to be left out.
inactivity = all_columns[8] # this is the outcome variable
health_outcomes = all_columns[0:4].union(all_columns[38:42])
health_behaviors = all_columns[5:7].union(all_columns[9:13])
clinical_care = all_columns[14:20].union(all_columns[47:51])
social_economic = all_columns[21:29].union(all_columns[52:54])
physical_environment = all_columns[30:34]
population = all_columns[35:37]
    
In [15]:
    
# Display the labels grouped under health_behaviors.
health_behaviors
    
    Out[15]:
In [16]:
    
# Histogram of the outcome column, titled with the column's own label.
inactivity_ax = values_only[inactivity].hist()
inactivity_ax.set_title(inactivity);
    
    
In [17]:
    
# Print the labels in this group, then draw one histogram per column.
print(health_outcomes)
health_outcomes_values = values_only[health_outcomes]
health_outcomes_values.hist(figsize=(30, 15));
    
    
    
In [11]:
    
# Begin the list of covariates to include; other cells extend it.
covariates = [
    u'Diabetes',
    u'HIV prevalence rate',
    u'Infant mortality',
    u'Poor or fair health',
    u'Premature age-adjusted mortality',
]
    
In [19]:
    
# Show the covariates selected so far.
print(covariates)
    
    
In [20]:
    
# Print the labels in this group, then draw one histogram per column.
print(health_behaviors)
health_behaviors_values = values_only[health_behaviors]
health_behaviors_values.hist(figsize=(30, 15));
    
    
    
In [12]:
    
# Update list of covariates to include.
# Only names not already present are appended, so re-running this cell cannot
# add duplicate labels (which would duplicate columns when the list is used
# to select from values_only).
behavior_covariates = [u'Access to exercise opportunities', u'Adult obesity', u'Adult smoking',
                       u'Excessive drinking', u'Sexually transmitted infections']
covariates = covariates + [name for name in behavior_covariates if name not in covariates]
covariates
    
    Out[12]:
In [22]:
    
# Print the labels in this group, then draw one histogram per column.
print(clinical_care)
clinical_care_values = values_only[clinical_care]
clinical_care_values.hist(figsize=(30, 15));
    
    
    
In [13]:
    
# Update list of covariates to include.
# Only names not already present are appended, so re-running this cell cannot
# add duplicate labels (which would duplicate columns when the list is used
# to select from values_only).
clinical_covariates = [u'Could not see doctor due to cost', u'Dentists', u'Diabetic screening',
                       u'Health care costs', u'Mental health providers', u'Preventable hospital stays',
                       u'Primary care physicians', u'Uninsured']
covariates = covariates + [name for name in clinical_covariates if name not in covariates]
print(covariates)
    
    
In [24]:
    
# Print the labels in this group, then draw one histogram per column.
print(social_economic)
social_economic_values = values_only[social_economic]
social_economic_values.hist(figsize=(30, 15));
    
    
    
In [14]:
    
# Update list of covariates to include.
# Only names not already present are appended, so re-running this cell cannot
# add duplicate labels (which would duplicate columns when the list is used
# to select from values_only).
social_covariates = [u'Children in poverty', u'Children in single-parent households', u'High school graduation',
                     u'Income inequality', u'Median household income', u'Social associations',
                     u'Some college', u'Unemployment', u'Violent crime']
covariates = covariates + [name for name in social_covariates if name not in covariates]
print(covariates)
    
    
In [26]:
    
# Print the labels in this group, then draw one histogram per column.
print(physical_environment)
physical_environment_values = values_only[physical_environment]
physical_environment_values.hist(figsize=(30, 15));
    
    
    
In [15]:
    
# Update list of covariates to include.
# Only names not already present are appended, so re-running this cell cannot
# add duplicate labels (which would duplicate columns when the list is used
# to select from values_only).
environment_covariates = [u'Air pollution - particulate matter', u'Drinking water violations',
                          u'Severe housing problems']
covariates = covariates + [name for name in environment_covariates if name not in covariates]
print(covariates)
    
    
In [28]:
    
# Print the labels in this group, then draw one histogram per column.
print(population)
population_values = values_only[population]
population_values.hist(figsize=(30, 10));
    
    
    
In [16]:
    
# Update list of covariates to include.
# Only names not already present are appended, so re-running this cell cannot
# add duplicate labels (which would duplicate columns when the list is used
# to select from values_only).
population_covariates = [u'2011 population estimate', u'Population that is not proficient in English']
covariates = covariates + [name for name in population_covariates if name not in covariates]
print(covariates)
    
    
In [17]:
    
# Separate the selected covariate columns from the outcome column and
# report the shape of each selection.
outcomes = values_only[inactivity]
all_factors = values_only[covariates]
for selection in (all_factors, outcomes):
    print(selection.shape)
    
    
In [18]:
    
# We want 60% training, 20% cross-validation, 20% testing:
# hold out 20% for testing, then split the remaining 80% into 75% / 25%
# (0.8 * 0.75 = 0.6 of the full data for training).
# A fixed random_state makes both shuffles -- and therefore the CSV files
# written from these splits -- reproducible across kernel restarts.
split_seed = 0
data_traincv, data_test, labels_traincv, labels_test = train_test_split(
    all_factors, outcomes, train_size=0.8, random_state=split_seed)
data_train, data_valid, labels_train, labels_valid = train_test_split(
    data_traincv, labels_traincv, train_size=0.75, random_state=split_seed)
    
In [19]:
    
# Count the missing / non-finite entries in each training column (the actual
# filling happens in a separate cell).
# The reduction is spelled as .sum(axis=0) so the per-column result does not
# depend on how np.sum() dispatches to DataFrame.sum(axis=None), whose
# behaviour is deprecated in pandas 2.x.
(~np.isfinite(data_train)).sum(axis=0)
    
    Out[19]:
In [21]:
    
# Fill missing values in every split with the per-column NaN-ignoring median
# of the training split, so nothing is learned from validation or test rows.
train_medians = data_train.apply(np.nanmedian)
data_nonan_train = data_train.fillna(train_medians)
data_nonan_test = data_test.fillna(train_medians)
data_nonan_valid = data_valid.fillna(train_medians)
    
In [22]:
    
# Check that filling in NaNs didn't do anything strange: the same column is
# drawn before (left panel) and after (right panel) imputation.
check_fig = plt.figure(figsize=[30,10])
before_ax = check_fig.add_subplot(1, 2, 1)
data_train['Poor or fair health'].hist(ax=before_ax)
after_ax = check_fig.add_subplot(1, 2, 2)
data_nonan_train['Poor or fair health'].hist(ax=after_ax);
    
    
In [23]:
    
# Write the imputed feature tables and their labels to CSV files in datadir.
csv_outputs = [
    (data_nonan_train, 'data_nonan_train.csv'),
    (data_nonan_test, 'data_nonan_test.csv'),
    (data_nonan_valid, 'data_nonan_valid.csv'),
    (labels_train, 'labels_train.csv'),
    (labels_test, 'labels_test.csv'),
    (labels_valid, 'labels_valid.csv'),
]
for table, filename in csv_outputs:
    table.to_csv(datadir+filename)