In [2]:
import os

import numpy as np, pandas as pd
from matplotlib import pyplot as plt
import seaborn
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
# Plotting options: apply seaborn's default styling, make 15x8 the default
# figure size, and render matplotlib figures inline in the notebook.
seaborn.set()
plt.rcParams['figure.figsize'] = (15, 8)
%matplotlib inline
In [3]:
# Location of the input CSV. The directory can be overridden through the
# CHR_DATA_DIR environment variable so the notebook is not tied to a single
# machine; the original absolute path remains the default.
# os.path.join(..., '') guarantees a trailing separator, because other cells
# build file paths by plain string concatenation (datadir + filename).
datadir = os.path.join(
    os.environ.get('CHR_DATA_DIR',
                   '/home/kristy/SDCard/ExtraDocuments/CountyExercisePredictions/'),
    '')
datafile = '2015CHRAnalyticData.csv'
In [4]:
# Some numbers in the CSV are written with a comma as thousands separator;
# declaring it here lets pandas parse them as numbers instead of strings.
csv_path = datadir + datafile
full_data = pd.read_csv(csv_path, thousands=',')
In [5]:
# Preview the first rows of the freshly loaded table.
full_data.head()
Out[5]:
In [6]:
# (number of rows, number of columns) of the loaded table.
full_data.shape
Out[6]:
In [5]:
# Index each row by its full FIPS code: the state code shifted left by three
# decimal digits, plus the county code.
# NOTE(review): assumes COUNTYCODE is always below 1000 - confirm.
full_data.index = full_data['STATECODE'] * 1000 + full_data['COUNTYCODE']
In [6]:
# Number of distinct index values; comparing it with the row count shows
# whether the FIPS index is unique.
full_data.index.nunique()
Out[6]:
In [7]:
# Keep only the columns whose label matches the regex '.*Value'.
values_only = full_data.filter(regex='.*Value')
In [8]:
# Remove the redundant ' Value' from every column name. The renamed frame is
# rebound to the same name instead of using inplace=True, which avoids mutating
# a frame derived from full_data and keeps the cell safe to re-run.
values_only = values_only.rename(columns=lambda name: name.replace(' Value', ''))
In [11]:
# Preview the value-only table after the column rename.
values_only.head()
Out[11]:
In [9]:
# Column labels of the value-only table; other cells select from them by position.
all_columns = values_only.columns
In [13]:
# Print every column label next to its position so that column groups can be
# picked by slice. The loop body is indented and print is written as a
# single-argument call, which gives the same "idx col" output on Python 2 and 3.
for idx, col in enumerate(all_columns):
    print('%d %s' % (idx, col))
In [10]:
# Column groups, chosen by position in all_columns.
# Index.union() replaces the `|` operator: using `|` on an Index as a set
# union is deprecated in pandas, whereas union() is explicit and stable.
# NOTE(review): the slices are end-exclusive, so positions such as 4, 7, 13,
# 20, 29, 34 and 37 belong to no group - confirm these columns are skipped on
# purpose and not through an off-by-one.
inactivity = all_columns[8]  # this is the outcome variable
health_outcomes = all_columns[0:4].union(all_columns[38:42])
health_behaviors = all_columns[5:7].union(all_columns[9:13])
clinical_care = all_columns[14:20].union(all_columns[47:51])
social_economic = all_columns[21:29].union(all_columns[52:54])
physical_environment = all_columns[30:34]
population = all_columns[35:37]
In [15]:
# Display the labels that ended up in the health_behaviors group.
health_behaviors
Out[15]:
In [16]:
# Histogram of the outcome variable, titled with its column name.
ax = values_only[inactivity].hist();
ax.set_title(inactivity);
In [17]:
# List the columns in the health_outcomes group, then draw one histogram per column.
print(health_outcomes)
values_only.loc[:, health_outcomes].hist(figsize=(30, 15));
In [11]:
# Initial list of covariates to include; other cells append further names.
covariates = [
    u'Diabetes',
    u'HIV prevalence rate',
    u'Infant mortality',
    u'Poor or fair health',
    u'Premature age-adjusted mortality',
]
In [19]:
# Show the covariates selected so far.
print(covariates)
In [20]:
# List the columns in the health_behaviors group, then draw one histogram per column.
print(health_behaviors)
values_only.loc[:, health_behaviors].hist(figsize=(30, 15));
In [12]:
# Update list of covariates to include. Only names that are not already in the
# list are appended, so re-running this cell does not create duplicate entries
# (duplicates would select the same column twice from values_only).
new_covariates = [u'Access to exercise opportunities', u'Adult obesity', u'Adult smoking',
                  u'Excessive drinking', u'Sexually transmitted infections']
covariates = covariates + [name for name in new_covariates if name not in covariates]
covariates
Out[12]:
In [22]:
# List the columns in the clinical_care group, then draw one histogram per column.
print(clinical_care)
values_only.loc[:, clinical_care].hist(figsize=(30, 15));
In [13]:
# Update list of covariates to include. Only names that are not already in the
# list are appended, so re-running this cell does not create duplicate entries.
new_covariates = [u'Could not see doctor due to cost', u'Dentists', u'Diabetic screening',
                  u'Health care costs', u'Mental health providers', u'Preventable hospital stays',
                  u'Primary care physicians', u'Uninsured']
covariates = covariates + [name for name in new_covariates if name not in covariates]
print(covariates)
In [24]:
# List the columns in the social_economic group, then draw one histogram per column.
print(social_economic)
values_only.loc[:, social_economic].hist(figsize=(30, 15));
In [14]:
# Update list of covariates to include. Only names that are not already in the
# list are appended, so re-running this cell does not create duplicate entries.
new_covariates = [u'Children in poverty', u'Children in single-parent households',
                  u'High school graduation', u'Income inequality', u'Median household income',
                  u'Social associations', u'Some college', u'Unemployment', u'Violent crime']
covariates = covariates + [name for name in new_covariates if name not in covariates]
print(covariates)
In [26]:
# List the columns in the physical_environment group, then draw one histogram per column.
print(physical_environment)
values_only.loc[:, physical_environment].hist(figsize=(30, 15));
In [15]:
# Update list of covariates to include. Only names that are not already in the
# list are appended, so re-running this cell does not create duplicate entries.
new_covariates = [u'Air pollution - particulate matter', u'Drinking water violations',
                  u'Severe housing problems']
covariates = covariates + [name for name in new_covariates if name not in covariates]
print(covariates)
In [28]:
# List the columns in the population group, then draw one histogram per column.
print(population)
values_only.loc[:, population].hist(figsize=(30, 10));
In [16]:
# Update list of covariates to include. Only names that are not already in the
# list are appended, so re-running this cell does not create duplicate entries.
new_covariates = [u'2011 population estimate', u'Population that is not proficient in English']
covariates = covariates + [name for name in new_covariates if name not in covariates]
print(covariates)
In [17]:
# Split the table into the outcome column and the selected covariate columns,
# then report the shape of each.
outcomes = values_only[inactivity]
all_factors = values_only.loc[:, covariates]
print(all_factors.shape)
print(outcomes.shape)
In [18]:
# We want 60% training, 20% cross-validation, 20% testing.
# A fixed random_state makes both shuffles reproducible, so every run (and
# every CSV written from these splits) contains the same rows.
split_seed = 0
# First carve off the 20% test set ...
data_traincv, data_test, labels_traincv, labels_test = train_test_split(
    all_factors, outcomes, train_size=0.8, random_state=split_seed)
# ... then take 75% of the remaining 80% for training (0.75 * 0.8 = 0.6),
# leaving 20% of the total for validation.
data_train, data_valid, labels_train, labels_valid = train_test_split(
    data_traincv, labels_traincv, train_size=0.75, random_state=split_seed)
In [19]:
# Count the non-finite entries (NaN or +/-inf) in each training column before
# filling in missing values. Summing the boolean mask with an explicit axis=0
# keeps the per-column result independent of how np.sum dispatches to a
# DataFrame (np.sum passes axis=None, whose meaning pandas is changing).
(~np.isfinite(data_train)).sum(axis=0)
Out[19]:
In [21]:
# Fill missing values with per-column medians taken from the training split
# only; the same training medians are reused for the test and validation
# splits. They are computed once here instead of once per split.
train_medians = data_train.apply(np.nanmedian)
data_nonan_train = data_train.fillna(train_medians)
data_nonan_test = data_test.fillna(train_medians)
data_nonan_valid = data_valid.fillna(train_medians)
In [22]:
# Check that filling in NaNs didn't do anything strange: histogram of one
# column before (left panel) and after (right panel) the fill.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=[30, 10])
data_train['Poor or fair health'].hist(ax=ax_before);
data_nonan_train['Poor or fair health'].hist(ax=ax_after);
In [23]:
# Write each imputed feature split and each label series to its own CSV in datadir.
outputs = [
    (data_nonan_train, 'data_nonan_train.csv'),
    (data_nonan_test, 'data_nonan_test.csv'),
    (data_nonan_valid, 'data_nonan_valid.csv'),
    (labels_train, 'labels_train.csv'),
    (labels_test, 'labels_test.csv'),
    (labels_valid, 'labels_valid.csv'),
]
for frame, filename in outputs:
    frame.to_csv(datadir + filename)