Analysis of 2014 data from the CDC on the prevalence of STDs in U.S. counties.
In [1]:
# Core analysis stack: pandas/numpy for wrangling, seaborn/matplotlib for plots.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline
In [2]:
# Always display all the columns
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 200)
# Plain Seaborn figures with matplotlib color codes mapped to the default seaborn palette
sns.set(style="white", color_codes=True)
In [3]:
# County-level gonorrhea data from the CDC.
df = pd.read_csv("../data/cdc/gonorrhea.csv")
In [4]:
df.shape
Out[4]:
In [5]:
df.columns
Out[5]:
In [6]:
# NOTE(review): Population/Cases contain ',' thousands separators (see In [9]),
# so they load as object dtype rather than numeric.
df.dtypes
Out[6]:
In [7]:
df_test = df.convert_objects(convert_numeric=True)
df_test.dtypes
Out[7]:
In [8]:
df_test.head()
Out[8]:
In [9]:
df['Population'] = df['Population'].str.replace(',','')
df['Cases'] = df['Cases'].str.replace(',','')
In [10]:
df = df.convert_objects(convert_numeric=True)
df.dtypes
Out[10]:
In [11]:
# Eyeball the first 77 rows after the numeric conversion.
df.head(77)
Out[11]:
In [12]:
df['Population'].describe()
Out[12]:
In [13]:
# Index label of the most populous county.
df['Population'].idxmax()
Out[13]:
In [14]:
# NOTE(review): 207 is presumably the idxmax result above — confirm before re-running,
# since a changed input file would shift this label.
df.loc[207]
Out[14]:
In [15]:
fig = plt.figure(figsize=(10, 6))
data = np.log10(df['Population'])
ax = data.plot.hist(50)
ax.set_xlabel("log(Population)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_population_gonorrhea.png', bbox_inches='tight', dpi=150)
In [16]:
fig = plt.figure(figsize=(10, 6))
data = np.log10(df['Cases']+1)
ax = data.plot.hist(50)
ax.set_xlabel("log(Cases)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_cases_gonorrhea.png', bbox_inches='tight', dpi=150)
In [17]:
fig = plt.figure(figsize=(10, 6))
ax = df['Rate'].plot.hist(100)
ax.set_xlabel("Rate (per 100,000 inhabitants)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_rate_gonorrhea.png', bbox_inches='tight', dpi=150)
In [18]:
outliers = df[df['Rate']<10]
In [19]:
outliers
Out[19]:
In [20]:
not_exclude_list = df["Cases"]<0
exclude_list = [not i for i in not_exclude_list]
df_sig = df[exclude_list].copy()
In [21]:
df = df_sig.copy()
In [22]:
df["Rate"].sort_values()
Out[22]:
In [23]:
# Number of distinct areas (states/territories) covered.
len(df['Area'].unique())
Out[23]:
In [24]:
df.shape
Out[24]:
In [25]:
df.sort_values(by='Area')
Out[25]:
In [26]:
# Counties reported per area.
df['Area'].value_counts()
Out[26]:
In [27]:
# Any missing values anywhere in the frame?
df.isnull().values.any()
Out[27]:
In [28]:
null_list = df["Population"].isnull()
not_null_list = [not i for i in null_list]
df_clean = df[not_null_list].copy()
In [29]:
null_list = df_clean["Rate"].isnull()
not_null_list = [not i for i in null_list]
df_completely_clean = df_clean[not_null_list].copy()
In [30]:
df_completely_clean["Rate"].isnull().values.any()
Out[30]:
In [31]:
df_merged = pd.read_csv("../data/chlamydia_cdc_census.csv")
In [32]:
df_completely_clean[df_completely_clean["FIPS"]==1001].Cases[0]
Out[32]:
Replace the number of Chlamydia cases with the number of Gonorrhea cases
In [33]:
#print(df_merged[df_merged["FIPS"]==1001].Cases[0])
#df_merged.set_value(1, "FIPS", 10)
for county in df_merged["FIPS"]:
rowlist = df_merged[df_merged['FIPS'] == county].index.tolist()
gonorrhea_cases = df_completely_clean[df_completely_clean["FIPS"] == county].Cases.tolist()
df_merged.set_value(rowlist[0], 'Cases', gonorrhea_cases[0])
# print(df_merged["FIPS"][rowlist[0]])
#df_merged[df_merged["FIPS"]==county]
In [34]:
df_merged.head()
Out[34]:
In [35]:
df_merged.describe()
Out[35]:
In [36]:
df_merged.shape
Out[36]:
In [37]:
# Case rate vs. (log) population.
plt.scatter(np.log10(df_merged["Population"]), df_merged["Cases"]/df_merged["Population"])
Out[37]:
In [38]:
# Case rate by county FIPS code.
plt.scatter(df_merged["FIPS"], df_merged["Cases"]/df_merged["Population"])
Out[38]:
In [39]:
# Pairwise Pearson correlations of all features, including the derived rate.
# NOTE(review): modern pandas requires corr(numeric_only=True) if any
# non-numeric columns are present — verify against the merged CSV schema.
df_all = df_merged.copy()
df_all["Rate"] = df_all["Cases"]/df_all["Population"]
corr = df_all.corr()
In [40]:
# Correlation of every feature with the rate.
pearsonr = corr["Rate"]
pearsonr
Out[40]:
In [41]:
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
In [42]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
square=True, xticklabels=2, yticklabels=2,
linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.savefig('../graphics/cross-correlation_gonorrhea.png', bbox_inches='tight', dpi=150)
In [43]:
from sklearn.decomposition import PCA
In [44]:
df_merged.describe()
Out[44]:
In [45]:
df_new = df_merged.drop(["FIPS","Cases", "d002", "hd02s051", "hd02s143", "hd02s159", "hd02s184", "hd01s001"], axis=1)
X = df_new.values
columns = X.shape[1]
for column in np.arange(columns):
mean_temp = X[:,column].mean()
std_temp = X[:,column].std()
X[:,column] = (X[:,column]-mean_temp)/std_temp
plt.plot((np.std(X, axis=0)))
Out[45]:
In [46]:
X.shape
Out[46]:
In [47]:
pca = PCA(n_components=39)
pca.fit(X)
pca.explained_variance_ratio_
Out[47]:
In [48]:
# Loadings on the first two principal axes.
plt.scatter(pca.components_[:,0], pca.components_[:,1])
Out[48]:
In [49]:
# Rate vs. the first principal-component score.
X_trans = pca.transform(X)
Y = df_merged["Cases"]/df_merged["Population"]
plt.scatter(X_trans[:,0],Y)
Out[49]:
In [50]:
from sklearn import linear_model
In [51]:
# Distribution of one census feature (0-14 age share per labels_dict below).
df_new["hd02s002"].hist()
Out[51]:
In [52]:
df_new.head()
Out[52]:
In [53]:
df_merged.head()
Out[53]:
Split the data set into training and test sets (no separate validation set is held out).
In [54]:
cutoff = 1
X = df_new[df_merged["Cases"]>cutoff].values
Y = df_merged[df_merged["Cases"]>cutoff].Cases/(df_merged[df_merged["Cases"]>cutoff].Population+1.0)
X_full = df_new.values
Y_full = df_merged.Cases/(df_merged.Population+1.0)
#normalize all columns to the same normalization
columns = X.shape[1]
means = np.zeros(columns)
stds = np.zeros(columns)
for column in np.arange(columns):
mean_temp = X[:,column].mean()
std_temp = X[:,column].std()
means[column] = mean_temp
stds[column] = std_temp
X[:,column] = (X[:,column]-mean_temp)/std_temp
X_full[:,column] = (X_full[:,column]-mean_temp)/std_temp
Ymean = Y_full.mean()
Ystd = Y_full.std()
Y = (Y-Ymean)/Ystd
Y_full = (Y_full-Ymean)/Ystd
ones = np.ones(round(0.75*len(X)), dtype=bool)
zeros = np.zeros(len(X)-round(0.75*len(X)), dtype=bool)
training_list = np.hstack((ones, zeros))
np.random.shuffle(training_list)
test_list = np.zeros(len(X),dtype=bool)
test_list = np.array([not i for i in training_list])
X_train = X[training_list]
X_test = X[test_list]
Y_train = Y[training_list]
Y_test = Y[test_list]
X.shape, X_train.shape, X_test.shape, Y.shape, Y_train.shape, Y_test.shape
Out[54]:
In [55]:
X.shape, Y.shape
Out[55]:
In [56]:
Y_test.describe()
Out[56]:
In [57]:
#X_weights = df_merged.values
#X_train_weights = X_weights[training_list]
# Uniform weighting; the commented-out lines show how per-county weights were tried.
# NOTE(review): sklearn expects sample_weight to be an array; a scalar 1 may be
# rejected by newer versions — confirm, or pass None.
weights = 1 #X_train_weights[:,2]
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train, sample_weight=weights)
Out[57]:
In [58]:
print(regr.coef_)
In [59]:
# R^2 on the held-out test set, computed by hand.
1.0-np.sum((regr.predict(X_test)-Y_test)**2)/np.sum((Y_test-np.mean(Y_test))**2)
Out[59]:
In [60]:
# Predicted vs. actual, with the identity line for reference.
plt.scatter(Y,regr.predict(X))
plt.plot(np.linspace(Y.min(),Y.max(),num=10),np.linspace(Y.min(),Y.max(),num=10),color='red')
Out[60]:
In [61]:
print('Variance score: %.5f\t(%.5f)' % (regr.score(X_test, Y_test), regr.score(X_full, Y_full)))
In [61]:
print('Variance score: %.5f\t(%.5f)' % (regr.score(X_test, Y_test), regr.score(X_full, Y_full)))
In [62]:
from sklearn import cross_validation
cv = cross_validation.ShuffleSplit(len(Y_train), n_iter=5, test_size=0.2, random_state=0)
scores_regression = cross_validation.cross_val_score(regr, X_train, Y_train, cv=cv)
scores_regression
#cross_val_score(regr, X_train, Y_train, cv=6, n_jobs=1)
#scores
Out[62]:
In [63]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_regression.mean(), scores_regression.std() * 2))
In [64]:
from sklearn.metrics import explained_variance_score
In [65]:
# Training-set diagnostics for the linear model.
explained_variance_score(Y_train, regr.predict(X_train))
Out[65]:
In [66]:
from sklearn.metrics import mean_absolute_error
In [67]:
mean_absolute_error(Y_train, regr.predict(X_train))
Out[67]:
In [68]:
from sklearn.metrics import mean_squared_error
In [69]:
mean_squared_error(Y_train, regr.predict(X_train))
Out[69]:
In [70]:
from sklearn.metrics import median_absolute_error
In [71]:
median_absolute_error(Y_train, regr.predict(X_train))
Out[71]:
In [72]:
from sklearn.metrics import r2_score
In [73]:
r2_score(Y_train, regr.predict(X_train))
Out[73]:
In [74]:
from sklearn.preprocessing import PolynomialFeatures
In [75]:
# Degree-2 polynomial expansion of the feature set.
poly = PolynomialFeatures(degree=2)
In [76]:
X_train_poly = poly.fit_transform(X_train)
In [77]:
# fit_intercept=False because PolynomialFeatures already adds a bias column.
poly_regr = linear_model.LinearRegression(fit_intercept=False)
poly_regr.fit(X_train_poly, Y_train)
Out[77]:
In [78]:
# Predicted vs. actual for the polynomial model, with the identity line.
plt.scatter(Y,poly_regr.predict(poly.fit_transform(X)))
plt.plot(np.linspace(Y_train.min(),Y_train.max(),num=10),np.linspace(Y_train.min(),Y_train.max(),num=10),color='red')
Out[78]:
In [79]:
print('Variance score: %.5f\t(%.5f)' % (poly_regr.score(poly.fit_transform(X_test), Y_test), poly_regr.score(poly.fit_transform(X), Y)))
In [80]:
from sklearn import linear_model
In [81]:
# Ridge regression: L2-regularized linear model (alpha controls shrinkage).
rregr = linear_model.Ridge(alpha=0.5)
rregr.fit(X_train, Y_train)
Out[81]:
In [82]:
print('Variance score: %.5f\t(%.5f)' % (rregr.score(X_test, Y_test), rregr.score(X_full, Y_full)))
In [83]:
scores_rregression = cross_validation.cross_val_score(rregr, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_rregression.mean(), scores_rregression.std() * 2))
In [84]:
from sklearn.ensemble import ExtraTreesRegressor
In [85]:
# Extremely-randomized trees; bootstrap=True makes the out-of-bag score available.
clf = ExtraTreesRegressor(n_estimators=250, bootstrap=True, oob_score=True, max_features='sqrt')
clf.fit(X_train, Y_train)
Out[85]:
In [86]:
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))
In [87]:
scores_etregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_etregression.mean(), scores_etregression.std() * 2))
In [88]:
from sklearn.ensemble import AdaBoostRegressor
In [89]:
# AdaBoost with a small learning rate and many weak learners.
clf = AdaBoostRegressor(n_estimators=500, learning_rate=0.01, loss='linear')
clf.fit(X_train, Y_train)
Out[89]:
In [90]:
print('Variance score: %.5f\t(%.5f)' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full)))
In [91]:
scores_adaregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_adaregression.mean(), scores_adaregression.std() * 2))
In [92]:
from sklearn.ensemble import BaggingRegressor
In [93]:
# Bagged trees with per-estimator feature subsampling (max_features=20 columns).
clf = BaggingRegressor(n_estimators=250, bootstrap=True, oob_score=True, max_features=20)
clf.fit(X_train, Y_train)
Out[93]:
In [94]:
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))
In [95]:
scores_bagregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_bagregression.mean(), scores_bagregression.std() * 2))
In [96]:
from sklearn.ensemble import GradientBoostingRegressor
In [97]:
clf = GradientBoostingRegressor(n_estimators=250, max_features=10,max_depth=5)
clf.fit(X_train, Y_train)
Out[97]:
In [98]:
print('Variance score: %.5f\t(%.5f)' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full)))
In [99]:
# Predicted vs. actual on the full data set, with the identity line.
plt.scatter(Y_full,clf.predict(X_full))
plt.plot(np.linspace(Y_full.min(),Y_full.max(),num=10),np.linspace(Y_full.min(),Y_full.max(),num=10),color='red')
Out[99]:
In [100]:
scores_gradboostregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_gradboostregression.mean(), scores_gradboostregression.std() * 2))
In [101]:
clf.fit(X_train, Y_train)
Out[101]:
In [102]:
import pickle
In [103]:
with open("../data/gradientboosting_params_gonorrhea.pickle", "wb") as myfile:
pickle.dump(clf, myfile)
In [104]:
from sklearn.ensemble import RandomForestRegressor
In [105]:
# Random forest with OOB scoring; n_jobs=4 parallelizes the tree fitting.
clf = RandomForestRegressor(n_estimators=250, oob_score=True, max_features='sqrt',min_samples_split=2, n_jobs=4)
clf.fit(X_train, Y_train)
Out[105]:
In [106]:
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))
In [107]:
scores_randomforest = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=4)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_randomforest.mean(), scores_randomforest.std() * 2))
In [108]:
# Refit a fresh forest for the final data-vs-model figure.
clf = RandomForestRegressor(n_estimators=250, oob_score=True, max_features='sqrt',min_samples_split=2, n_jobs=4)
clf.fit(X_train, Y_train)
Out[108]:
In [109]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
#ax.text(-1.5, 3, r'$R^2 = $%.2f'%(clf.oob_score_), fontsize=30)
ax.set_xlabel("CDC data", fontsize=20)
ax.set_ylabel("Model prediction", fontsize=20)
# Scatter of actual vs. predicted values, with the identity line in red.
ax = plt.scatter(Y,clf.predict(X))
ax2 = plt.plot(np.linspace(Y.min(),Y.max(),10),np.linspace(Y.min(),Y.max(),10),color='red')
plt.savefig('../graphics/data_vs_model_gonorrhea.png', bbox_inches='tight', dpi=150)
In [828]:
from collections import OrderedDict
RANDOM_STATE = 123
ensemble_clfs = [
("RandomForestRegressor, max_features='sqrt'",
RandomForestRegressor(warm_start=False, oob_score=True,
max_features="sqrt",
random_state=RANDOM_STATE)),
("RandomForestRegressor, max_features='log2'",
RandomForestRegressor(warm_start=False, max_features='log2',
oob_score=True,
random_state=RANDOM_STATE)),
("RandomForestRegressor, max_features=None",
RandomForestRegressor(warm_start=False, max_features=None,
oob_score=True,
random_state=RANDOM_STATE))
]
# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
# Range of `n_estimators` values to explore.
min_estimators = 10
max_estimators = 250
for label, clf in ensemble_clfs:
for i in range(min_estimators, max_estimators,10):
clf.set_params(n_estimators=i)
clf.fit(X_train, Y_train)
# Record the OOB error for each `n_estimators=i` setting.
oob_error = clf.oob_score_
error_rate[label].append((i, oob_error))
# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
xs, ys = zip(*clf_err)
plt.plot(xs, ys, label=label)
plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB score")
plt.legend(loc="lower right")
plt.savefig('../graphics/features_estimators_gonorrhea.png', bbox_inches='tight', dpi=150)
In [111]:
feature_importance = (np.vstack((np.arange(len(clf.feature_importances_)), clf.feature_importances_)).T)
ranking = feature_importance[feature_importance[:,1].argsort()[::-1]]
In [112]:
for rank, importance in ranking:
print(df_new.columns[rank], importance)
In [113]:
# Human-readable labels for the census column codes used in df_new.
labels_dict = {
"Population": "Population",
"hd01s001": "log10(Population)",
"hd02s002": "0-14",
"hd02s005": "15-19",
"hd02s006": "20-24",
"hd02s007": "25-29",
"hd02s008": "30-34",
"hd02s009": "35-39",
"hd02s010": "40-44",
"hd02s011": "45-54",
"hd02s013": "55-64",
"hd02s015": "65+",
"hd01s020": "Median age",
"hd02s026": "Male percent",
"hd02s051": "Female percent",
"hd02s078": "White",
"hd02s079": "Black",
"hd02s080": "Native",
"hd02s081": "Asian",
"hd02s089": "Pacific/Hawaiian",
"hd02s095": "Two or more races",
"hd02s107": "Hispanic",
"hd02s131": "In households",
"hd02s132": "Householder",
"hd02s133": "Spouse",
"hd02s134": "Child",
"hd02s135": "Child w own child under 18",
"hd02s136": "Other relatives",
"hd02s143": "In group quarters",
"hd02s151": "Family households",
"hd02s152": "Family households w own child under 18",
"hd02s153": "Husband-wife family",
"hd02s154": "Husband-wife family w own child under 18",
"hd02s159": "Nonfamily households",
"hd01s167": "Average household size",
"hd01s168": "Average family size",
"hd02s181": "Owner occupied housing units",
"hd02s184": "Renter occupied housing units",
"hd01vd01": "Median income",
"d001": "Total households",
"d002": "Husband-wife households",
"d014": "Unmarried-partner households: - Male householder and male partner",
"d019": "Unmarried-partner households: - Male householder and female partner",
"d024": "Unmarried-partner households: - Female householder and female partner",
"d029": "Unmarried-partner households: - Female householder and male partner",
"lnd110210d": "Population density"}
In [114]:
ranked_list = []
ranked_labels = []
for rank, importance in ranking:
print(labels_dict[df_new.columns[rank]], importance)
ranked_list.append(importance)
ranked_labels.append(labels_dict[df_new.columns[rank]])
In [115]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
ax.set_xlabel("Feature importance", fontsize=20)
ax.set_ylabel("Feature", fontsize=20)
# Horizontal bar chart of the features ordered by forest importance.
ax = sns.barplot(x=ranked_list, y=ranked_labels)
plt.savefig('../graphics/feature_importance_ranking_gonorrhea.png', bbox_inches='tight', dpi=150)
In [116]:
len(ranked_list)
Out[116]:
Save model parameters for use in web app:
In [110]:
import pickle
In [111]:
with open("../data/randomforest_params_gonorrhea.pickle", "wb") as myfile:
pickle.dump(clf, myfile)
In [112]:
with open("../data/Ymean_gonorrhea.pickle", "wb") as myfile:
pickle.dump(Ymean, myfile)
In [113]:
with open("../data/Ystd_gonorrhea.pickle", "wb") as myfile:
pickle.dump(Ystd, myfile)
In [114]:
deployed_model = pickle.load(open('../data/randomforest_params_gonorrhea.pickle', "rb" ))
In [115]:
print('Variance score: %.5f\t(%.5f)' % (deployed_model.score(X_test, Y_test), deployed_model.score(X_full, Y_full)))
In [123]:
from sklearn.svm import SVR
In [124]:
# RBF-kernel support vector regression.
# NOTE(review): C/gamma/epsilon look hand-tuned — no search is shown here.
svr_rbf = SVR(kernel='rbf', C=15, gamma=0.0001, epsilon=0.0005, tol=0.00001)
#svr_lin = SVR(kernel='linear', C=1, epsilon=0.001)
#svr_poly = SVR(kernel='poly', C=1, degree=2, epsilon=0.001)
svr_rbf.fit(X_train, Y_train)
#svr_lin.fit(X_train, Y_train)
#svr_poly.fit(X_train, Y_train)
#print('Variance score:\n\t%.5f\t(rbf)\n\t%.5f\t(lin)\n\t%.5f\t(poly)' % (svr_rbf.score(X_train, Y_train), svr_lin.score(X_train, Y_train),svr_poly.score(X_train, Y_train)))
print('Variance score:\n\t%.5f\t(rbf)' % (svr_rbf.score(X_test, Y_test)))
In [125]:
scores_svm = cross_validation.cross_val_score(svr_rbf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_svm.mean(), scores_svm.std() * 2))
In [126]:
model_scores = []
model_errors = []
model_names = []
model_scores.append(np.mean(scores_regression))
model_errors.append(np.std(scores_regression)*2)
model_names.append("Linear")
model_scores.append(np.mean(scores_rregression))
model_errors.append(np.std(scores_rregression)*2)
model_names.append("Ridge")
model_scores.append(np.mean(scores_etregression))
model_errors.append(np.std(scores_etregression)*2)
model_names.append("Extra trees")
model_scores.append(np.mean(scores_adaregression))
model_errors.append(np.std(scores_adaregression)*2)
model_names.append("ADA boost")
model_scores.append(np.mean(scores_bagregression))
model_errors.append(np.std(scores_bagregression)*2)
model_names.append("Bagging")
model_scores.append(np.mean(scores_gradboostregression))
model_errors.append(np.std(scores_gradboostregression)*2)
model_names.append("Gradient boost")
model_scores.append(np.mean(scores_svm))
model_errors.append(np.std(scores_svm)*2)
model_names.append("Suport vector")
model_scores.append(np.mean(scores_randomforest))
model_errors.append(np.std(scores_randomforest)*2)
model_names.append("Random forest")
In [127]:
model_names, model_scores, model_errors
Out[127]:
In [128]:
# Set up the matplotlib figure
sns.set_context("poster", font_scale=1)
f, ax = plt.subplots(figsize=(11, 9))
ax.set_xlabel("Model", fontsize=20)
ax.set_ylabel("Cross validation score", fontsize=20)
# NOTE(review): older seaborn forwards yerr to matplotlib's bar(); newer
# versions may reject it — confirm against the installed seaborn version.
ax = sns.barplot(x=model_names, y=model_scores, yerr=model_errors)
plt.xticks(rotation=90)
plt.savefig('../graphics/model_performance_gonorrhea.png', bbox_inches='tight', dpi=150)
In [ ]:
In [ ]: