STD & me

Analysis of 2014 data from the CDC on the prevalence of STD's in U.S. counties.


In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [2]:
# Always display all the columns
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 200) 

# Plain Seaborn figures with matplotlib color codes mapped to the default seaborn palette 
sns.set(style="white", color_codes=True)

CDC data on Gonorrhea


In [3]:
df = pd.read_csv("../data/cdc/gonorrhea.csv")

In [4]:
df.shape


Out[4]:
(3228, 12)

In [5]:
df.columns


Out[5]:
Index(['Disease', 'Area', 'State Abbreviation', 'FIPS', 'Year', 'Race', 'Sex', 'Age group', 'Transmission Category', 'Population', 'Cases', 'Rate'], dtype='object')

In [6]:
df.dtypes


Out[6]:
Disease                  object
Area                     object
State Abbreviation       object
FIPS                      int64
Year                      int64
Race                     object
Sex                      object
Age group                object
Transmission Category    object
Population               object
Cases                    object
Rate                     object
dtype: object

In [7]:
df_test = df.convert_objects(convert_numeric=True)
df_test.dtypes


/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':
Out[7]:
Disease                   object
Area                      object
State Abbreviation        object
FIPS                       int64
Year                       int64
Race                      object
Sex                       object
Age group                 object
Transmission Category     object
Population               float64
Cases                    float64
Rate                     float64
dtype: object

In [8]:
df_test.head()


Out[8]:
Disease Area State Abbreviation FIPS Year Race Sex Age group Transmission Category Population Cases Rate
0 Gonorrhea Autauga County AL 1001 2014 All races/ethnicities Both sexes All age groups All transmission categories NaN 48 86.9
1 Gonorrhea Baldwin County AL 1003 2014 All races/ethnicities Both sexes All age groups All transmission categories NaN 153 78.2
2 Gonorrhea Barbour County AL 1005 2014 All races/ethnicities Both sexes All age groups All transmission categories NaN 52 192.1
3 Gonorrhea Bibb County AL 1007 2014 All races/ethnicities Both sexes All age groups All transmission categories NaN 22 97.7
4 Gonorrhea Blount County AL 1009 2014 All races/ethnicities Both sexes All age groups All transmission categories NaN 6 10.4

In [9]:
df['Population'] = df['Population'].str.replace(',','')
df['Cases'] = df['Cases'].str.replace(',','')

In [10]:
df = df.convert_objects(convert_numeric=True)
df.dtypes


/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':
Out[10]:
Disease                   object
Area                      object
State Abbreviation        object
FIPS                       int64
Year                       int64
Race                      object
Sex                       object
Age group                 object
Transmission Category     object
Population               float64
Cases                    float64
Rate                     float64
dtype: object

In [11]:
df.head(77)


Out[11]:
Disease Area State Abbreviation FIPS Year Race Sex Age group Transmission Category Population Cases Rate
0 Gonorrhea Autauga County AL 1001 2014 All races/ethnicities Both sexes All age groups All transmission categories 55246 48 86.9
1 Gonorrhea Baldwin County AL 1003 2014 All races/ethnicities Both sexes All age groups All transmission categories 195540 153 78.2
2 Gonorrhea Barbour County AL 1005 2014 All races/ethnicities Both sexes All age groups All transmission categories 27076 52 192.1
3 Gonorrhea Bibb County AL 1007 2014 All races/ethnicities Both sexes All age groups All transmission categories 22512 22 97.7
4 Gonorrhea Blount County AL 1009 2014 All races/ethnicities Both sexes All age groups All transmission categories 57872 6 10.4
5 Gonorrhea Bullock County AL 1011 2014 All races/ethnicities Both sexes All age groups All transmission categories 10639 27 253.8
6 Gonorrhea Butler County AL 1013 2014 All races/ethnicities Both sexes All age groups All transmission categories 20265 29 143.1
7 Gonorrhea Calhoun County AL 1015 2014 All races/ethnicities Both sexes All age groups All transmission categories 116736 229 196.2
8 Gonorrhea Chambers County AL 1017 2014 All races/ethnicities Both sexes All age groups All transmission categories 34162 73 213.7
9 Gonorrhea Cherokee County AL 1019 2014 All races/ethnicities Both sexes All age groups All transmission categories 26203 24 91.6
10 Gonorrhea Chilton County AL 1021 2014 All races/ethnicities Both sexes All age groups All transmission categories 43951 44 100.1
11 Gonorrhea Choctaw County AL 1023 2014 All races/ethnicities Both sexes All age groups All transmission categories 13426 8 59.6
12 Gonorrhea Clarke County AL 1025 2014 All races/ethnicities Both sexes All age groups All transmission categories 25207 28 111.1
13 Gonorrhea Clay County AL 1027 2014 All races/ethnicities Both sexes All age groups All transmission categories 13486 14 103.8
14 Gonorrhea Cleburne County AL 1029 2014 All races/ethnicities Both sexes All age groups All transmission categories 14994 6 40.0
15 Gonorrhea Coffee County AL 1031 2014 All races/ethnicities Both sexes All age groups All transmission categories 50938 64 125.6
16 Gonorrhea Colbert County AL 1033 2014 All races/ethnicities Both sexes All age groups All transmission categories 54520 62 113.7
17 Gonorrhea Conecuh County AL 1035 2014 All races/ethnicities Both sexes All age groups All transmission categories 12887 9 69.8
18 Gonorrhea Coosa County AL 1037 2014 All races/ethnicities Both sexes All age groups All transmission categories 10898 11 100.9
19 Gonorrhea Covington County AL 1039 2014 All races/ethnicities Both sexes All age groups All transmission categories 37886 25 66.0
20 Gonorrhea Crenshaw County AL 1041 2014 All races/ethnicities Both sexes All age groups All transmission categories 13986 18 128.7
21 Gonorrhea Cullman County AL 1043 2014 All races/ethnicities Both sexes All age groups All transmission categories 80811 21 26.0
22 Gonorrhea Dale County AL 1045 2014 All races/ethnicities Both sexes All age groups All transmission categories 49884 60 120.3
23 Gonorrhea Dallas County AL 1047 2014 All races/ethnicities Both sexes All age groups All transmission categories 41996 60 142.9
24 Gonorrhea Dekalb County AL 1049 2014 All races/ethnicities Both sexes All age groups All transmission categories 71013 43 60.6
25 Gonorrhea Elmore County AL 1051 2014 All races/ethnicities Both sexes All age groups All transmission categories 80902 97 119.9
26 Gonorrhea Escambia County AL 1053 2014 All races/ethnicities Both sexes All age groups All transmission categories 37983 52 136.9
27 Gonorrhea Etowah County AL 1055 2014 All races/ethnicities Both sexes All age groups All transmission categories 103931 185 178.0
28 Gonorrhea Fayette County AL 1057 2014 All races/ethnicities Both sexes All age groups All transmission categories 16909 5 29.6
29 Gonorrhea Franklin County AL 1059 2014 All races/ethnicities Both sexes All age groups All transmission categories 31532 8 25.4
... ... ... ... ... ... ... ... ... ... ... ... ...
47 Gonorrhea Marshall County AL 1095 2014 All races/ethnicities Both sexes All age groups All transmission categories 94760 25 26.4
48 Gonorrhea Mobile County AL 1097 2014 All races/ethnicities Both sexes All age groups All transmission categories 414079 715 172.7
49 Gonorrhea Monroe County AL 1099 2014 All races/ethnicities Both sexes All age groups All transmission categories 22236 11 49.5
50 Gonorrhea Montgomery County AL 1101 2014 All races/ethnicities Both sexes All age groups All transmission categories 226659 959 423.1
51 Gonorrhea Morgan County AL 1103 2014 All races/ethnicities Both sexes All age groups All transmission categories 119787 111 92.7
52 Gonorrhea Perry County AL 1105 2014 All races/ethnicities Both sexes All age groups All transmission categories 10020 13 129.7
53 Gonorrhea Pickens County AL 1107 2014 All races/ethnicities Both sexes All age groups All transmission categories 19401 27 139.2
54 Gonorrhea Pike County AL 1109 2014 All races/ethnicities Both sexes All age groups All transmission categories 33339 90 270.0
55 Gonorrhea Randolph County AL 1111 2014 All races/ethnicities Both sexes All age groups All transmission categories 22727 11 48.4
56 Gonorrhea Russell County AL 1113 2014 All races/ethnicities Both sexes All age groups All transmission categories 59585 119 199.7
57 Gonorrhea St. Clair County AL 1115 2014 All races/ethnicities Both sexes All age groups All transmission categories 86308 54 62.6
58 Gonorrhea Shelby County AL 1117 2014 All races/ethnicities Both sexes All age groups All transmission categories 204180 94 46.0
59 Gonorrhea Sumter County AL 1119 2014 All races/ethnicities Both sexes All age groups All transmission categories 13361 22 164.7
60 Gonorrhea Talladega County AL 1121 2014 All races/ethnicities Both sexes All age groups All transmission categories 81096 212 261.4
61 Gonorrhea Tallapoosa County AL 1123 2014 All races/ethnicities Both sexes All age groups All transmission categories 41203 54 131.1
62 Gonorrhea Tuscaloosa County AL 1125 2014 All races/ethnicities Both sexes All age groups All transmission categories 200821 302 150.4
63 Gonorrhea Walker County AL 1127 2014 All races/ethnicities Both sexes All age groups All transmission categories 65998 77 116.7
64 Gonorrhea Washington County AL 1129 2014 All races/ethnicities Both sexes All age groups All transmission categories 16877 18 106.7
65 Gonorrhea Wilcox County AL 1131 2014 All races/ethnicities Both sexes All age groups All transmission categories 11307 14 123.8
66 Gonorrhea Winston County AL 1133 2014 All races/ethnicities Both sexes All age groups All transmission categories 24146 2 8.3
67 Gonorrhea Aleutians East Borough AK 2013 2014 All races/ethnicities Both sexes All age groups All transmission categories 3092 4 129.4
68 Gonorrhea Aleutians West Census Area AK 2016 2014 All races/ethnicities Both sexes All age groups All transmission categories 5511 1 18.1
69 Gonorrhea Anchorage Municipality AK 2020 2014 All races/ethnicities Both sexes All age groups All transmission categories 300950 782 259.8
70 Gonorrhea Bethel Census Area AK 2050 2014 All races/ethnicities Both sexes All age groups All transmission categories 17758 79 444.9
71 Gonorrhea Bristol Bay Borough AK 2060 2014 All races/ethnicities Both sexes All age groups All transmission categories 960 1 104.2
72 Gonorrhea Denali Borough AK 2068 2014 All races/ethnicities Both sexes All age groups All transmission categories 1867 0 0.0
73 Gonorrhea Dillingham Census Area AK 2070 2014 All races/ethnicities Both sexes All age groups All transmission categories 5010 23 459.1
74 Gonorrhea Fairbanks North Star Borough AK 2090 2014 All races/ethnicities Both sexes All age groups All transmission categories 100436 84 83.6
75 Gonorrhea Haines Borough AK 2100 2014 All races/ethnicities Both sexes All age groups All transmission categories 2592 0 0.0
76 Gonorrhea Hoonah-Angoon Census Area AK 2105 2014 All races/ethnicities Both sexes All age groups All transmission categories NaN NaN NaN

77 rows × 12 columns


In [12]:
df['Population'].describe()


Out[12]:
count        3220.000000
mean        99360.995963
std        318648.364529
min            90.000000
25%         11267.750000
50%         26165.500000
75%         66834.250000
max      10017068.000000
Name: Population, dtype: float64

In [13]:
df['Population'].idxmax()


Out[13]:
207

In [14]:
df.loc[207]


Out[14]:
Disease                                    Gonorrhea
Area                              Los Angeles County
State Abbreviation                                CA
FIPS                                            6037
Year                                            2014
Race                           All races/ethnicities
Sex                                       Both sexes
Age group                             All age groups
Transmission Category    All transmission categories
Population                               1.00171e+07
Cases                                          15316
Rate                                           152.9
Name: 207, dtype: object

In [15]:
fig = plt.figure(figsize=(10, 6))
data = np.log10(df['Population'])
ax = data.plot.hist(50)
ax.set_xlabel("log(Population)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_population_gonorrhea.png', bbox_inches='tight', dpi=150)



In [16]:
fig = plt.figure(figsize=(10, 6))
data = np.log10(df['Cases']+1)
ax = data.plot.hist(50)
ax.set_xlabel("log(Cases)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_cases_gonorrhea.png', bbox_inches='tight', dpi=150)



In [17]:
fig = plt.figure(figsize=(10, 6))
ax = df['Rate'].plot.hist(100)
ax.set_xlabel("Rate (per 100,000 inhabitants)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_rate_gonorrhea.png', bbox_inches='tight', dpi=150)



In [18]:
outliers = df[df['Rate']<10]

In [19]:
outliers


Out[19]:
Disease Area State Abbreviation FIPS Year Race Sex Age group Transmission Category Population Cases Rate
66 Gonorrhea Winston County AL 1133 2014 All races/ethnicities Both sexes All age groups All transmission categories 24146 2 8.3
72 Gonorrhea Denali Borough AK 2068 2014 All races/ethnicities Both sexes All age groups All transmission categories 1867 0 0.0
75 Gonorrhea Haines Borough AK 2100 2014 All races/ethnicities Both sexes All age groups All transmission categories 2592 0 0.0
88 Gonorrhea Prince of Wales - Outer Ketchikan AK 2201 2014 All races/ethnicities Both sexes All age groups All transmission categories 5786 0 0.0
93 Gonorrhea Valdez-Cordova Census Area AK 2261 2014 All races/ethnicities Both sexes All age groups All transmission categories 9763 0 0.0
96 Gonorrhea Wrangell-Petersburg Census Area AK 2280 2014 All races/ethnicities Both sexes All age groups All transmission categories 6174 0 0.0
97 Gonorrhea Yakutat City and Borough AK 2282 2014 All races/ethnicities Both sexes All age groups All transmission categories 642 0 0.0
125 Gonorrhea Cleburne County AR 5023 2014 All races/ethnicities Both sexes All age groups All transmission categories 25686 1 3.9
138 Gonorrhea Fulton County AR 5049 2014 All races/ethnicities Both sexes All age groups All transmission categories 12304 0 0.0
157 Gonorrhea Madison County AR 5087 2014 All races/ethnicities Both sexes All age groups All transmission categories 15701 0 0.0
158 Gonorrhea Marion County AR 5089 2014 All races/ethnicities Both sexes All age groups All transmission categories 16430 0 0.0
164 Gonorrhea Newton County AR 5101 2014 All races/ethnicities Both sexes All age groups All transmission categories 8064 0 0.0
166 Gonorrhea Perry County AR 5105 2014 All races/ethnicities Both sexes All age groups All transmission categories 10345 0 0.0
170 Gonorrhea Polk County AR 5113 2014 All races/ethnicities Both sexes All age groups All transmission categories 20406 2 9.8
190 Gonorrhea Alpine County CA 6003 2014 All races/ethnicities Both sexes All age groups All transmission categories 1159 0 0.0
251 Gonorrhea Baca County CO 8009 2014 All races/ethnicities Both sexes All age groups All transmission categories 3682 0 0.0
256 Gonorrhea Cheyenne County CO 8017 2014 All races/ethnicities Both sexes All age groups All transmission categories 1890 0 0.0
259 Gonorrhea Costilla County CO 8023 2014 All races/ethnicities Both sexes All age groups All transmission categories 3518 0 0.0
262 Gonorrhea Delta County CO 8029 2014 All races/ethnicities Both sexes All age groups All transmission categories 30483 1 3.3
264 Gonorrhea Dolores County CO 8033 2014 All races/ethnicities Both sexes All age groups All transmission categories 2029 0 0.0
266 Gonorrhea Eagle County CO 8037 2014 All races/ethnicities Both sexes All age groups All transmission categories 52460 5 9.5
273 Gonorrhea Gunnison County CO 8051 2014 All races/ethnicities Both sexes All age groups All transmission categories 15507 1 6.4
274 Gonorrhea Hinsdale County CO 8053 2014 All races/ethnicities Both sexes All age groups All transmission categories 813 0 0.0
276 Gonorrhea Jackson County CO 8057 2014 All races/ethnicities Both sexes All age groups All transmission categories 1365 0 0.0
278 Gonorrhea Kiowa County CO 8061 2014 All races/ethnicities Both sexes All age groups All transmission categories 1423 0 0.0
279 Gonorrhea Kit Carson County CO 8063 2014 All races/ethnicities Both sexes All age groups All transmission categories 8037 0 0.0
285 Gonorrhea Logan County CO 8075 2014 All races/ethnicities Both sexes All age groups All transmission categories 22450 1 4.5
287 Gonorrhea Mineral County CO 8079 2014 All races/ethnicities Both sexes All age groups All transmission categories 721 0 0.0
290 Gonorrhea Montrose County CO 8085 2014 All races/ethnicities Both sexes All age groups All transmission categories 40713 1 2.5
292 Gonorrhea Otero County CO 8089 2014 All races/ethnicities Both sexes All age groups All transmission categories 18703 1 5.3
... ... ... ... ... ... ... ... ... ... ... ... ...
3177 Gonorrhea Guànica Municipio PR 72055 2014 All races/ethnicities Both sexes All age groups All transmission categories 17852 0 0.0
3178 Gonorrhea Guayama Municipio PR 72057 2014 All races/ethnicities Both sexes All age groups All transmission categories 43467 2 4.6
3179 Gonorrhea Guayanilla Municipio PR 72059 2014 All races/ethnicities Both sexes All age groups All transmission categories 20148 1 5.0
3180 Gonorrhea Guaynabo Municipio PR 72061 2014 All races/ethnicities Both sexes All age groups All transmission categories 92799 6 6.5
3182 Gonorrhea Hatillo Municipio PR 72065 2014 All races/ethnicities Both sexes All age groups All transmission categories 41618 2 4.8
3186 Gonorrhea Jayuya Municipio PR 72073 2014 All races/ethnicities Both sexes All age groups All transmission categories 15693 0 0.0
3189 Gonorrhea Lajas Municipio PR 72079 2014 All races/ethnicities Both sexes All age groups All transmission categories 24465 0 0.0
3190 Gonorrhea Lares Municipio PR 72081 2014 All races/ethnicities Both sexes All age groups All transmission categories 28208 0 0.0
3191 Gonorrhea Las MarÕas Municipio PR 72083 2014 All races/ethnicities Both sexes All age groups All transmission categories 9158 0 0.0
3196 Gonorrhea Maricao Municipio PR 72093 2014 All races/ethnicities Both sexes All age groups All transmission categories 6022 0 0.0
3197 Gonorrhea Maunabo Municipio PR 72095 2014 All races/ethnicities Both sexes All age groups All transmission categories 11565 0 0.0
3199 Gonorrhea Moca Municipio PR 72099 2014 All races/ethnicities Both sexes All age groups All transmission categories 38461 0 0.0
3200 Gonorrhea Morovis Municipio PR 72101 2014 All races/ethnicities Both sexes All age groups All transmission categories 32194 1 3.1
3202 Gonorrhea Naranjito Municipio PR 72105 2014 All races/ethnicities Both sexes All age groups All transmission categories 29602 1 3.4
3203 Gonorrhea Orocovis Municipio PR 72107 2014 All races/ethnicities Both sexes All age groups All transmission categories 22392 1 4.5
3204 Gonorrhea Patillas Municipio PR 72109 2014 All races/ethnicities Both sexes All age groups All transmission categories 18261 0 0.0
3205 Gonorrhea PeÐuelas Municipio PR 72111 2014 All races/ethnicities Both sexes All age groups All transmission categories 22365 1 4.5
3207 Gonorrhea Quebradillas Municipio PR 72115 2014 All races/ethnicities Both sexes All age groups All transmission categories 25042 0 0.0
3210 Gonorrhea Sabana Grande Municipio PR 72121 2014 All races/ethnicities Both sexes All age groups All transmission categories 24121 0 0.0
3211 Gonorrhea Salinas Municipio PR 72123 2014 All races/ethnicities Both sexes All age groups All transmission categories 29881 2 6.7
3212 Gonorrhea San Germàn Municipio PR 72125 2014 All races/ethnicities Both sexes All age groups All transmission categories 33725 1 3.0
3215 Gonorrhea San Sebastiàn Municipio PR 72131 2014 All races/ethnicities Both sexes All age groups All transmission categories 39969 1 2.5
3216 Gonorrhea Santa Isabel Municipio PR 72133 2014 All races/ethnicities Both sexes All age groups All transmission categories 22860 1 4.4
3217 Gonorrhea Toa Alta Municipio PR 72135 2014 All races/ethnicities Both sexes All age groups All transmission categories 74837 7 9.4
3220 Gonorrhea Utuado Municipio PR 72141 2014 All races/ethnicities Both sexes All age groups All transmission categories 31050 0 0.0
3221 Gonorrhea Vega Alta Municipio PR 72143 2014 All races/ethnicities Both sexes All age groups All transmission categories 39236 3 7.6
3222 Gonorrhea Vega Baja Municipio PR 72145 2014 All races/ethnicities Both sexes All age groups All transmission categories 56166 2 3.6
3224 Gonorrhea Villalba Municipio PR 72149 2014 All races/ethnicities Both sexes All age groups All transmission categories 24389 0 0.0
3225 Gonorrhea Yabucoa Municipio PR 72151 2014 All races/ethnicities Both sexes All age groups All transmission categories 35879 3 8.4
3226 Gonorrhea Yauco Municipio PR 72153 2014 All races/ethnicities Both sexes All age groups All transmission categories 38782 0 0.0

672 rows × 12 columns


In [20]:
not_exclude_list = df["Cases"]<0
exclude_list = [not i for i in not_exclude_list]
df_sig = df[exclude_list].copy()

In [21]:
df = df_sig.copy()

In [22]:
df["Rate"].sort_values()


Out[22]:
880       0.0
1654      0.0
1655      0.0
3083      0.0
1234      0.0
3089      0.0
3090      0.0
1658      0.0
660       0.0
1659      0.0
1660      0.0
2807      0.0
1651      0.0
1661      0.0
2423      0.0
1665      0.0
310       0.0
1190      0.0
308       0.0
305       0.0
675       0.0
303       0.0
302       0.0
1671      0.0
1676      0.0
1664      0.0
1650      0.0
1649      0.0
1235      0.0
620       0.0
        ...  
1152    451.9
148     456.3
1149    458.3
73      459.1
2713    462.8
2948    478.7
81      485.4
1600    486.2
2375    508.6
1983    528.9
159     529.9
1995    567.1
2944    633.1
2384    716.1
2424    801.4
2379    830.4
2420    835.8
1643    844.9
2032    850.4
2035    857.8
94      877.5
76        NaN
83        NaN
86        NaN
87        NaN
90        NaN
95        NaN
2919      NaN
3146      NaN
3148      NaN
Name: Rate, dtype: float64

In [23]:
len(df['Area'].unique())


Out[23]:
1962

In [24]:
df.shape


Out[24]:
(3228, 12)

In [25]:
df.sort_values(by='Area')


Out[25]:
Disease Area State Abbreviation FIPS Year Race Sex Age group Transmission Category Population Cases Rate
2319 Gonorrhea Abbeville County SC 45001 2014 All races/ethnicities Both sexes All age groups All transmission categories 25007 43 172.0
1116 Gonorrhea Acadia Parish LA 22001 2014 All races/ethnicities Both sexes All age groups All transmission categories 62204 106 170.4
2823 Gonorrhea Accomack County VA 51001 2014 All races/ethnicities Both sexes All age groups All transmission categories 33148 28 84.5
554 Gonorrhea Ada County ID 16001 2014 All races/ethnicities Both sexes All age groups All transmission categories 416464 205 49.2
996 Gonorrhea Adair County KY 21001 2014 All races/ethnicities Both sexes All age groups All transmission categories 18732 6 32.0
1486 Gonorrhea Adair County MO 29001 2014 All races/ethnicities Both sexes All age groups All transmission categories 25572 8 31.3
792 Gonorrhea Adair County IA 19001 2014 All races/ethnicities Both sexes All age groups All transmission categories 7472 2 26.8
2134 Gonorrhea Adair County OK 40001 2014 All races/ethnicities Both sexes All age groups All transmission categories 22194 11 49.6
1657 Gonorrhea Adams County NE 31001 2014 All races/ethnicities Both sexes All age groups All transmission categories 31610 8 25.3
700 Gonorrhea Adams County IN 18001 2014 All races/ethnicities Both sexes All age groups All transmission categories 34614 6 17.3
555 Gonorrhea Adams County ID 16003 2014 All races/ethnicities Both sexes All age groups All transmission categories 3828 0 0.0
1404 Gonorrhea Adams County MS 28001 2014 All races/ethnicities Both sexes All age groups All transmission categories 32090 56 174.5
793 Gonorrhea Adams County IA 19003 2014 All races/ethnicities Both sexes All age groups All transmission categories 3894 0 0.0
598 Gonorrhea Adams County IL 17001 2014 All races/ethnicities Both sexes All age groups All transmission categories 67130 33 49.2
2046 Gonorrhea Adams County OH 39001 2014 All races/ethnicities Both sexes All age groups All transmission categories 28105 2 7.1
1993 Gonorrhea Adams County ND 38001 2014 All races/ethnicities Both sexes All age groups All transmission categories 2360 1 42.4
247 Gonorrhea Adams County CO 8001 2014 All races/ethnicities Both sexes All age groups All transmission categories 469193 233 49.7
3051 Gonorrhea Adams County WI 55001 2014 All races/ethnicities Both sexes All age groups All transmission categories 20480 3 14.6
2957 Gonorrhea Adams County WA 53001 2014 All races/ethnicities Both sexes All age groups All transmission categories 19067 10 52.4
2247 Gonorrhea Adams County PA 42001 2014 All races/ethnicities Both sexes All age groups All transmission categories 101546 18 17.7
2809 Gonorrhea Addison County VT 50001 2014 All races/ethnicities Both sexes All age groups All transmission categories 36791 4 10.9
3149 Gonorrhea Adjuntas Municipio PR 72001 2014 All races/ethnicities Both sexes All age groups All transmission categories 18900 2 10.6
3150 Gonorrhea Aguada Municipio PR 72003 2014 All races/ethnicities Both sexes All age groups All transmission categories 40329 3 7.4
3151 Gonorrhea Aguadilla Municipio PR 72005 2014 All races/ethnicities Both sexes All age groups All transmission categories 57290 7 12.2
3152 Gonorrhea Aguas Buenas Municipio PR 72007 2014 All races/ethnicities Both sexes All age groups All transmission categories 27473 2 7.3
3153 Gonorrhea Aibonito Municipio PR 72009 2014 All races/ethnicities Both sexes All age groups All transmission categories 24561 3 12.2
2320 Gonorrhea Aiken County SC 45003 2014 All races/ethnicities Both sexes All age groups All transmission categories 164176 280 170.5
1317 Gonorrhea Aitkin County MN 27001 2014 All races/ethnicities Both sexes All age groups All transmission categories 15742 5 31.8
323 Gonorrhea Alachua County FL 12001 2014 All races/ethnicities Both sexes All age groups All transmission categories 253451 411 162.2
1893 Gonorrhea Alamance County NC 37001 2014 All races/ethnicities Both sexes All age groups All transmission categories 154378 262 169.7
... ... ... ... ... ... ... ... ... ... ... ... ...
3225 Gonorrhea Yabucoa Municipio PR 72151 2014 All races/ethnicities Both sexes All age groups All transmission categories 35879 3 8.4
1991 Gonorrhea Yadkin County NC 37197 2014 All races/ethnicities Both sexes All age groups All transmission categories 38038 13 34.2
2995 Gonorrhea Yakima County WA 53077 2014 All races/ethnicities Both sexes All age groups All transmission categories 247044 409 165.6
97 Gonorrhea Yakutat City and Borough AK 2282 2014 All races/ethnicities Both sexes All age groups All transmission categories 642 0 0.0
1484 Gonorrhea Yalobusha County MS 28161 2014 All races/ethnicities Both sexes All age groups All transmission categories 12373 18 145.5
2246 Gonorrhea Yamhill County OR 41071 2014 All races/ethnicities Both sexes All age groups All transmission categories 100725 18 17.9
1992 Gonorrhea Yancey County NC 37199 2014 All races/ethnicities Both sexes All age groups All transmission categories 17566 2 11.4
2429 Gonorrhea Yankton County SD 46135 2014 All races/ethnicities Both sexes All age groups All transmission categories 22696 8 35.2
1892 Gonorrhea Yates County NY 36123 2014 All races/ethnicities Both sexes All age groups All transmission categories 25156 2 8.0
3226 Gonorrhea Yauco Municipio PR 72153 2014 All races/ethnicities Both sexes All age groups All transmission categories 38782 0 0.0
112 Gonorrhea Yavapai County AZ 4025 2014 All races/ethnicities Both sexes All age groups All transmission categories 215133 51 23.7
1485 Gonorrhea Yazoo County MS 28163 2014 All races/ethnicities Both sexes All age groups All transmission categories 27883 103 369.4
188 Gonorrhea Yell County AR 5149 2014 All races/ethnicities Both sexes All age groups All transmission categories 21893 8 36.5
1403 Gonorrhea Yellow Medicine County MN 27173 2014 All races/ethnicities Both sexes All age groups All transmission categories 10143 1 9.9
1656 Gonorrhea Yellowstone County MT 30111 2014 All races/ethnicities Both sexes All age groups All transmission categories 154162 101 65.5
2776 Gonorrhea Yoakum County TX 48501 2014 All races/ethnicities Both sexes All age groups All transmission categories 8184 5 61.1
245 Gonorrhea Yolo County CA 6113 2014 All races/ethnicities Both sexes All age groups All transmission categories 204593 194 94.8
2917 Gonorrhea York County VA 51199 2014 All races/ethnicities Both sexes All age groups All transmission categories 66269 34 51.3
2364 Gonorrhea York County SC 45091 2014 All races/ethnicities Both sexes All age groups All transmission categories 239363 311 129.9
1749 Gonorrhea York County NE 31185 2014 All races/ethnicities Both sexes All age groups All transmission categories 13883 5 36.0
1195 Gonorrhea York County ME 23031 2014 All races/ethnicities Both sexes All age groups All transmission categories 199431 35 17.5
2313 Gonorrhea York County PA 42133 2014 All races/ethnicities Both sexes All age groups All transmission categories 438965 294 67.0
2777 Gonorrhea Young County TX 48503 2014 All races/ethnicities Both sexes All age groups All transmission categories 18341 11 60.0
246 Gonorrhea Yuba County CA 6115 2014 All races/ethnicities Both sexes All age groups All transmission categories 73340 86 117.3
98 Gonorrhea Yukon-Koyukuk Census Area AK 2290 2014 All races/ethnicities Both sexes All age groups All transmission categories 5695 20 351.2
113 Gonorrhea Yuma County AZ 4027 2014 All races/ethnicities Both sexes All age groups All transmission categories 201201 124 61.6
310 Gonorrhea Yuma County CO 8125 2014 All races/ethnicities Both sexes All age groups All transmission categories 10151 0 0.0
2778 Gonorrhea Zapata County TX 48505 2014 All races/ethnicities Both sexes All age groups All transmission categories 14390 1 6.9
2779 Gonorrhea Zavala County TX 48507 2014 All races/ethnicities Both sexes All age groups All transmission categories 12156 5 41.1
2430 Gonorrhea Ziebach County SD 46137 2014 All races/ethnicities Both sexes All age groups All transmission categories 2834 11 388.1

3228 rows × 12 columns


In [26]:
df['Area'].value_counts()


Out[26]:
Washington County            30
Jefferson County             25
Franklin County              24
Jackson County               23
Lincoln County               23
Madison County               19
Montgomery County            18
Clay County                  18
Marion County                17
Monroe County                17
Union County                 17
Wayne County                 16
Grant County                 14
Warren County                14
Greene County                14
Carroll County               13
Clark County                 12
Marshall County              12
Lee County                   12
Polk County                  12
Lake County                  12
Johnson County               12
Adams County                 12
Douglas County               12
Morgan County                11
Scott County                 11
Calhoun County               11
Crawford County              11
Fayette County               11
Lawrence County              11
                             ..
Dearborn County               1
Harnett County                1
Gentry County                 1
Prince Edward County          1
Petersburg Census Area        1
Loving County                 1
East Carroll Parish           1
Ouachita County               1
Alger County                  1
Pickaway County               1
Rockland County               1
Torrance County               1
Union Parish                  1
Kimball County                1
Vanderburgh County            1
Aleutians East Borough        1
Pima County                   1
Clarion County                1
Wrangell City and Borough     1
Reynolds County               1
La Paz County                 1
Kosciusko County              1
Natrona County                1
Bremer County                 1
Galveston County              1
Refugio County                1
Estill County                 1
Nome Census Area              1
Charlevoix County             1
Schleicher County             1
Name: Area, dtype: int64

In [27]:
df.isnull().values.any()


Out[27]:
True

In [28]:
null_list = df["Population"].isnull()
not_null_list = [not i for i in null_list]
df_clean = df[not_null_list].copy()

In [29]:
null_list = df_clean["Rate"].isnull()
not_null_list = [not i for i in null_list]
df_completely_clean = df_clean[not_null_list].copy()

In [30]:
df_completely_clean["Rate"].isnull().values.any()


Out[30]:
False

Model the Gonorrhea rate


In [31]:
df_merged = pd.read_csv("../data/chlamydia_cdc_census.csv")

In [32]:
df_completely_clean[df_completely_clean["FIPS"]==1001].Cases[0]


Out[32]:
48.0

Replace the number of Chlamydia cases with the number of Gonorrhea cases


In [33]:
#print(df_merged[df_merged["FIPS"]==1001].Cases[0])
#df_merged.set_value(1, "FIPS", 10)
for county in df_merged["FIPS"]:
    rowlist = df_merged[df_merged['FIPS'] == county].index.tolist()
    gonorrhea_cases = df_completely_clean[df_completely_clean["FIPS"] == county].Cases.tolist()
    df_merged.set_value(rowlist[0], 'Cases', gonorrhea_cases[0])
#    print(df_merged["FIPS"][rowlist[0]])
    #df_merged[df_merged["FIPS"]==county]

In [34]:
df_merged.head()


Out[34]:
FIPS Population Cases hd01s001 hd02s002 hd02s005 hd02s006 hd02s007 hd02s008 hd02s009 hd02s010 hd02s011 hd02s013 hd02s015 hd01s020 hd02s026 hd02s051 hd02s078 hd02s079 hd02s080 hd02s081 hd02s089 hd02s095 hd02s107 hd02s131 hd02s132 hd02s133 hd02s134 hd02s135 hd02s136 hd02s143 hd02s151 hd02s152 hd02s153 hd02s154 hd02s159 hd01s167 hd01s168 hd02s181 hd02s184 hd01vd01 d002 d014 d019 d024 d029 lnd110210d
0 1001 55246 48 4.736962 21.8 7.9 5.6 5.8 6.1 7.6 7.5 15.0 10.7 12.0 37.0 48.7 51.3 78.5 17.7 0.4 0.9 0.1 1.6 2.4 99.2 37.1 20.8 31.8 23.6 6.1 0.8 74.5 34.9 56.2 25.3 25.5 2.68 3.13 75.4 24.6 52475 0.562138 0.003017 0.020029 0.002868 0.017704 92.781808
1 1003 195540 153 5.260703 19.0 6.4 5.2 5.6 5.9 6.3 6.6 14.8 13.5 16.9 41.1 48.9 51.1 85.7 9.4 0.7 0.7 0.0 1.5 4.4 98.7 40.2 21.9 26.8 20.1 5.6 1.3 69.9 28.0 54.5 19.9 30.1 2.46 2.93 72.5 27.5 50183 0.545409 0.002747 0.023886 0.003444 0.020292 122.920831
2 1005 27076 52 4.438653 18.0 6.3 6.5 7.3 6.6 6.6 6.6 14.7 13.2 14.3 39.0 53.1 46.9 48.0 46.9 0.4 0.4 0.1 0.9 5.1 88.4 35.8 15.6 25.7 17.7 7.8 11.6 68.4 27.4 43.7 14.4 31.6 2.47 3.01 66.8 33.2 35634 0.437169 0.002342 0.019348 0.003666 0.022200 30.563959
3 1007 22512 22 4.360120 18.4 6.7 6.5 7.0 7.2 7.6 7.1 14.8 11.9 12.6 37.8 53.7 46.3 75.8 22.0 0.3 0.1 0.1 0.9 1.8 90.3 34.7 18.2 26.8 18.7 7.5 9.7 72.3 29.5 52.5 20.1 27.7 2.60 3.09 75.6 24.4 37984 0.524582 0.001886 0.020244 0.002012 0.020370 36.101222
4 1009 57872 6 4.758321 20.2 7.0 5.4 6.0 6.0 6.8 7.0 14.1 12.6 14.6 39.0 49.5 50.5 92.6 1.3 0.5 0.2 0.1 1.2 8.1 99.1 37.6 22.8 29.2 21.3 6.4 0.9 75.0 31.1 60.6 24.1 25.0 2.63 3.07 80.6 19.4 44409 0.606034 0.001946 0.017981 0.003707 0.013440 89.615659

In [35]:
df_merged.describe()


Out[35]:
FIPS Population Cases hd01s001 hd02s002 hd02s005 hd02s006 hd02s007 hd02s008 hd02s009 hd02s010 hd02s011 hd02s013 hd02s015 hd01s020 hd02s026 hd02s051 hd02s078 hd02s079 hd02s080 hd02s081 hd02s089 hd02s095 hd02s107 hd02s131 hd02s132 hd02s133 hd02s134 hd02s135 hd02s136 hd02s143 hd02s151 hd02s152 hd02s153 hd02s154 hd02s159 hd01s167 hd01s168 hd02s181 hd02s184 hd01vd01 d002 d014 d019 d024 d029 lnd110210d
count 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000
mean 30492.802614 100423.684967 103.278758 4.460522 19.163007 6.908072 5.977222 5.819314 5.695294 5.899739 6.297484 15.002190 13.250327 15.985556 40.491667 50.003856 49.996144 84.076340 8.121307 1.550719 1.165033 0.082418 1.966765 8.417353 96.604379 39.254118 20.403791 27.047647 20.409281 5.134248 3.395654 67.828627 27.814542 52.057712 19.173268 32.171438 2.476042 2.988451 72.576307 27.423791 46800.265359 0.520577 0.002492 0.030613 0.003221 0.024522 239.640924
std 15071.256920 323873.098080 492.184847 0.634640 2.846254 1.135197 2.546279 1.223101 0.950620 0.878120 0.830619 1.506377 2.129781 4.166765 4.959052 2.204346 2.204346 15.045667 13.038226 5.071524 2.533382 0.967483 1.549707 13.303866 4.451890 3.484761 2.711350 3.484574 3.001770 2.163983 4.451875 4.990887 4.766401 5.948292 4.289568 4.990872 0.203247 0.181387 7.350396 7.350283 12025.134705 0.059487 0.001291 0.007625 0.001295 0.007228 1610.657866
min 1001.000000 90.000000 0.000000 1.913814 0.000000 0.000000 1.300000 2.300000 2.400000 1.200000 2.800000 6.300000 4.000000 3.500000 22.600000 43.200000 27.900000 14.200000 0.000000 0.000000 0.000000 0.000000 0.100000 0.000000 45.000000 17.600000 7.500000 4.400000 0.000000 0.700000 0.000000 18.800000 0.000000 11.600000 0.000000 13.400000 1.260000 2.000000 1.400000 10.200000 19146.000000 0.115942 0.000000 0.000000 0.000000 0.000000 0.069674
25% 19006.500000 11208.000000 2.000000 4.051722 17.400000 6.300000 4.700000 5.100000 5.100000 5.400000 5.800000 14.200000 12.000000 13.300000 37.700000 48.900000 49.600000 76.575000 0.500000 0.200000 0.300000 0.000000 1.100000 1.600000 96.300000 37.400000 18.800000 25.000000 18.600000 3.500000 1.100000 65.200000 25.000000 48.800000 16.500000 29.300000 2.350000 2.880000 69.200000 22.600000 38810.000000 0.488003 0.001797 0.025435 0.002469 0.020005 17.078483
50% 29188.000000 25987.000000 10.000000 4.414815 19.100000 6.800000 5.500000 5.600000 5.600000 5.900000 6.300000 15.100000 13.100000 15.700000 40.500000 49.500000 50.500000 89.450000 1.900000 0.400000 0.500000 0.000000 1.600000 3.300000 98.200000 39.500000 20.700000 27.100000 20.400000 4.900000 1.800000 68.100000 27.450000 52.500000 18.700000 31.900000 2.450000 2.970000 73.800000 26.200000 45009.500000 0.524803 0.002367 0.030222 0.003160 0.024365 45.094109
75% 45085.500000 67582.250000 43.000000 4.825413 20.700000 7.300000 6.500000 6.400000 6.200000 6.400000 6.800000 15.800000 14.300000 18.300000 43.400000 50.400000 51.100000 95.625000 9.400000 0.800000 1.000000 0.100000 2.300000 8.400000 98.900000 41.300000 22.225000 29.000000 22.000000 6.500000 3.700000 70.700000 30.100000 55.700000 21.000000 34.800000 2.570000 3.070000 77.400000 30.800000 52013.500000 0.556712 0.002993 0.035313 0.003841 0.028793 114.277352
max 56045.000000 10017068.000000 15316.000000 6.992050 33.600000 18.300000 28.100000 16.100000 11.700000 9.700000 11.900000 24.500000 28.100000 43.400000 62.700000 72.100000 56.800000 99.200000 84.400000 75.500000 43.900000 48.900000 29.500000 95.700000 100.000000 76.700000 28.800000 42.800000 34.400000 16.000000 55.000000 86.600000 50.300000 79.200000 41.700000 81.200000 3.680000 4.050000 89.800000 98.600000 123966.000000 0.792199 0.022064 0.080335 0.017631 0.077751 68239.991607

In [36]:
df_merged.shape


Out[36]:
(3060, 47)

In [37]:
plt.scatter(np.log10(df_merged["Population"]), df_merged["Cases"]/df_merged["Population"])


Out[37]:
<matplotlib.collections.PathCollection at 0x10effb048>
/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [38]:
plt.scatter(df_merged["FIPS"], df_merged["Cases"]/df_merged["Population"])


Out[38]:
<matplotlib.collections.PathCollection at 0x10f053eb8>
/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Plot the correlation matrix


In [39]:
df_all = df_merged.copy()
df_all["Rate"] = df_all["Cases"]/df_all["Population"]
corr = df_all.corr()

In [40]:
pearsonr = corr["Rate"]
pearsonr


Out[40]:
FIPS         -0.074673
Population    0.200290
Cases         0.299517
hd01s001      0.326293
hd02s002      0.177230
hd02s005      0.180993
hd02s006      0.261563
hd02s007      0.389383
hd02s008      0.305542
hd02s009      0.185169
hd02s010      0.048447
hd02s011     -0.264997
hd02s013     -0.303107
hd02s015     -0.312399
hd01s020     -0.366802
hd02s026     -0.156963
hd02s051      0.156963
hd02s078     -0.677478
hd02s079      0.638579
hd02s080      0.142858
hd02s081      0.137539
hd02s089     -0.004756
hd02s095      0.167221
hd02s107      0.086671
hd02s131     -0.083829
hd02s132     -0.214553
hd02s133     -0.583597
hd02s134      0.172910
hd02s135     -0.016189
hd02s136      0.539012
hd02s143      0.083823
hd02s151     -0.037727
hd02s152      0.143635
hd02s153     -0.523363
hd02s154     -0.166232
hd02s159      0.037720
hd01s167      0.179159
hd01s168      0.279396
hd02s181     -0.419810
hd02s184      0.419816
hd01vd01     -0.149611
d002         -0.523460
d014          0.176792
d019         -0.130580
d024          0.159114
d029          0.258884
lnd110210d    0.144497
Rate          1.000000
Name: Rate, dtype: float64

In [41]:
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

In [42]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=2, yticklabels=2,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

plt.savefig('../graphics/cross-correlation_gonorrhea.png', bbox_inches='tight', dpi=150)


/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Make principal component analysis


In [43]:
from sklearn.decomposition import PCA

In [44]:
df_merged.describe()


Out[44]:
FIPS Population Cases hd01s001 hd02s002 hd02s005 hd02s006 hd02s007 hd02s008 hd02s009 hd02s010 hd02s011 hd02s013 hd02s015 hd01s020 hd02s026 hd02s051 hd02s078 hd02s079 hd02s080 hd02s081 hd02s089 hd02s095 hd02s107 hd02s131 hd02s132 hd02s133 hd02s134 hd02s135 hd02s136 hd02s143 hd02s151 hd02s152 hd02s153 hd02s154 hd02s159 hd01s167 hd01s168 hd02s181 hd02s184 hd01vd01 d002 d014 d019 d024 d029 lnd110210d
count 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000 3060.000000
mean 30492.802614 100423.684967 103.278758 4.460522 19.163007 6.908072 5.977222 5.819314 5.695294 5.899739 6.297484 15.002190 13.250327 15.985556 40.491667 50.003856 49.996144 84.076340 8.121307 1.550719 1.165033 0.082418 1.966765 8.417353 96.604379 39.254118 20.403791 27.047647 20.409281 5.134248 3.395654 67.828627 27.814542 52.057712 19.173268 32.171438 2.476042 2.988451 72.576307 27.423791 46800.265359 0.520577 0.002492 0.030613 0.003221 0.024522 239.640924
std 15071.256920 323873.098080 492.184847 0.634640 2.846254 1.135197 2.546279 1.223101 0.950620 0.878120 0.830619 1.506377 2.129781 4.166765 4.959052 2.204346 2.204346 15.045667 13.038226 5.071524 2.533382 0.967483 1.549707 13.303866 4.451890 3.484761 2.711350 3.484574 3.001770 2.163983 4.451875 4.990887 4.766401 5.948292 4.289568 4.990872 0.203247 0.181387 7.350396 7.350283 12025.134705 0.059487 0.001291 0.007625 0.001295 0.007228 1610.657866
min 1001.000000 90.000000 0.000000 1.913814 0.000000 0.000000 1.300000 2.300000 2.400000 1.200000 2.800000 6.300000 4.000000 3.500000 22.600000 43.200000 27.900000 14.200000 0.000000 0.000000 0.000000 0.000000 0.100000 0.000000 45.000000 17.600000 7.500000 4.400000 0.000000 0.700000 0.000000 18.800000 0.000000 11.600000 0.000000 13.400000 1.260000 2.000000 1.400000 10.200000 19146.000000 0.115942 0.000000 0.000000 0.000000 0.000000 0.069674
25% 19006.500000 11208.000000 2.000000 4.051722 17.400000 6.300000 4.700000 5.100000 5.100000 5.400000 5.800000 14.200000 12.000000 13.300000 37.700000 48.900000 49.600000 76.575000 0.500000 0.200000 0.300000 0.000000 1.100000 1.600000 96.300000 37.400000 18.800000 25.000000 18.600000 3.500000 1.100000 65.200000 25.000000 48.800000 16.500000 29.300000 2.350000 2.880000 69.200000 22.600000 38810.000000 0.488003 0.001797 0.025435 0.002469 0.020005 17.078483
50% 29188.000000 25987.000000 10.000000 4.414815 19.100000 6.800000 5.500000 5.600000 5.600000 5.900000 6.300000 15.100000 13.100000 15.700000 40.500000 49.500000 50.500000 89.450000 1.900000 0.400000 0.500000 0.000000 1.600000 3.300000 98.200000 39.500000 20.700000 27.100000 20.400000 4.900000 1.800000 68.100000 27.450000 52.500000 18.700000 31.900000 2.450000 2.970000 73.800000 26.200000 45009.500000 0.524803 0.002367 0.030222 0.003160 0.024365 45.094109
75% 45085.500000 67582.250000 43.000000 4.825413 20.700000 7.300000 6.500000 6.400000 6.200000 6.400000 6.800000 15.800000 14.300000 18.300000 43.400000 50.400000 51.100000 95.625000 9.400000 0.800000 1.000000 0.100000 2.300000 8.400000 98.900000 41.300000 22.225000 29.000000 22.000000 6.500000 3.700000 70.700000 30.100000 55.700000 21.000000 34.800000 2.570000 3.070000 77.400000 30.800000 52013.500000 0.556712 0.002993 0.035313 0.003841 0.028793 114.277352
max 56045.000000 10017068.000000 15316.000000 6.992050 33.600000 18.300000 28.100000 16.100000 11.700000 9.700000 11.900000 24.500000 28.100000 43.400000 62.700000 72.100000 56.800000 99.200000 84.400000 75.500000 43.900000 48.900000 29.500000 95.700000 100.000000 76.700000 28.800000 42.800000 34.400000 16.000000 55.000000 86.600000 50.300000 79.200000 41.700000 81.200000 3.680000 4.050000 89.800000 98.600000 123966.000000 0.792199 0.022064 0.080335 0.017631 0.077751 68239.991607

In [45]:
df_new = df_merged.drop(["FIPS","Cases", "d002", "hd02s051", "hd02s143", "hd02s159", "hd02s184", "hd01s001"], axis=1)
X = df_new.values

columns = X.shape[1]
for column in np.arange(columns):
    mean_temp = X[:,column].mean()
    std_temp = X[:,column].std()
    X[:,column] = (X[:,column]-mean_temp)/std_temp

plt.plot((np.std(X, axis=0)))


Out[45]:
[<matplotlib.lines.Line2D at 0x1151f4518>]

In [46]:
X.shape


Out[46]:
(3060, 39)

In [47]:
pca = PCA(n_components=39)
pca.fit(X)
pca.explained_variance_ratio_


Out[47]:
array([  2.94525564e-01,   1.64308375e-01,   7.98070500e-02,
         6.79178134e-02,   6.47768303e-02,   4.95135898e-02,
         4.30534407e-02,   3.07112910e-02,   2.94860465e-02,
         2.71764298e-02,   2.12387110e-02,   1.65600184e-02,
         1.57898743e-02,   1.35634732e-02,   1.22893006e-02,
         1.20671080e-02,   8.77515194e-03,   8.08329827e-03,
         6.89497673e-03,   5.86716605e-03,   4.85048055e-03,
         4.22251140e-03,   3.79732597e-03,   3.26146886e-03,
         2.58523806e-03,   2.24116473e-03,   1.79765723e-03,
         1.48926992e-03,   1.15747422e-03,   9.54360190e-04,
         3.18175827e-04,   2.56011170e-04,   1.86629727e-04,
         1.76638055e-04,   1.58593900e-04,   7.52418315e-05,
         4.39366076e-05,   1.35256555e-05,   8.78706740e-06])

In [48]:
plt.scatter(pca.components_[:,0], pca.components_[:,1])


Out[48]:
<matplotlib.collections.PathCollection at 0x115e925f8>
/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [49]:
X_trans = pca.transform(X)
Y = df_merged["Cases"]/df_merged["Population"]
plt.scatter(X_trans[:,0],Y)


Out[49]:
<matplotlib.collections.PathCollection at 0x115ef6630>
/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Linear regression


In [50]:
from sklearn import linear_model

In [51]:
df_new["hd02s002"].hist()


Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x115227fd0>

In [52]:
df_new.head()


Out[52]:
Population hd02s002 hd02s005 hd02s006 hd02s007 hd02s008 hd02s009 hd02s010 hd02s011 hd02s013 hd02s015 hd01s020 hd02s026 hd02s078 hd02s079 hd02s080 hd02s081 hd02s089 hd02s095 hd02s107 hd02s131 hd02s132 hd02s133 hd02s134 hd02s135 hd02s136 hd02s151 hd02s152 hd02s153 hd02s154 hd01s167 hd01s168 hd02s181 hd01vd01 d014 d019 d024 d029 lnd110210d
0 55246 21.8 7.9 5.6 5.8 6.1 7.6 7.5 15.0 10.7 12.0 37.0 48.7 78.5 17.7 0.4 0.9 0.1 1.6 2.4 99.2 37.1 20.8 31.8 23.6 6.1 74.5 34.9 56.2 25.3 2.68 3.13 75.4 52475 0.003017 0.020029 0.002868 0.017704 92.781808
1 195540 19.0 6.4 5.2 5.6 5.9 6.3 6.6 14.8 13.5 16.9 41.1 48.9 85.7 9.4 0.7 0.7 0.0 1.5 4.4 98.7 40.2 21.9 26.8 20.1 5.6 69.9 28.0 54.5 19.9 2.46 2.93 72.5 50183 0.002747 0.023886 0.003444 0.020292 122.920831
2 27076 18.0 6.3 6.5 7.3 6.6 6.6 6.6 14.7 13.2 14.3 39.0 53.1 48.0 46.9 0.4 0.4 0.1 0.9 5.1 88.4 35.8 15.6 25.7 17.7 7.8 68.4 27.4 43.7 14.4 2.47 3.01 66.8 35634 0.002342 0.019348 0.003666 0.022200 30.563959
3 22512 18.4 6.7 6.5 7.0 7.2 7.6 7.1 14.8 11.9 12.6 37.8 53.7 75.8 22.0 0.3 0.1 0.1 0.9 1.8 90.3 34.7 18.2 26.8 18.7 7.5 72.3 29.5 52.5 20.1 2.60 3.09 75.6 37984 0.001886 0.020244 0.002012 0.020370 36.101222
4 57872 20.2 7.0 5.4 6.0 6.0 6.8 7.0 14.1 12.6 14.6 39.0 49.5 92.6 1.3 0.5 0.2 0.1 1.2 8.1 99.1 37.6 22.8 29.2 21.3 6.4 75.0 31.1 60.6 24.1 2.63 3.07 80.6 44409 0.001946 0.017981 0.003707 0.013440 89.615659

In [53]:
df_merged.head()


Out[53]:
FIPS Population Cases hd01s001 hd02s002 hd02s005 hd02s006 hd02s007 hd02s008 hd02s009 hd02s010 hd02s011 hd02s013 hd02s015 hd01s020 hd02s026 hd02s051 hd02s078 hd02s079 hd02s080 hd02s081 hd02s089 hd02s095 hd02s107 hd02s131 hd02s132 hd02s133 hd02s134 hd02s135 hd02s136 hd02s143 hd02s151 hd02s152 hd02s153 hd02s154 hd02s159 hd01s167 hd01s168 hd02s181 hd02s184 hd01vd01 d002 d014 d019 d024 d029 lnd110210d
0 1001 55246 48 4.736962 21.8 7.9 5.6 5.8 6.1 7.6 7.5 15.0 10.7 12.0 37.0 48.7 51.3 78.5 17.7 0.4 0.9 0.1 1.6 2.4 99.2 37.1 20.8 31.8 23.6 6.1 0.8 74.5 34.9 56.2 25.3 25.5 2.68 3.13 75.4 24.6 52475 0.562138 0.003017 0.020029 0.002868 0.017704 92.781808
1 1003 195540 153 5.260703 19.0 6.4 5.2 5.6 5.9 6.3 6.6 14.8 13.5 16.9 41.1 48.9 51.1 85.7 9.4 0.7 0.7 0.0 1.5 4.4 98.7 40.2 21.9 26.8 20.1 5.6 1.3 69.9 28.0 54.5 19.9 30.1 2.46 2.93 72.5 27.5 50183 0.545409 0.002747 0.023886 0.003444 0.020292 122.920831
2 1005 27076 52 4.438653 18.0 6.3 6.5 7.3 6.6 6.6 6.6 14.7 13.2 14.3 39.0 53.1 46.9 48.0 46.9 0.4 0.4 0.1 0.9 5.1 88.4 35.8 15.6 25.7 17.7 7.8 11.6 68.4 27.4 43.7 14.4 31.6 2.47 3.01 66.8 33.2 35634 0.437169 0.002342 0.019348 0.003666 0.022200 30.563959
3 1007 22512 22 4.360120 18.4 6.7 6.5 7.0 7.2 7.6 7.1 14.8 11.9 12.6 37.8 53.7 46.3 75.8 22.0 0.3 0.1 0.1 0.9 1.8 90.3 34.7 18.2 26.8 18.7 7.5 9.7 72.3 29.5 52.5 20.1 27.7 2.60 3.09 75.6 24.4 37984 0.524582 0.001886 0.020244 0.002012 0.020370 36.101222
4 1009 57872 6 4.758321 20.2 7.0 5.4 6.0 6.0 6.8 7.0 14.1 12.6 14.6 39.0 49.5 50.5 92.6 1.3 0.5 0.2 0.1 1.2 8.1 99.1 37.6 22.8 29.2 21.3 6.4 0.9 75.0 31.1 60.6 24.1 25.0 2.63 3.07 80.6 19.4 44409 0.606034 0.001946 0.017981 0.003707 0.013440 89.615659

Split data set into training/test and validation data


In [54]:
cutoff = 1

X = df_new[df_merged["Cases"]>cutoff].values
Y = df_merged[df_merged["Cases"]>cutoff].Cases/(df_merged[df_merged["Cases"]>cutoff].Population+1.0)

X_full = df_new.values
Y_full = df_merged.Cases/(df_merged.Population+1.0)

#normalize all columns to the same normalization
columns = X.shape[1]
means = np.zeros(columns)
stds = np.zeros(columns)
for column in np.arange(columns):
    mean_temp = X[:,column].mean()
    std_temp = X[:,column].std()
    means[column] = mean_temp
    stds[column] = std_temp
    X[:,column] = (X[:,column]-mean_temp)/std_temp
    X_full[:,column] = (X_full[:,column]-mean_temp)/std_temp

Ymean = Y_full.mean()
Ystd = Y_full.std()

Y = (Y-Ymean)/Ystd
Y_full = (Y_full-Ymean)/Ystd
    
ones = np.ones(round(0.75*len(X)), dtype=bool)
zeros = np.zeros(len(X)-round(0.75*len(X)), dtype=bool)
training_list = np.hstack((ones, zeros))
np.random.shuffle(training_list)
test_list = np.zeros(len(X),dtype=bool)
test_list = np.array([not i for i in training_list])

X_train = X[training_list]
X_test = X[test_list]
Y_train = Y[training_list]
Y_test = Y[test_list]

X.shape, X_train.shape, X_test.shape, Y.shape, Y_train.shape, Y_test.shape


Out[54]:
((2400, 39), (1800, 39), (600, 39), (2400,), (1800,), (600,))

In [55]:
X.shape, Y.shape


Out[55]:
((2400, 39), (2400,))

In [56]:
Y_test.describe()


Out[56]:
count    600.000000
mean       0.183267
std        1.027183
min       -0.852713
25%       -0.517351
50%       -0.207258
75%        0.565745
max        6.344650
dtype: float64

In [57]:
#X_weights = df_merged.values
#X_train_weights = X_weights[training_list]
weights = 1 #X_train_weights[:,2]
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train, sample_weight=weights)


Out[57]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [58]:
print(regr.coef_)


[ 0.06999576 -0.28355523 -0.23556967 -0.36628503 -0.20079447 -0.10982781
 -0.17001598 -0.14272377 -0.13889428 -0.38876025 -0.57277059 -0.35333905
 -0.09109317 -0.03240245  0.23917996  0.03822984  0.05814973 -0.07549794
  0.06339112  0.00536592  0.1823018   0.63270391 -0.78847206 -0.85551799
  0.26370773 -0.20390549  1.11361052  0.56902164  0.38174778 -1.08045281
 -1.05423933  1.08499758 -0.13462548  0.18570043  0.04571197 -0.09023666
 -0.01984907 -0.03387758 -0.08901632]

In [59]:
1.0-np.sum((regr.predict(X_test)-Y_test)**2)/np.sum((Y_test-np.mean(Y_test))**2)


Out[59]:
0.57055915646598554

In [60]:
plt.scatter(Y,regr.predict(X))
plt.plot(np.linspace(Y.min(),Y.max(),num=10),np.linspace(Y.min(),Y.max(),num=10),color='red')


Out[60]:
[<matplotlib.lines.Line2D at 0x10f142588>]
/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [61]:
print('Variance score: %.5f\t(%.5f)' % (regr.score(X_test, Y_test), regr.score(X_full, Y_full)))


Variance score: 0.57056	(0.59701)

In [62]:
from sklearn import cross_validation
cv = cross_validation.ShuffleSplit(len(Y_train), n_iter=5, test_size=0.2, random_state=0)
scores_regression = cross_validation.cross_val_score(regr, X_train, Y_train, cv=cv)
scores_regression
#cross_val_score(regr, X_train, Y_train, cv=6, n_jobs=1)
#scores


Out[62]:
array([ 0.51001046,  0.632968  ,  0.6034146 ,  0.60008855,  0.55867592])

In [63]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_regression.mean(), scores_regression.std() * 2))


Accuracy: 0.58 (+/- 0.09)

In [64]:
from sklearn.metrics import explained_variance_score

In [65]:
explained_variance_score(Y_train, regr.predict(X_train))


Out[65]:
0.6014318615994938

In [66]:
from sklearn.metrics import mean_absolute_error

In [67]:
mean_absolute_error(Y_train, regr.predict(X_train))


Out[67]:
0.44396426675554196

In [68]:
from sklearn.metrics import mean_squared_error

In [69]:
mean_squared_error(Y_train, regr.predict(X_train))


Out[69]:
0.41339284741867016

In [70]:
from sklearn.metrics import median_absolute_error

In [71]:
median_absolute_error(Y_train, regr.predict(X_train))


Out[71]:
0.31700773322247322

In [72]:
from sklearn.metrics import r2_score

In [73]:
r2_score(Y_train, regr.predict(X_train))


Out[73]:
0.6014318615994938

Polynomial regression


In [74]:
from sklearn.preprocessing import PolynomialFeatures

In [75]:
poly = PolynomialFeatures(degree=2)

In [76]:
X_train_poly = poly.fit_transform(X_train)

In [77]:
poly_regr = linear_model.LinearRegression(fit_intercept=False)
poly_regr.fit(X_train_poly, Y_train)


Out[77]:
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [78]:
plt.scatter(Y,poly_regr.predict(poly.fit_transform(X)))
plt.plot(np.linspace(Y_train.min(),Y_train.max(),num=10),np.linspace(Y_train.min(),Y_train.max(),num=10),color='red')


Out[78]:
[<matplotlib.lines.Line2D at 0x10f364828>]
/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [79]:
print('Variance score: %.5f\t(%.5f)' % (poly_regr.score(poly.fit_transform(X_test), Y_test), poly_regr.score(poly.fit_transform(X), Y)))


Variance score: -99.89978	(-24.62157)

Ridge regression


In [80]:
from sklearn import linear_model

In [81]:
rregr = linear_model.Ridge(alpha=0.5)
rregr.fit(X_train, Y_train)


Out[81]:
Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [82]:
print('Variance score: %.5f\t(%.5f)' % (rregr.score(X_test, Y_test), rregr.score(X_full, Y_full)))


Variance score: 0.57282	(0.59722)

In [83]:
scores_rregression = cross_validation.cross_val_score(rregr, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_rregression.mean(), scores_rregression.std() * 2))


Accuracy: 0.58 (+/- 0.09)

Extra Trees Regression


In [84]:
from sklearn.ensemble import ExtraTreesRegressor

In [85]:
clf = ExtraTreesRegressor(n_estimators=250, bootstrap=True, oob_score=True, max_features='sqrt')
clf.fit(X_train, Y_train)


Out[85]:
ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=250, n_jobs=1, oob_score=True, random_state=None,
          verbose=0, warm_start=False)

In [86]:
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))


Variance score: 0.62724	(0.80759)
Out of bag error score: 0.60974

In [87]:
scores_etregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_etregression.mean(), scores_etregression.std() * 2))


Accuracy: 0.62 (+/- 0.06)

Ada Boost Regressor


In [88]:
from sklearn.ensemble import AdaBoostRegressor

In [89]:
clf = AdaBoostRegressor(n_estimators=500, learning_rate=0.01, loss='linear')
clf.fit(X_train, Y_train)


Out[89]:
AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='linear',
         n_estimators=500, random_state=None)

In [90]:
print('Variance score: %.5f\t(%.5f)' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full)))


Variance score: 0.53526	(0.57273)

In [91]:
scores_adaregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_adaregression.mean(), scores_adaregression.std() * 2))


Accuracy: 0.55 (+/- 0.06)

Bagging regressor


In [92]:
from sklearn.ensemble import BaggingRegressor

In [93]:
clf = BaggingRegressor(n_estimators=250, bootstrap=True, oob_score=True, max_features=20)
clf.fit(X_train, Y_train)


Out[93]:
BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=20, max_samples=1.0,
         n_estimators=250, n_jobs=1, oob_score=True, random_state=None,
         verbose=0, warm_start=False)

In [94]:
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))


Variance score: 0.65425	(0.76777)
Out of bag error score: 0.62089

In [95]:
scores_bagregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_bagregression.mean(), scores_bagregression.std() * 2))


Accuracy: 0.63 (+/- 0.07)

Gradient Boosting Regressor


In [96]:
from sklearn.ensemble import GradientBoostingRegressor

In [97]:
clf = GradientBoostingRegressor(n_estimators=250, max_features=10,max_depth=5)
clf.fit(X_train, Y_train)


Out[97]:
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=5, max_features=10, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=250,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [98]:
print('Variance score: %.5f\t(%.5f)' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full)))


Variance score: 0.66302	(0.76559)

In [99]:
plt.scatter(Y_full,clf.predict(X_full))
plt.plot(np.linspace(Y_full.min(),Y_full.max(),num=10),np.linspace(Y_full.min(),Y_full.max(),num=10),color='red')


Out[99]:
[<matplotlib.lines.Line2D at 0x1154f37b8>]
/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [100]:
scores_gradboostregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_gradboostregression.mean(), scores_gradboostregression.std() * 2))


Accuracy: 0.63 (+/- 0.10)

In [101]:
clf.fit(X_train, Y_train)


Out[101]:
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=5, max_features=10, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=250,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [102]:
import pickle

In [103]:
with open("../data/gradientboosting_params_gonorrhea.pickle", "wb") as myfile:
    pickle.dump(clf, myfile)

Random forest


In [104]:
from sklearn.ensemble import RandomForestRegressor

In [105]:
clf = RandomForestRegressor(n_estimators=250, oob_score=True, max_features='sqrt',min_samples_split=2, n_jobs=4)
clf.fit(X_train, Y_train)


Out[105]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=4, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [106]:
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))


Variance score: 0.65100	(0.77673)
Out of bag error score: 0.61952

In [107]:
scores_randomforest = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=4)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_randomforest.mean(), scores_randomforest.std() * 2))


Accuracy: 0.63 (+/- 0.08)

In [108]:
clf = RandomForestRegressor(n_estimators=250, oob_score=True, max_features='sqrt',min_samples_split=2, n_jobs=4)
clf.fit(X_train, Y_train)


Out[108]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=4, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [109]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

#ax.text(-1.5, 3, r'$R^2 = $%.2f'%(clf.oob_score_), fontsize=30)


ax.set_xlabel("CDC data", fontsize=20)
ax.set_ylabel("Model prediction", fontsize=20)

ax = plt.scatter(Y,clf.predict(X))
ax2 = plt.plot(np.linspace(Y.min(),Y.max(),10),np.linspace(Y.min(),Y.max(),10),color='red')

plt.savefig('../graphics/data_vs_model_gonorrhea.png', bbox_inches='tight', dpi=150)


/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [828]:
from collections import OrderedDict
RANDOM_STATE = 123
ensemble_clfs = [
    ("RandomForestRegressor, max_features='sqrt'",
        RandomForestRegressor(warm_start=False, oob_score=True,
                               max_features="sqrt",
                               random_state=RANDOM_STATE)),
    ("RandomForestRegressor, max_features='log2'",
        RandomForestRegressor(warm_start=False, max_features='log2',
                               oob_score=True,
                               random_state=RANDOM_STATE)),
    ("RandomForestRegressor, max_features=None",
        RandomForestRegressor(warm_start=False, max_features=None,
                               oob_score=True,
                               random_state=RANDOM_STATE))
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 10
max_estimators = 250

for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators,10):
        clf.set_params(n_estimators=i)
        clf.fit(X_train, Y_train)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB score")
plt.legend(loc="lower right")

plt.savefig('../graphics/features_estimators_gonorrhea.png', bbox_inches='tight', dpi=150)


/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/ensemble/forest.py:687: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/ensemble/forest.py:687: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/ensemble/forest.py:687: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "

In [111]:
feature_importance = (np.vstack((np.arange(len(clf.feature_importances_)), clf.feature_importances_)).T)
ranking = feature_importance[feature_importance[:,1].argsort()[::-1]]

In [112]:
for rank, importance in ranking: 
    print(df_new.columns[rank], importance)


hd02s078 0.131555315591
hd02s079 0.131042669253
hd02s153 0.077605262343
hd02s133 0.0724125131037
hd02s136 0.056609677503
hd02s026 0.0381425939706
hd02s181 0.0350510997217
d019 0.0235904527678
Population 0.0232923192023
lnd110210d 0.0196776197322
hd02s154 0.0195579171815
hd02s002 0.0183205296601
d029 0.0180897502352
hd02s006 0.0178635196094
hd02s011 0.017195152399
hd02s007 0.0161211203852
hd01vd01 0.0157915705716
hd02s095 0.0156136762701
hd01s020 0.015395242309
hd02s107 0.0153760048708
d024 0.0150625900875
d014 0.015013875519
hd02s134 0.0144646508373
hd02s132 0.0143730075973
hd02s151 0.0142209752962
hd01s168 0.0138617799277
hd02s135 0.012932051565
hd02s080 0.0126747567138
hd02s008 0.0123131135207
hd02s010 0.012283776033
hd02s131 0.0117112470565
hd01s167 0.0114938183702
hd02s152 0.0108361262187
hd02s015 0.0100120041693
hd02s009 0.00969465542139
hd02s013 0.00955429544546
hd02s005 0.00896322359447
hd02s081 0.00893532837002
hd02s089 0.00329471757626

In [113]:
labels_dict = {
    "Population": "Population",
    "hd01s001":  "log10(Population)",
    "hd02s002":  "0-14",
    "hd02s005":  "15-19",
    "hd02s006":  "20-24",
    "hd02s007":  "25-29",
    "hd02s008":  "30-34",
    "hd02s009":  "35-39",
    "hd02s010":  "40-44",
    "hd02s011":  "45-54",
    "hd02s013":  "55-64",
    "hd02s015":  "65+",
    "hd01s020":  "Median age",
    "hd02s026":  "Male percent",
    "hd02s051":  "Female percent",
    "hd02s078":  "White",
    "hd02s079":  "Black",
    "hd02s080":  "Native",
    "hd02s081":  "Asian",
    "hd02s089":  "Pacific/Hawaiian",
    "hd02s095":  "Two or more races",
    "hd02s107":  "Hispanic",
    "hd02s131":  "In households",
    "hd02s132":  "Householder",
    "hd02s133":  "Spouse",
    "hd02s134":  "Child",
    "hd02s135":  "Child w own child under 18",
    "hd02s136":  "Other relatives",
    "hd02s143":  "In group quarters",
    "hd02s151":  "Family households",
    "hd02s152":  "Family households w own child under 18",
    "hd02s153":  "Husband-wife family",
    "hd02s154":  "Husband-wife family w own child under 18",
    "hd02s159":  "Nonfamily households",
    "hd01s167":  "Average household size",
    "hd01s168":  "Average family size",
    "hd02s181":  "Owner occupied housing units",
    "hd02s184":  "Renter occupied housing units",
    "hd01vd01":  "Median income",
    "d001":      "Total households",
    "d002":      "Husband-wife households",
    "d014":      "Unmarried-partner households: - Male householder and male partner",
    "d019":      "Unmarried-partner households: - Male householder and female partner",
    "d024":      "Unmarried-partner households: - Female householder and female partner",
    "d029":      "Unmarried-partner households: - Female householder and male partner",
    "lnd110210d": "Population density"}

In [114]:
ranked_list = []
ranked_labels = []
for rank, importance in ranking: 
    print(labels_dict[df_new.columns[rank]], importance)
    ranked_list.append(importance)
    ranked_labels.append(labels_dict[df_new.columns[rank]])


White 0.131555315591
Black 0.131042669253
Husband-wife family 0.077605262343
Spouse 0.0724125131037
Other relatives 0.056609677503
Male percent 0.0381425939706
Owner occupied housing units 0.0350510997217
Unmarried-partner households: - Male householder and female partner 0.0235904527678
Population 0.0232923192023
Population density 0.0196776197322
Husband-wife family w own child under 18 0.0195579171815
0-14 0.0183205296601
Unmarried-partner households: - Female householder and male partner 0.0180897502352
20-24 0.0178635196094
45-54 0.017195152399
25-29 0.0161211203852
Median income 0.0157915705716
Two or more races 0.0156136762701
Median age 0.015395242309
Hispanic 0.0153760048708
Unmarried-partner households: - Female householder and female partner 0.0150625900875
Unmarried-partner households: - Male householder and male partner 0.015013875519
Child 0.0144646508373
Householder 0.0143730075973
Family households 0.0142209752962
Average family size 0.0138617799277
Child w own child under 18 0.012932051565
Native 0.0126747567138
30-34 0.0123131135207
40-44 0.012283776033
In households 0.0117112470565
Average household size 0.0114938183702
Family households w own child under 18 0.0108361262187
65+ 0.0100120041693
35-39 0.00969465542139
55-64 0.00955429544546
15-19 0.00896322359447
Asian 0.00893532837002
Pacific/Hawaiian 0.00329471757626

In [115]:
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

ax.set_xlabel("Feature importance", fontsize=20)
ax.set_ylabel("Feature", fontsize=20)

ax = sns.barplot(x=ranked_list, y=ranked_labels)
plt.savefig('../graphics/feature_importance_ranking_gonorrhea.png', bbox_inches='tight', dpi=150)



In [116]:
len(ranked_list)


Out[116]:
39

Save model parameters for use in web app:


In [110]:
import pickle

In [111]:
with open("../data/randomforest_params_gonorrhea.pickle", "wb") as myfile:
    pickle.dump(clf, myfile)

In [112]:
with open("../data/Ymean_gonorrhea.pickle", "wb") as myfile:
    pickle.dump(Ymean, myfile)

In [113]:
with open("../data/Ystd_gonorrhea.pickle", "wb") as myfile:
    pickle.dump(Ystd, myfile)

In [114]:
deployed_model = pickle.load(open('../data/randomforest_params_gonorrhea.pickle', "rb" ))

In [115]:
print('Variance score: %.5f\t(%.5f)' % (deployed_model.score(X_test, Y_test), deployed_model.score(X_full, Y_full)))


Variance score: 0.64313	(0.77777)

Suport Vector Regression


In [123]:
from sklearn.svm import SVR

In [124]:
svr_rbf = SVR(kernel='rbf', C=15, gamma=0.0001, epsilon=0.0005, tol=0.00001)
#svr_lin = SVR(kernel='linear', C=1, epsilon=0.001)
#svr_poly = SVR(kernel='poly', C=1, degree=2, epsilon=0.001)
svr_rbf.fit(X_train, Y_train)
#svr_lin.fit(X_train, Y_train)
#svr_poly.fit(X_train, Y_train)
#print('Variance score:\n\t%.5f\t(rbf)\n\t%.5f\t(lin)\n\t%.5f\t(poly)' % (svr_rbf.score(X_train, Y_train), svr_lin.score(X_train, Y_train),svr_poly.score(X_train, Y_train)))
print('Variance score:\n\t%.5f\t(rbf)' % (svr_rbf.score(X_test, Y_test)))


Variance score:
	0.49249	(rbf)

In [125]:
scores_svm = cross_validation.cross_val_score(svr_rbf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_svm.mean(), scores_svm.std() * 2))


Accuracy: 0.56 (+/- 0.05)

Model comparison


In [126]:
model_scores = []
model_errors = []
model_names = []
model_scores.append(np.mean(scores_regression))
model_errors.append(np.std(scores_regression)*2)
model_names.append("Linear")
model_scores.append(np.mean(scores_rregression))
model_errors.append(np.std(scores_rregression)*2)
model_names.append("Ridge")
model_scores.append(np.mean(scores_etregression))
model_errors.append(np.std(scores_etregression)*2)
model_names.append("Extra trees")
model_scores.append(np.mean(scores_adaregression))
model_errors.append(np.std(scores_adaregression)*2)
model_names.append("ADA boost")
model_scores.append(np.mean(scores_bagregression))
model_errors.append(np.std(scores_bagregression)*2)
model_names.append("Bagging")
model_scores.append(np.mean(scores_gradboostregression))
model_errors.append(np.std(scores_gradboostregression)*2)
model_names.append("Gradient boost")
model_scores.append(np.mean(scores_svm))
model_errors.append(np.std(scores_svm)*2)
model_names.append("Suport vector")
model_scores.append(np.mean(scores_randomforest))
model_errors.append(np.std(scores_randomforest)*2)
model_names.append("Random forest")

In [127]:
model_names, model_scores, model_errors


Out[127]:
(['Linear',
  'Ridge',
  'Extra trees',
  'ADA boost',
  'Bagging',
  'Gradient boost',
  'Suport vector',
  'Random forest'],
 [0.58456807197975214,
  0.58724151623728538,
  0.62247664219371257,
  0.55762532394194042,
  0.63789431535633934,
  0.64292067288243726,
  0.55546802562597519,
  0.63547396612249241],
 [0.020748703236739038,
  0.022872012731342393,
  0.046991141818972265,
  0.061103844940426887,
  0.048239678568851994,
  0.045742974864002715,
  0.045432117257869865,
  0.040558816555148106])

In [128]:
# Set up the matplotlib figure
sns.set_context("poster", font_scale=1)
f, ax = plt.subplots(figsize=(11, 9))

ax.set_xlabel("Model", fontsize=20)
ax.set_ylabel("Cross validation score", fontsize=20)

ax = sns.barplot(x=model_names, y=model_scores, yerr=model_errors)
plt.xticks(rotation=90)
plt.savefig('../graphics/model_performance_gonorrhea.png', bbox_inches='tight', dpi=150)


/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [ ]:


In [ ]: