STD & me

Analysis of 2014 data from the CDC on the prevalence of STD's in U.S. counties.



In [1]:

    
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
%matplotlib inline



In [2]:

    
# Always display all the columns
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 200) 

# Plain Seaborn figures with matplotlib color codes mapped to the default seaborn palette 
sns.set(style="white", color_codes=True)

CDC data on Gonorrhea



In [3]:

    
df = pd.read_csv("../data/cdc/gonorrhea.csv")



In [4]:

    
df.shape









    Out[4]:





(3228, 12)



In [5]:

    
df.columns









    Out[5]:





Index(['Disease', 'Area', 'State Abbreviation', 'FIPS', 'Year', 'Race', 'Sex', 'Age group', 'Transmission Category', 'Population', 'Cases', 'Rate'], dtype='object')



In [6]:

    
df.dtypes









    Out[6]:





Disease                  object
Area                     object
State Abbreviation       object
FIPS                      int64
Year                      int64
Race                     object
Sex                      object
Age group                object
Transmission Category    object
Population               object
Cases                    object
Rate                     object
dtype: object



In [7]:

    
df_test = df.convert_objects(convert_numeric=True)
df_test.dtypes









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':






    Out[7]:





Disease                   object
Area                      object
State Abbreviation        object
FIPS                       int64
Year                       int64
Race                      object
Sex                       object
Age group                 object
Transmission Category     object
Population               float64
Cases                    float64
Rate                     float64
dtype: object



In [8]:

    
df_test.head()









    Out[8]:






  
    
      
      Disease
      Area
      State Abbreviation
      FIPS
      Year
      Race
      Sex
      Age group
      Transmission Category
      Population
      Cases
      Rate
    
  
  
    
      0
      Gonorrhea
      Autauga County
      AL
      1001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      NaN
      48
      86.9
    
    
      1
      Gonorrhea
      Baldwin County
      AL
      1003
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      NaN
      153
      78.2
    
    
      2
      Gonorrhea
      Barbour County
      AL
      1005
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      NaN
      52
      192.1
    
    
      3
      Gonorrhea
      Bibb County
      AL
      1007
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      NaN
      22
      97.7
    
    
      4
      Gonorrhea
      Blount County
      AL
      1009
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      NaN
      6
      10.4



In [9]:

    
df['Population'] = df['Population'].str.replace(',','')
df['Cases'] = df['Cases'].str.replace(',','')



In [10]:

    
df = df.convert_objects(convert_numeric=True)
df.dtypes









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  if __name__ == '__main__':






    Out[10]:





Disease                   object
Area                      object
State Abbreviation        object
FIPS                       int64
Year                       int64
Race                      object
Sex                       object
Age group                 object
Transmission Category     object
Population               float64
Cases                    float64
Rate                     float64
dtype: object



In [11]:

    
df.head(77)









    Out[11]:






  
    
      
      Disease
      Area
      State Abbreviation
      FIPS
      Year
      Race
      Sex
      Age group
      Transmission Category
      Population
      Cases
      Rate
    
  
  
    
      0
      Gonorrhea
      Autauga County
      AL
      1001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      55246
      48
      86.9
    
    
      1
      Gonorrhea
      Baldwin County
      AL
      1003
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      195540
      153
      78.2
    
    
      2
      Gonorrhea
      Barbour County
      AL
      1005
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      27076
      52
      192.1
    
    
      3
      Gonorrhea
      Bibb County
      AL
      1007
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22512
      22
      97.7
    
    
      4
      Gonorrhea
      Blount County
      AL
      1009
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      57872
      6
      10.4
    
    
      5
      Gonorrhea
      Bullock County
      AL
      1011
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      10639
      27
      253.8
    
    
      6
      Gonorrhea
      Butler County
      AL
      1013
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      20265
      29
      143.1
    
    
      7
      Gonorrhea
      Calhoun County
      AL
      1015
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      116736
      229
      196.2
    
    
      8
      Gonorrhea
      Chambers County
      AL
      1017
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      34162
      73
      213.7
    
    
      9
      Gonorrhea
      Cherokee County
      AL
      1019
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      26203
      24
      91.6
    
    
      10
      Gonorrhea
      Chilton County
      AL
      1021
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      43951
      44
      100.1
    
    
      11
      Gonorrhea
      Choctaw County
      AL
      1023
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      13426
      8
      59.6
    
    
      12
      Gonorrhea
      Clarke County
      AL
      1025
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      25207
      28
      111.1
    
    
      13
      Gonorrhea
      Clay County
      AL
      1027
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      13486
      14
      103.8
    
    
      14
      Gonorrhea
      Cleburne County
      AL
      1029
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      14994
      6
      40.0
    
    
      15
      Gonorrhea
      Coffee County
      AL
      1031
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      50938
      64
      125.6
    
    
      16
      Gonorrhea
      Colbert County
      AL
      1033
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      54520
      62
      113.7
    
    
      17
      Gonorrhea
      Conecuh County
      AL
      1035
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      12887
      9
      69.8
    
    
      18
      Gonorrhea
      Coosa County
      AL
      1037
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      10898
      11
      100.9
    
    
      19
      Gonorrhea
      Covington County
      AL
      1039
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      37886
      25
      66.0
    
    
      20
      Gonorrhea
      Crenshaw County
      AL
      1041
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      13986
      18
      128.7
    
    
      21
      Gonorrhea
      Cullman County
      AL
      1043
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      80811
      21
      26.0
    
    
      22
      Gonorrhea
      Dale County
      AL
      1045
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      49884
      60
      120.3
    
    
      23
      Gonorrhea
      Dallas County
      AL
      1047
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      41996
      60
      142.9
    
    
      24
      Gonorrhea
      Dekalb County
      AL
      1049
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      71013
      43
      60.6
    
    
      25
      Gonorrhea
      Elmore County
      AL
      1051
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      80902
      97
      119.9
    
    
      26
      Gonorrhea
      Escambia County
      AL
      1053
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      37983
      52
      136.9
    
    
      27
      Gonorrhea
      Etowah County
      AL
      1055
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      103931
      185
      178.0
    
    
      28
      Gonorrhea
      Fayette County
      AL
      1057
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      16909
      5
      29.6
    
    
      29
      Gonorrhea
      Franklin County
      AL
      1059
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      31532
      8
      25.4
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      47
      Gonorrhea
      Marshall County
      AL
      1095
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      94760
      25
      26.4
    
    
      48
      Gonorrhea
      Mobile County
      AL
      1097
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      414079
      715
      172.7
    
    
      49
      Gonorrhea
      Monroe County
      AL
      1099
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22236
      11
      49.5
    
    
      50
      Gonorrhea
      Montgomery County
      AL
      1101
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      226659
      959
      423.1
    
    
      51
      Gonorrhea
      Morgan County
      AL
      1103
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      119787
      111
      92.7
    
    
      52
      Gonorrhea
      Perry County
      AL
      1105
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      10020
      13
      129.7
    
    
      53
      Gonorrhea
      Pickens County
      AL
      1107
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      19401
      27
      139.2
    
    
      54
      Gonorrhea
      Pike County
      AL
      1109
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      33339
      90
      270.0
    
    
      55
      Gonorrhea
      Randolph County
      AL
      1111
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22727
      11
      48.4
    
    
      56
      Gonorrhea
      Russell County
      AL
      1113
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      59585
      119
      199.7
    
    
      57
      Gonorrhea
      St. Clair County
      AL
      1115
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      86308
      54
      62.6
    
    
      58
      Gonorrhea
      Shelby County
      AL
      1117
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      204180
      94
      46.0
    
    
      59
      Gonorrhea
      Sumter County
      AL
      1119
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      13361
      22
      164.7
    
    
      60
      Gonorrhea
      Talladega County
      AL
      1121
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      81096
      212
      261.4
    
    
      61
      Gonorrhea
      Tallapoosa County
      AL
      1123
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      41203
      54
      131.1
    
    
      62
      Gonorrhea
      Tuscaloosa County
      AL
      1125
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      200821
      302
      150.4
    
    
      63
      Gonorrhea
      Walker County
      AL
      1127
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      65998
      77
      116.7
    
    
      64
      Gonorrhea
      Washington County
      AL
      1129
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      16877
      18
      106.7
    
    
      65
      Gonorrhea
      Wilcox County
      AL
      1131
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      11307
      14
      123.8
    
    
      66
      Gonorrhea
      Winston County
      AL
      1133
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      24146
      2
      8.3
    
    
      67
      Gonorrhea
      Aleutians East Borough
      AK
      2013
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      3092
      4
      129.4
    
    
      68
      Gonorrhea
      Aleutians West Census Area
      AK
      2016
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      5511
      1
      18.1
    
    
      69
      Gonorrhea
      Anchorage Municipality
      AK
      2020
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      300950
      782
      259.8
    
    
      70
      Gonorrhea
      Bethel Census Area
      AK
      2050
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      17758
      79
      444.9
    
    
      71
      Gonorrhea
      Bristol Bay Borough
      AK
      2060
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      960
      1
      104.2
    
    
      72
      Gonorrhea
      Denali Borough
      AK
      2068
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      1867
      0
      0.0
    
    
      73
      Gonorrhea
      Dillingham Census Area
      AK
      2070
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      5010
      23
      459.1
    
    
      74
      Gonorrhea
      Fairbanks North Star Borough
      AK
      2090
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      100436
      84
      83.6
    
    
      75
      Gonorrhea
      Haines Borough
      AK
      2100
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      2592
      0
      0.0
    
    
      76
      Gonorrhea
      Hoonah-Angoon Census Area
      AK
      2105
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      NaN
      NaN
      NaN
    
  

77 rows × 12 columns



In [12]:

    
df['Population'].describe()









    Out[12]:





count        3220.000000
mean        99360.995963
std        318648.364529
min            90.000000
25%         11267.750000
50%         26165.500000
75%         66834.250000
max      10017068.000000
Name: Population, dtype: float64



In [13]:

    
df['Population'].idxmax()









    Out[13]:





207



In [14]:

    
df.loc[207]









    Out[14]:





Disease                                    Gonorrhea
Area                              Los Angeles County
State Abbreviation                                CA
FIPS                                            6037
Year                                            2014
Race                           All races/ethnicities
Sex                                       Both sexes
Age group                             All age groups
Transmission Category    All transmission categories
Population                               1.00171e+07
Cases                                          15316
Rate                                           152.9
Name: 207, dtype: object



In [15]:

    
fig = plt.figure(figsize=(10, 6))
data = np.log10(df['Population'])
ax = data.plot.hist(50)
ax.set_xlabel("log(Population)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_population_gonorrhea.png', bbox_inches='tight', dpi=150)



In [16]:

    
fig = plt.figure(figsize=(10, 6))
data = np.log10(df['Cases']+1)
ax = data.plot.hist(50)
ax.set_xlabel("log(Cases)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_cases_gonorrhea.png', bbox_inches='tight', dpi=150)



In [17]:

    
fig = plt.figure(figsize=(10, 6))
ax = df['Rate'].plot.hist(100)
ax.set_xlabel("Rate (per 100,000 inhabitants)")
ax.set_title("Gonorrhea")
plt.savefig('../graphics/county_rate_gonorrhea.png', bbox_inches='tight', dpi=150)



In [18]:

    
outliers = df[df['Rate']<10]



In [19]:

    
outliers









    Out[19]:






  
    
      
      Disease
      Area
      State Abbreviation
      FIPS
      Year
      Race
      Sex
      Age group
      Transmission Category
      Population
      Cases
      Rate
    
  
  
    
      66
      Gonorrhea
      Winston County
      AL
      1133
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      24146
      2
      8.3
    
    
      72
      Gonorrhea
      Denali Borough
      AK
      2068
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      1867
      0
      0.0
    
    
      75
      Gonorrhea
      Haines Borough
      AK
      2100
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      2592
      0
      0.0
    
    
      88
      Gonorrhea
      Prince of Wales - Outer Ketchikan
      AK
      2201
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      5786
      0
      0.0
    
    
      93
      Gonorrhea
      Valdez-Cordova Census Area
      AK
      2261
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      9763
      0
      0.0
    
    
      96
      Gonorrhea
      Wrangell-Petersburg Census Area
      AK
      2280
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      6174
      0
      0.0
    
    
      97
      Gonorrhea
      Yakutat City and Borough
      AK
      2282
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      642
      0
      0.0
    
    
      125
      Gonorrhea
      Cleburne County
      AR
      5023
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      25686
      1
      3.9
    
    
      138
      Gonorrhea
      Fulton County
      AR
      5049
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      12304
      0
      0.0
    
    
      157
      Gonorrhea
      Madison County
      AR
      5087
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      15701
      0
      0.0
    
    
      158
      Gonorrhea
      Marion County
      AR
      5089
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      16430
      0
      0.0
    
    
      164
      Gonorrhea
      Newton County
      AR
      5101
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      8064
      0
      0.0
    
    
      166
      Gonorrhea
      Perry County
      AR
      5105
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      10345
      0
      0.0
    
    
      170
      Gonorrhea
      Polk County
      AR
      5113
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      20406
      2
      9.8
    
    
      190
      Gonorrhea
      Alpine County
      CA
      6003
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      1159
      0
      0.0
    
    
      251
      Gonorrhea
      Baca County
      CO
      8009
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      3682
      0
      0.0
    
    
      256
      Gonorrhea
      Cheyenne County
      CO
      8017
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      1890
      0
      0.0
    
    
      259
      Gonorrhea
      Costilla County
      CO
      8023
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      3518
      0
      0.0
    
    
      262
      Gonorrhea
      Delta County
      CO
      8029
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      30483
      1
      3.3
    
    
      264
      Gonorrhea
      Dolores County
      CO
      8033
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      2029
      0
      0.0
    
    
      266
      Gonorrhea
      Eagle County
      CO
      8037
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      52460
      5
      9.5
    
    
      273
      Gonorrhea
      Gunnison County
      CO
      8051
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      15507
      1
      6.4
    
    
      274
      Gonorrhea
      Hinsdale County
      CO
      8053
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      813
      0
      0.0
    
    
      276
      Gonorrhea
      Jackson County
      CO
      8057
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      1365
      0
      0.0
    
    
      278
      Gonorrhea
      Kiowa County
      CO
      8061
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      1423
      0
      0.0
    
    
      279
      Gonorrhea
      Kit Carson County
      CO
      8063
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      8037
      0
      0.0
    
    
      285
      Gonorrhea
      Logan County
      CO
      8075
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22450
      1
      4.5
    
    
      287
      Gonorrhea
      Mineral County
      CO
      8079
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      721
      0
      0.0
    
    
      290
      Gonorrhea
      Montrose County
      CO
      8085
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      40713
      1
      2.5
    
    
      292
      Gonorrhea
      Otero County
      CO
      8089
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      18703
      1
      5.3
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      3177
      Gonorrhea
      Guànica Municipio
      PR
      72055
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      17852
      0
      0.0
    
    
      3178
      Gonorrhea
      Guayama Municipio
      PR
      72057
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      43467
      2
      4.6
    
    
      3179
      Gonorrhea
      Guayanilla Municipio
      PR
      72059
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      20148
      1
      5.0
    
    
      3180
      Gonorrhea
      Guaynabo Municipio
      PR
      72061
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      92799
      6
      6.5
    
    
      3182
      Gonorrhea
      Hatillo Municipio
      PR
      72065
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      41618
      2
      4.8
    
    
      3186
      Gonorrhea
      Jayuya Municipio
      PR
      72073
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      15693
      0
      0.0
    
    
      3189
      Gonorrhea
      Lajas Municipio
      PR
      72079
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      24465
      0
      0.0
    
    
      3190
      Gonorrhea
      Lares Municipio
      PR
      72081
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      28208
      0
      0.0
    
    
      3191
      Gonorrhea
      Las MarÕas Municipio
      PR
      72083
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      9158
      0
      0.0
    
    
      3196
      Gonorrhea
      Maricao Municipio
      PR
      72093
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      6022
      0
      0.0
    
    
      3197
      Gonorrhea
      Maunabo Municipio
      PR
      72095
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      11565
      0
      0.0
    
    
      3199
      Gonorrhea
      Moca Municipio
      PR
      72099
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      38461
      0
      0.0
    
    
      3200
      Gonorrhea
      Morovis Municipio
      PR
      72101
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      32194
      1
      3.1
    
    
      3202
      Gonorrhea
      Naranjito Municipio
      PR
      72105
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      29602
      1
      3.4
    
    
      3203
      Gonorrhea
      Orocovis Municipio
      PR
      72107
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22392
      1
      4.5
    
    
      3204
      Gonorrhea
      Patillas Municipio
      PR
      72109
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      18261
      0
      0.0
    
    
      3205
      Gonorrhea
      PeÐuelas Municipio
      PR
      72111
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22365
      1
      4.5
    
    
      3207
      Gonorrhea
      Quebradillas Municipio
      PR
      72115
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      25042
      0
      0.0
    
    
      3210
      Gonorrhea
      Sabana Grande Municipio
      PR
      72121
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      24121
      0
      0.0
    
    
      3211
      Gonorrhea
      Salinas Municipio
      PR
      72123
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      29881
      2
      6.7
    
    
      3212
      Gonorrhea
      San Germàn Municipio
      PR
      72125
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      33725
      1
      3.0
    
    
      3215
      Gonorrhea
      San Sebastiàn Municipio
      PR
      72131
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      39969
      1
      2.5
    
    
      3216
      Gonorrhea
      Santa Isabel Municipio
      PR
      72133
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22860
      1
      4.4
    
    
      3217
      Gonorrhea
      Toa Alta Municipio
      PR
      72135
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      74837
      7
      9.4
    
    
      3220
      Gonorrhea
      Utuado Municipio
      PR
      72141
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      31050
      0
      0.0
    
    
      3221
      Gonorrhea
      Vega Alta Municipio
      PR
      72143
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      39236
      3
      7.6
    
    
      3222
      Gonorrhea
      Vega Baja Municipio
      PR
      72145
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      56166
      2
      3.6
    
    
      3224
      Gonorrhea
      Villalba Municipio
      PR
      72149
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      24389
      0
      0.0
    
    
      3225
      Gonorrhea
      Yabucoa Municipio
      PR
      72151
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      35879
      3
      8.4
    
    
      3226
      Gonorrhea
      Yauco Municipio
      PR
      72153
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      38782
      0
      0.0
    
  

672 rows × 12 columns



In [20]:

    
not_exclude_list = df["Cases"]<0
exclude_list = [not i for i in not_exclude_list]
df_sig = df[exclude_list].copy()



In [21]:

    
df = df_sig.copy()



In [22]:

    
df["Rate"].sort_values()









    Out[22]:





880       0.0
1654      0.0
1655      0.0
3083      0.0
1234      0.0
3089      0.0
3090      0.0
1658      0.0
660       0.0
1659      0.0
1660      0.0
2807      0.0
1651      0.0
1661      0.0
2423      0.0
1665      0.0
310       0.0
1190      0.0
308       0.0
305       0.0
675       0.0
303       0.0
302       0.0
1671      0.0
1676      0.0
1664      0.0
1650      0.0
1649      0.0
1235      0.0
620       0.0
        ...  
1152    451.9
148     456.3
1149    458.3
73      459.1
2713    462.8
2948    478.7
81      485.4
1600    486.2
2375    508.6
1983    528.9
159     529.9
1995    567.1
2944    633.1
2384    716.1
2424    801.4
2379    830.4
2420    835.8
1643    844.9
2032    850.4
2035    857.8
94      877.5
76        NaN
83        NaN
86        NaN
87        NaN
90        NaN
95        NaN
2919      NaN
3146      NaN
3148      NaN
Name: Rate, dtype: float64



In [23]:

    
len(df['Area'].unique())









    Out[23]:





1962



In [24]:

    
df.shape









    Out[24]:





(3228, 12)



In [25]:

    
df.sort_values(by='Area')









    Out[25]:






  
    
      
      Disease
      Area
      State Abbreviation
      FIPS
      Year
      Race
      Sex
      Age group
      Transmission Category
      Population
      Cases
      Rate
    
  
  
    
      2319
      Gonorrhea
      Abbeville County
      SC
      45001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      25007
      43
      172.0
    
    
      1116
      Gonorrhea
      Acadia Parish
      LA
      22001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      62204
      106
      170.4
    
    
      2823
      Gonorrhea
      Accomack County
      VA
      51001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      33148
      28
      84.5
    
    
      554
      Gonorrhea
      Ada County
      ID
      16001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      416464
      205
      49.2
    
    
      996
      Gonorrhea
      Adair County
      KY
      21001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      18732
      6
      32.0
    
    
      1486
      Gonorrhea
      Adair County
      MO
      29001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      25572
      8
      31.3
    
    
      792
      Gonorrhea
      Adair County
      IA
      19001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      7472
      2
      26.8
    
    
      2134
      Gonorrhea
      Adair County
      OK
      40001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22194
      11
      49.6
    
    
      1657
      Gonorrhea
      Adams County
      NE
      31001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      31610
      8
      25.3
    
    
      700
      Gonorrhea
      Adams County
      IN
      18001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      34614
      6
      17.3
    
    
      555
      Gonorrhea
      Adams County
      ID
      16003
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      3828
      0
      0.0
    
    
      1404
      Gonorrhea
      Adams County
      MS
      28001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      32090
      56
      174.5
    
    
      793
      Gonorrhea
      Adams County
      IA
      19003
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      3894
      0
      0.0
    
    
      598
      Gonorrhea
      Adams County
      IL
      17001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      67130
      33
      49.2
    
    
      2046
      Gonorrhea
      Adams County
      OH
      39001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      28105
      2
      7.1
    
    
      1993
      Gonorrhea
      Adams County
      ND
      38001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      2360
      1
      42.4
    
    
      247
      Gonorrhea
      Adams County
      CO
      8001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      469193
      233
      49.7
    
    
      3051
      Gonorrhea
      Adams County
      WI
      55001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      20480
      3
      14.6
    
    
      2957
      Gonorrhea
      Adams County
      WA
      53001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      19067
      10
      52.4
    
    
      2247
      Gonorrhea
      Adams County
      PA
      42001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      101546
      18
      17.7
    
    
      2809
      Gonorrhea
      Addison County
      VT
      50001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      36791
      4
      10.9
    
    
      3149
      Gonorrhea
      Adjuntas Municipio
      PR
      72001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      18900
      2
      10.6
    
    
      3150
      Gonorrhea
      Aguada Municipio
      PR
      72003
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      40329
      3
      7.4
    
    
      3151
      Gonorrhea
      Aguadilla Municipio
      PR
      72005
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      57290
      7
      12.2
    
    
      3152
      Gonorrhea
      Aguas Buenas Municipio
      PR
      72007
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      27473
      2
      7.3
    
    
      3153
      Gonorrhea
      Aibonito Municipio
      PR
      72009
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      24561
      3
      12.2
    
    
      2320
      Gonorrhea
      Aiken County
      SC
      45003
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      164176
      280
      170.5
    
    
      1317
      Gonorrhea
      Aitkin County
      MN
      27001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      15742
      5
      31.8
    
    
      323
      Gonorrhea
      Alachua County
      FL
      12001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      253451
      411
      162.2
    
    
      1893
      Gonorrhea
      Alamance County
      NC
      37001
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      154378
      262
      169.7
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      3225
      Gonorrhea
      Yabucoa Municipio
      PR
      72151
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      35879
      3
      8.4
    
    
      1991
      Gonorrhea
      Yadkin County
      NC
      37197
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      38038
      13
      34.2
    
    
      2995
      Gonorrhea
      Yakima County
      WA
      53077
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      247044
      409
      165.6
    
    
      97
      Gonorrhea
      Yakutat City and Borough
      AK
      2282
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      642
      0
      0.0
    
    
      1484
      Gonorrhea
      Yalobusha County
      MS
      28161
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      12373
      18
      145.5
    
    
      2246
      Gonorrhea
      Yamhill County
      OR
      41071
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      100725
      18
      17.9
    
    
      1992
      Gonorrhea
      Yancey County
      NC
      37199
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      17566
      2
      11.4
    
    
      2429
      Gonorrhea
      Yankton County
      SD
      46135
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      22696
      8
      35.2
    
    
      1892
      Gonorrhea
      Yates County
      NY
      36123
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      25156
      2
      8.0
    
    
      3226
      Gonorrhea
      Yauco Municipio
      PR
      72153
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      38782
      0
      0.0
    
    
      112
      Gonorrhea
      Yavapai County
      AZ
      4025
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      215133
      51
      23.7
    
    
      1485
      Gonorrhea
      Yazoo County
      MS
      28163
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      27883
      103
      369.4
    
    
      188
      Gonorrhea
      Yell County
      AR
      5149
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      21893
      8
      36.5
    
    
      1403
      Gonorrhea
      Yellow Medicine County
      MN
      27173
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      10143
      1
      9.9
    
    
      1656
      Gonorrhea
      Yellowstone County
      MT
      30111
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      154162
      101
      65.5
    
    
      2776
      Gonorrhea
      Yoakum County
      TX
      48501
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      8184
      5
      61.1
    
    
      245
      Gonorrhea
      Yolo County
      CA
      6113
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      204593
      194
      94.8
    
    
      2917
      Gonorrhea
      York County
      VA
      51199
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      66269
      34
      51.3
    
    
      2364
      Gonorrhea
      York County
      SC
      45091
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      239363
      311
      129.9
    
    
      1749
      Gonorrhea
      York County
      NE
      31185
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      13883
      5
      36.0
    
    
      1195
      Gonorrhea
      York County
      ME
      23031
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      199431
      35
      17.5
    
    
      2313
      Gonorrhea
      York County
      PA
      42133
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      438965
      294
      67.0
    
    
      2777
      Gonorrhea
      Young County
      TX
      48503
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      18341
      11
      60.0
    
    
      246
      Gonorrhea
      Yuba County
      CA
      6115
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      73340
      86
      117.3
    
    
      98
      Gonorrhea
      Yukon-Koyukuk Census Area
      AK
      2290
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      5695
      20
      351.2
    
    
      113
      Gonorrhea
      Yuma County
      AZ
      4027
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      201201
      124
      61.6
    
    
      310
      Gonorrhea
      Yuma County
      CO
      8125
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      10151
      0
      0.0
    
    
      2778
      Gonorrhea
      Zapata County
      TX
      48505
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      14390
      1
      6.9
    
    
      2779
      Gonorrhea
      Zavala County
      TX
      48507
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      12156
      5
      41.1
    
    
      2430
      Gonorrhea
      Ziebach County
      SD
      46137
      2014
      All races/ethnicities
      Both sexes
      All age groups
      All transmission categories
      2834
      11
      388.1
    
  

3228 rows × 12 columns



In [26]:

    
df['Area'].value_counts()









    Out[26]:





Washington County            30
Jefferson County             25
Franklin County              24
Jackson County               23
Lincoln County               23
Madison County               19
Montgomery County            18
Clay County                  18
Marion County                17
Monroe County                17
Union County                 17
Wayne County                 16
Grant County                 14
Warren County                14
Greene County                14
Carroll County               13
Clark County                 12
Marshall County              12
Lee County                   12
Polk County                  12
Lake County                  12
Johnson County               12
Adams County                 12
Douglas County               12
Morgan County                11
Scott County                 11
Calhoun County               11
Crawford County              11
Fayette County               11
Lawrence County              11
                             ..
Dearborn County               1
Harnett County                1
Gentry County                 1
Prince Edward County          1
Petersburg Census Area        1
Loving County                 1
East Carroll Parish           1
Ouachita County               1
Alger County                  1
Pickaway County               1
Rockland County               1
Torrance County               1
Union Parish                  1
Kimball County                1
Vanderburgh County            1
Aleutians East Borough        1
Pima County                   1
Clarion County                1
Wrangell City and Borough     1
Reynolds County               1
La Paz County                 1
Kosciusko County              1
Natrona County                1
Bremer County                 1
Galveston County              1
Refugio County                1
Estill County                 1
Nome Census Area              1
Charlevoix County             1
Schleicher County             1
Name: Area, dtype: int64



In [27]:

    
df.isnull().values.any()









    Out[27]:





True



In [28]:

    
null_list = df["Population"].isnull()
not_null_list = [not i for i in null_list]
df_clean = df[not_null_list].copy()



In [29]:

    
null_list = df_clean["Rate"].isnull()
not_null_list = [not i for i in null_list]
df_completely_clean = df_clean[not_null_list].copy()



In [30]:

    
df_completely_clean["Rate"].isnull().values.any()









    Out[30]:





False

Model the Gonorrhea rate



In [31]:

    
df_merged = pd.read_csv("../data/chlamydia_cdc_census.csv")



In [32]:

    
df_completely_clean[df_completely_clean["FIPS"]==1001].Cases[0]









    Out[32]:





48.0

Replace the number of Chlamydia cases with the number of Gonorrhea cases



In [33]:

    
#print(df_merged[df_merged["FIPS"]==1001].Cases[0])
#df_merged.set_value(1, "FIPS", 10)
for county in df_merged["FIPS"]:
    rowlist = df_merged[df_merged['FIPS'] == county].index.tolist()
    gonorrhea_cases = df_completely_clean[df_completely_clean["FIPS"] == county].Cases.tolist()
    df_merged.set_value(rowlist[0], 'Cases', gonorrhea_cases[0])
#    print(df_merged["FIPS"][rowlist[0]])
    #df_merged[df_merged["FIPS"]==county]



In [34]:

    
df_merged.head()



In [35]:

    
df_merged.describe()









    Out[35]:






  
    
      
      FIPS
      Population
      Cases
      hd01s001
      hd02s002
      hd02s005
      hd02s006
      hd02s007
      hd02s008
      hd02s009
      hd02s010
      hd02s011
      hd02s013
      hd02s015
      hd01s020
      hd02s026
      hd02s051
      hd02s078
      hd02s079
      hd02s080
      hd02s081
      hd02s089
      hd02s095
      hd02s107
      hd02s131
      hd02s132
      hd02s133
      hd02s134
      hd02s135
      hd02s136
      hd02s143
      hd02s151
      hd02s152
      hd02s153
      hd02s154
      hd02s159
      hd01s167
      hd01s168
      hd02s181
      hd02s184
      hd01vd01
      d002
      d014
      d019
      d024
      d029
      lnd110210d
    
  
  
    
      count
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
    
    
      mean
      30492.802614
      100423.684967
      103.278758
      4.460522
      19.163007
      6.908072
      5.977222
      5.819314
      5.695294
      5.899739
      6.297484
      15.002190
      13.250327
      15.985556
      40.491667
      50.003856
      49.996144
      84.076340
      8.121307
      1.550719
      1.165033
      0.082418
      1.966765
      8.417353
      96.604379
      39.254118
      20.403791
      27.047647
      20.409281
      5.134248
      3.395654
      67.828627
      27.814542
      52.057712
      19.173268
      32.171438
      2.476042
      2.988451
      72.576307
      27.423791
      46800.265359
      0.520577
      0.002492
      0.030613
      0.003221
      0.024522
      239.640924
    
    
      std
      15071.256920
      323873.098080
      492.184847
      0.634640
      2.846254
      1.135197
      2.546279
      1.223101
      0.950620
      0.878120
      0.830619
      1.506377
      2.129781
      4.166765
      4.959052
      2.204346
      2.204346
      15.045667
      13.038226
      5.071524
      2.533382
      0.967483
      1.549707
      13.303866
      4.451890
      3.484761
      2.711350
      3.484574
      3.001770
      2.163983
      4.451875
      4.990887
      4.766401
      5.948292
      4.289568
      4.990872
      0.203247
      0.181387
      7.350396
      7.350283
      12025.134705
      0.059487
      0.001291
      0.007625
      0.001295
      0.007228
      1610.657866
    
    
      min
      1001.000000
      90.000000
      0.000000
      1.913814
      0.000000
      0.000000
      1.300000
      2.300000
      2.400000
      1.200000
      2.800000
      6.300000
      4.000000
      3.500000
      22.600000
      43.200000
      27.900000
      14.200000
      0.000000
      0.000000
      0.000000
      0.000000
      0.100000
      0.000000
      45.000000
      17.600000
      7.500000
      4.400000
      0.000000
      0.700000
      0.000000
      18.800000
      0.000000
      11.600000
      0.000000
      13.400000
      1.260000
      2.000000
      1.400000
      10.200000
      19146.000000
      0.115942
      0.000000
      0.000000
      0.000000
      0.000000
      0.069674
    
    
      25%
      19006.500000
      11208.000000
      2.000000
      4.051722
      17.400000
      6.300000
      4.700000
      5.100000
      5.100000
      5.400000
      5.800000
      14.200000
      12.000000
      13.300000
      37.700000
      48.900000
      49.600000
      76.575000
      0.500000
      0.200000
      0.300000
      0.000000
      1.100000
      1.600000
      96.300000
      37.400000
      18.800000
      25.000000
      18.600000
      3.500000
      1.100000
      65.200000
      25.000000
      48.800000
      16.500000
      29.300000
      2.350000
      2.880000
      69.200000
      22.600000
      38810.000000
      0.488003
      0.001797
      0.025435
      0.002469
      0.020005
      17.078483
    
    
      50%
      29188.000000
      25987.000000
      10.000000
      4.414815
      19.100000
      6.800000
      5.500000
      5.600000
      5.600000
      5.900000
      6.300000
      15.100000
      13.100000
      15.700000
      40.500000
      49.500000
      50.500000
      89.450000
      1.900000
      0.400000
      0.500000
      0.000000
      1.600000
      3.300000
      98.200000
      39.500000
      20.700000
      27.100000
      20.400000
      4.900000
      1.800000
      68.100000
      27.450000
      52.500000
      18.700000
      31.900000
      2.450000
      2.970000
      73.800000
      26.200000
      45009.500000
      0.524803
      0.002367
      0.030222
      0.003160
      0.024365
      45.094109
    
    
      75%
      45085.500000
      67582.250000
      43.000000
      4.825413
      20.700000
      7.300000
      6.500000
      6.400000
      6.200000
      6.400000
      6.800000
      15.800000
      14.300000
      18.300000
      43.400000
      50.400000
      51.100000
      95.625000
      9.400000
      0.800000
      1.000000
      0.100000
      2.300000
      8.400000
      98.900000
      41.300000
      22.225000
      29.000000
      22.000000
      6.500000
      3.700000
      70.700000
      30.100000
      55.700000
      21.000000
      34.800000
      2.570000
      3.070000
      77.400000
      30.800000
      52013.500000
      0.556712
      0.002993
      0.035313
      0.003841
      0.028793
      114.277352
    
    
      max
      56045.000000
      10017068.000000
      15316.000000
      6.992050
      33.600000
      18.300000
      28.100000
      16.100000
      11.700000
      9.700000
      11.900000
      24.500000
      28.100000
      43.400000
      62.700000
      72.100000
      56.800000
      99.200000
      84.400000
      75.500000
      43.900000
      48.900000
      29.500000
      95.700000
      100.000000
      76.700000
      28.800000
      42.800000
      34.400000
      16.000000
      55.000000
      86.600000
      50.300000
      79.200000
      41.700000
      81.200000
      3.680000
      4.050000
      89.800000
      98.600000
      123966.000000
      0.792199
      0.022064
      0.080335
      0.017631
      0.077751
      68239.991607



In [36]:

    
df_merged.shape









    Out[36]:





(3060, 47)



In [37]:

    
plt.scatter(np.log10(df_merged["Population"]), df_merged["Cases"]/df_merged["Population"])









    Out[37]:





<matplotlib.collections.PathCollection at 0x10effb048>






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [38]:

    
plt.scatter(df_merged["FIPS"], df_merged["Cases"]/df_merged["Population"])









    Out[38]:





<matplotlib.collections.PathCollection at 0x10f053eb8>






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Plot the correlation matrix



In [39]:

    
df_all = df_merged.copy()
df_all["Rate"] = df_all["Cases"]/df_all["Population"]
corr = df_all.corr()



In [40]:

    
pearsonr = corr["Rate"]
pearsonr









    Out[40]:





FIPS         -0.074673
Population    0.200290
Cases         0.299517
hd01s001      0.326293
hd02s002      0.177230
hd02s005      0.180993
hd02s006      0.261563
hd02s007      0.389383
hd02s008      0.305542
hd02s009      0.185169
hd02s010      0.048447
hd02s011     -0.264997
hd02s013     -0.303107
hd02s015     -0.312399
hd01s020     -0.366802
hd02s026     -0.156963
hd02s051      0.156963
hd02s078     -0.677478
hd02s079      0.638579
hd02s080      0.142858
hd02s081      0.137539
hd02s089     -0.004756
hd02s095      0.167221
hd02s107      0.086671
hd02s131     -0.083829
hd02s132     -0.214553
hd02s133     -0.583597
hd02s134      0.172910
hd02s135     -0.016189
hd02s136      0.539012
hd02s143      0.083823
hd02s151     -0.037727
hd02s152      0.143635
hd02s153     -0.523363
hd02s154     -0.166232
hd02s159      0.037720
hd01s167      0.179159
hd01s168      0.279396
hd02s181     -0.419810
hd02s184      0.419816
hd01vd01     -0.149611
d002         -0.523460
d014          0.176792
d019         -0.130580
d024          0.159114
d029          0.258884
lnd110210d    0.144497
Rate          1.000000
Name: Rate, dtype: float64



In [41]:

    
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True



In [42]:

    
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=2, yticklabels=2,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

plt.savefig('../graphics/cross-correlation_gonorrhea.png', bbox_inches='tight', dpi=150)









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Make principal component analysis



In [43]:

    
from sklearn.decomposition import PCA



In [44]:

    
df_merged.describe()









    Out[44]:






  
    
      
      FIPS
      Population
      Cases
      hd01s001
      hd02s002
      hd02s005
      hd02s006
      hd02s007
      hd02s008
      hd02s009
      hd02s010
      hd02s011
      hd02s013
      hd02s015
      hd01s020
      hd02s026
      hd02s051
      hd02s078
      hd02s079
      hd02s080
      hd02s081
      hd02s089
      hd02s095
      hd02s107
      hd02s131
      hd02s132
      hd02s133
      hd02s134
      hd02s135
      hd02s136
      hd02s143
      hd02s151
      hd02s152
      hd02s153
      hd02s154
      hd02s159
      hd01s167
      hd01s168
      hd02s181
      hd02s184
      hd01vd01
      d002
      d014
      d019
      d024
      d029
      lnd110210d
    
  
  
    
      count
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
      3060.000000
    
    
      mean
      30492.802614
      100423.684967
      103.278758
      4.460522
      19.163007
      6.908072
      5.977222
      5.819314
      5.695294
      5.899739
      6.297484
      15.002190
      13.250327
      15.985556
      40.491667
      50.003856
      49.996144
      84.076340
      8.121307
      1.550719
      1.165033
      0.082418
      1.966765
      8.417353
      96.604379
      39.254118
      20.403791
      27.047647
      20.409281
      5.134248
      3.395654
      67.828627
      27.814542
      52.057712
      19.173268
      32.171438
      2.476042
      2.988451
      72.576307
      27.423791
      46800.265359
      0.520577
      0.002492
      0.030613
      0.003221
      0.024522
      239.640924
    
    
      std
      15071.256920
      323873.098080
      492.184847
      0.634640
      2.846254
      1.135197
      2.546279
      1.223101
      0.950620
      0.878120
      0.830619
      1.506377
      2.129781
      4.166765
      4.959052
      2.204346
      2.204346
      15.045667
      13.038226
      5.071524
      2.533382
      0.967483
      1.549707
      13.303866
      4.451890
      3.484761
      2.711350
      3.484574
      3.001770
      2.163983
      4.451875
      4.990887
      4.766401
      5.948292
      4.289568
      4.990872
      0.203247
      0.181387
      7.350396
      7.350283
      12025.134705
      0.059487
      0.001291
      0.007625
      0.001295
      0.007228
      1610.657866
    
    
      min
      1001.000000
      90.000000
      0.000000
      1.913814
      0.000000
      0.000000
      1.300000
      2.300000
      2.400000
      1.200000
      2.800000
      6.300000
      4.000000
      3.500000
      22.600000
      43.200000
      27.900000
      14.200000
      0.000000
      0.000000
      0.000000
      0.000000
      0.100000
      0.000000
      45.000000
      17.600000
      7.500000
      4.400000
      0.000000
      0.700000
      0.000000
      18.800000
      0.000000
      11.600000
      0.000000
      13.400000
      1.260000
      2.000000
      1.400000
      10.200000
      19146.000000
      0.115942
      0.000000
      0.000000
      0.000000
      0.000000
      0.069674
    
    
      25%
      19006.500000
      11208.000000
      2.000000
      4.051722
      17.400000
      6.300000
      4.700000
      5.100000
      5.100000
      5.400000
      5.800000
      14.200000
      12.000000
      13.300000
      37.700000
      48.900000
      49.600000
      76.575000
      0.500000
      0.200000
      0.300000
      0.000000
      1.100000
      1.600000
      96.300000
      37.400000
      18.800000
      25.000000
      18.600000
      3.500000
      1.100000
      65.200000
      25.000000
      48.800000
      16.500000
      29.300000
      2.350000
      2.880000
      69.200000
      22.600000
      38810.000000
      0.488003
      0.001797
      0.025435
      0.002469
      0.020005
      17.078483
    
    
      50%
      29188.000000
      25987.000000
      10.000000
      4.414815
      19.100000
      6.800000
      5.500000
      5.600000
      5.600000
      5.900000
      6.300000
      15.100000
      13.100000
      15.700000
      40.500000
      49.500000
      50.500000
      89.450000
      1.900000
      0.400000
      0.500000
      0.000000
      1.600000
      3.300000
      98.200000
      39.500000
      20.700000
      27.100000
      20.400000
      4.900000
      1.800000
      68.100000
      27.450000
      52.500000
      18.700000
      31.900000
      2.450000
      2.970000
      73.800000
      26.200000
      45009.500000
      0.524803
      0.002367
      0.030222
      0.003160
      0.024365
      45.094109
    
    
      75%
      45085.500000
      67582.250000
      43.000000
      4.825413
      20.700000
      7.300000
      6.500000
      6.400000
      6.200000
      6.400000
      6.800000
      15.800000
      14.300000
      18.300000
      43.400000
      50.400000
      51.100000
      95.625000
      9.400000
      0.800000
      1.000000
      0.100000
      2.300000
      8.400000
      98.900000
      41.300000
      22.225000
      29.000000
      22.000000
      6.500000
      3.700000
      70.700000
      30.100000
      55.700000
      21.000000
      34.800000
      2.570000
      3.070000
      77.400000
      30.800000
      52013.500000
      0.556712
      0.002993
      0.035313
      0.003841
      0.028793
      114.277352
    
    
      max
      56045.000000
      10017068.000000
      15316.000000
      6.992050
      33.600000
      18.300000
      28.100000
      16.100000
      11.700000
      9.700000
      11.900000
      24.500000
      28.100000
      43.400000
      62.700000
      72.100000
      56.800000
      99.200000
      84.400000
      75.500000
      43.900000
      48.900000
      29.500000
      95.700000
      100.000000
      76.700000
      28.800000
      42.800000
      34.400000
      16.000000
      55.000000
      86.600000
      50.300000
      79.200000
      41.700000
      81.200000
      3.680000
      4.050000
      89.800000
      98.600000
      123966.000000
      0.792199
      0.022064
      0.080335
      0.017631
      0.077751
      68239.991607



In [45]:

    
df_new = df_merged.drop(["FIPS","Cases", "d002", "hd02s051", "hd02s143", "hd02s159", "hd02s184", "hd01s001"], axis=1)
X = df_new.values

columns = X.shape[1]
for column in np.arange(columns):
    mean_temp = X[:,column].mean()
    std_temp = X[:,column].std()
    X[:,column] = (X[:,column]-mean_temp)/std_temp

plt.plot((np.std(X, axis=0)))









    Out[45]:





[<matplotlib.lines.Line2D at 0x1151f4518>]



In [46]:

    
X.shape









    Out[46]:





(3060, 39)



In [47]:

    
pca = PCA(n_components=39)
pca.fit(X)
pca.explained_variance_ratio_









    Out[47]:





array([  2.94525564e-01,   1.64308375e-01,   7.98070500e-02,
         6.79178134e-02,   6.47768303e-02,   4.95135898e-02,
         4.30534407e-02,   3.07112910e-02,   2.94860465e-02,
         2.71764298e-02,   2.12387110e-02,   1.65600184e-02,
         1.57898743e-02,   1.35634732e-02,   1.22893006e-02,
         1.20671080e-02,   8.77515194e-03,   8.08329827e-03,
         6.89497673e-03,   5.86716605e-03,   4.85048055e-03,
         4.22251140e-03,   3.79732597e-03,   3.26146886e-03,
         2.58523806e-03,   2.24116473e-03,   1.79765723e-03,
         1.48926992e-03,   1.15747422e-03,   9.54360190e-04,
         3.18175827e-04,   2.56011170e-04,   1.86629727e-04,
         1.76638055e-04,   1.58593900e-04,   7.52418315e-05,
         4.39366076e-05,   1.35256555e-05,   8.78706740e-06])



In [48]:

    
plt.scatter(pca.components_[:,0], pca.components_[:,1])









    Out[48]:





<matplotlib.collections.PathCollection at 0x115e925f8>






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [49]:

    
X_trans = pca.transform(X)
Y = df_merged["Cases"]/df_merged["Population"]
plt.scatter(X_trans[:,0],Y)









    Out[49]:





<matplotlib.collections.PathCollection at 0x115ef6630>






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Linear regression



In [50]:

    
from sklearn import linear_model



In [51]:

    
df_new["hd02s002"].hist()









    Out[51]:





<matplotlib.axes._subplots.AxesSubplot at 0x115227fd0>



In [52]:

    
df_new.head()



In [53]:

    
df_merged.head()

Split data set into training/test and validation data



In [54]:

    
cutoff = 1

X = df_new[df_merged["Cases"]>cutoff].values
Y = df_merged[df_merged["Cases"]>cutoff].Cases/(df_merged[df_merged["Cases"]>cutoff].Population+1.0)

X_full = df_new.values
Y_full = df_merged.Cases/(df_merged.Population+1.0)

#normalize all columns to the same normalization
columns = X.shape[1]
means = np.zeros(columns)
stds = np.zeros(columns)
for column in np.arange(columns):
    mean_temp = X[:,column].mean()
    std_temp = X[:,column].std()
    means[column] = mean_temp
    stds[column] = std_temp
    X[:,column] = (X[:,column]-mean_temp)/std_temp
    X_full[:,column] = (X_full[:,column]-mean_temp)/std_temp

Ymean = Y_full.mean()
Ystd = Y_full.std()

Y = (Y-Ymean)/Ystd
Y_full = (Y_full-Ymean)/Ystd
    
ones = np.ones(round(0.75*len(X)), dtype=bool)
zeros = np.zeros(len(X)-round(0.75*len(X)), dtype=bool)
training_list = np.hstack((ones, zeros))
np.random.shuffle(training_list)
test_list = np.zeros(len(X),dtype=bool)
test_list = np.array([not i for i in training_list])

X_train = X[training_list]
X_test = X[test_list]
Y_train = Y[training_list]
Y_test = Y[test_list]

X.shape, X_train.shape, X_test.shape, Y.shape, Y_train.shape, Y_test.shape









    Out[54]:





((2400, 39), (1800, 39), (600, 39), (2400,), (1800,), (600,))



In [55]:

    
X.shape, Y.shape









    Out[55]:





((2400, 39), (2400,))



In [56]:

    
Y_test.describe()









    Out[56]:





count    600.000000
mean       0.183267
std        1.027183
min       -0.852713
25%       -0.517351
50%       -0.207258
75%        0.565745
max        6.344650
dtype: float64



In [57]:

    
#X_weights = df_merged.values
#X_train_weights = X_weights[training_list]
weights = 1 #X_train_weights[:,2]
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train, sample_weight=weights)









    Out[57]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [58]:

    
print(regr.coef_)









    



[ 0.06999576 -0.28355523 -0.23556967 -0.36628503 -0.20079447 -0.10982781
 -0.17001598 -0.14272377 -0.13889428 -0.38876025 -0.57277059 -0.35333905
 -0.09109317 -0.03240245  0.23917996  0.03822984  0.05814973 -0.07549794
  0.06339112  0.00536592  0.1823018   0.63270391 -0.78847206 -0.85551799
  0.26370773 -0.20390549  1.11361052  0.56902164  0.38174778 -1.08045281
 -1.05423933  1.08499758 -0.13462548  0.18570043  0.04571197 -0.09023666
 -0.01984907 -0.03387758 -0.08901632]



In [59]:

    
1.0-np.sum((regr.predict(X_test)-Y_test)**2)/np.sum((Y_test-np.mean(Y_test))**2)









    Out[59]:





0.57055915646598554



In [60]:

    
plt.scatter(Y,regr.predict(X))
plt.plot(np.linspace(Y.min(),Y.max(),num=10),np.linspace(Y.min(),Y.max(),num=10),color='red')









    Out[60]:





[<matplotlib.lines.Line2D at 0x10f142588>]






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [61]:

    
print('Variance score: %.5f\t(%.5f)' % (regr.score(X_test, Y_test), regr.score(X_full, Y_full)))









    



Variance score: 0.57056	(0.59701)



In [62]:

    
from sklearn import cross_validation
cv = cross_validation.ShuffleSplit(len(Y_train), n_iter=5, test_size=0.2, random_state=0)
scores_regression = cross_validation.cross_val_score(regr, X_train, Y_train, cv=cv)
scores_regression
#cross_val_score(regr, X_train, Y_train, cv=6, n_jobs=1)
#scores









    Out[62]:





array([ 0.51001046,  0.632968  ,  0.6034146 ,  0.60008855,  0.55867592])



In [63]:

    
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_regression.mean(), scores_regression.std() * 2))









    



Accuracy: 0.58 (+/- 0.09)



In [64]:

    
from sklearn.metrics import explained_variance_score



In [65]:

    
explained_variance_score(Y_train, regr.predict(X_train))









    Out[65]:





0.6014318615994938



In [66]:

    
from sklearn.metrics import mean_absolute_error



In [67]:

    
mean_absolute_error(Y_train, regr.predict(X_train))









    Out[67]:





0.44396426675554196



In [68]:

    
from sklearn.metrics import mean_squared_error



In [69]:

    
mean_squared_error(Y_train, regr.predict(X_train))









    Out[69]:





0.41339284741867016



In [70]:

    
from sklearn.metrics import median_absolute_error



In [71]:

    
median_absolute_error(Y_train, regr.predict(X_train))









    Out[71]:





0.31700773322247322



In [72]:

    
from sklearn.metrics import r2_score



In [73]:

    
r2_score(Y_train, regr.predict(X_train))









    Out[73]:





0.6014318615994938

Polynomial regression



In [74]:

    
from sklearn.preprocessing import PolynomialFeatures



In [75]:

    
poly = PolynomialFeatures(degree=2)



In [76]:

    
X_train_poly = poly.fit_transform(X_train)



In [77]:

    
poly_regr = linear_model.LinearRegression(fit_intercept=False)
poly_regr.fit(X_train_poly, Y_train)









    Out[77]:





LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)



In [78]:

    
plt.scatter(Y,poly_regr.predict(poly.fit_transform(X)))
plt.plot(np.linspace(Y_train.min(),Y_train.max(),num=10),np.linspace(Y_train.min(),Y_train.max(),num=10),color='red')









    Out[78]:





[<matplotlib.lines.Line2D at 0x10f364828>]






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [79]:

    
print('Variance score: %.5f\t(%.5f)' % (poly_regr.score(poly.fit_transform(X_test), Y_test), poly_regr.score(poly.fit_transform(X), Y)))









    



Variance score: -99.89978	(-24.62157)

Ridge regression



In [80]:

    
from sklearn import linear_model



In [81]:

    
rregr = linear_model.Ridge(alpha=0.5)
rregr.fit(X_train, Y_train)









    Out[81]:





Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)



In [82]:

    
print('Variance score: %.5f\t(%.5f)' % (rregr.score(X_test, Y_test), rregr.score(X_full, Y_full)))









    



Variance score: 0.57282	(0.59722)



In [83]:

    
scores_rregression = cross_validation.cross_val_score(rregr, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_rregression.mean(), scores_rregression.std() * 2))









    



Accuracy: 0.58 (+/- 0.09)

Extra Trees Regression



In [84]:

    
from sklearn.ensemble import ExtraTreesRegressor



In [85]:

    
clf = ExtraTreesRegressor(n_estimators=250, bootstrap=True, oob_score=True, max_features='sqrt')
clf.fit(X_train, Y_train)









    Out[85]:





ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=250, n_jobs=1, oob_score=True, random_state=None,
          verbose=0, warm_start=False)



In [86]:

    
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))









    



Variance score: 0.62724	(0.80759)
Out of bag error score: 0.60974



In [87]:

    
scores_etregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_etregression.mean(), scores_etregression.std() * 2))









    



Accuracy: 0.62 (+/- 0.06)

Ada Boost Regressor



In [88]:

    
from sklearn.ensemble import AdaBoostRegressor



In [89]:

    
clf = AdaBoostRegressor(n_estimators=500, learning_rate=0.01, loss='linear')
clf.fit(X_train, Y_train)









    Out[89]:





AdaBoostRegressor(base_estimator=None, learning_rate=0.01, loss='linear',
         n_estimators=500, random_state=None)



In [90]:

    
print('Variance score: %.5f\t(%.5f)' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full)))









    



Variance score: 0.53526	(0.57273)



In [91]:

    
scores_adaregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_adaregression.mean(), scores_adaregression.std() * 2))









    



Accuracy: 0.55 (+/- 0.06)

Bagging regressor



In [92]:

    
from sklearn.ensemble import BaggingRegressor



In [93]:

    
clf = BaggingRegressor(n_estimators=250, bootstrap=True, oob_score=True, max_features=20)
clf.fit(X_train, Y_train)









    Out[93]:





BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=20, max_samples=1.0,
         n_estimators=250, n_jobs=1, oob_score=True, random_state=None,
         verbose=0, warm_start=False)



In [94]:

    
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))









    



Variance score: 0.65425	(0.76777)
Out of bag error score: 0.62089



In [95]:

    
scores_bagregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_bagregression.mean(), scores_bagregression.std() * 2))









    



Accuracy: 0.63 (+/- 0.07)

Gradient Boosting Regressor



In [96]:

    
from sklearn.ensemble import GradientBoostingRegressor



In [97]:

    
clf = GradientBoostingRegressor(n_estimators=250, max_features=10,max_depth=5)
clf.fit(X_train, Y_train)









    Out[97]:





GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=5, max_features=10, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=250,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)



In [98]:

    
print('Variance score: %.5f\t(%.5f)' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full)))









    



Variance score: 0.66302	(0.76559)



In [99]:

    
plt.scatter(Y_full,clf.predict(X_full))
plt.plot(np.linspace(Y_full.min(),Y_full.max(),num=10),np.linspace(Y_full.min(),Y_full.max(),num=10),color='red')









    Out[99]:





[<matplotlib.lines.Line2D at 0x1154f37b8>]






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [100]:

    
scores_gradboostregression = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_gradboostregression.mean(), scores_gradboostregression.std() * 2))









    



Accuracy: 0.63 (+/- 0.10)



In [101]:

    
clf.fit(X_train, Y_train)









    Out[101]:





GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=5, max_features=10, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=250,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)



In [102]:

    
import pickle



In [103]:

    
with open("../data/gradientboosting_params_gonorrhea.pickle", "wb") as myfile:
    pickle.dump(clf, myfile)

Random forest



In [104]:

    
from sklearn.ensemble import RandomForestRegressor



In [105]:

    
clf = RandomForestRegressor(n_estimators=250, oob_score=True, max_features='sqrt',min_samples_split=2, n_jobs=4)
clf.fit(X_train, Y_train)









    Out[105]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=4, oob_score=True, random_state=None,
           verbose=0, warm_start=False)



In [106]:

    
print('Variance score: %.5f\t(%.5f)\nOut of bag error score: %.5f' % (clf.score(X_test, Y_test), clf.score(X_full, Y_full),clf.oob_score_))









    



Variance score: 0.65100	(0.77673)
Out of bag error score: 0.61952



In [107]:

    
scores_randomforest = cross_validation.cross_val_score(clf, X_train, Y_train, cv=cv, n_jobs=4)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_randomforest.mean(), scores_randomforest.std() * 2))









    



Accuracy: 0.63 (+/- 0.08)



In [108]:

    
clf = RandomForestRegressor(n_estimators=250, oob_score=True, max_features='sqrt',min_samples_split=2, n_jobs=4)
clf.fit(X_train, Y_train)









    Out[108]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=4, oob_score=True, random_state=None,
           verbose=0, warm_start=False)



In [109]:

    
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

#ax.text(-1.5, 3, r'$R^2 = $%.2f'%(clf.oob_score_), fontsize=30)


ax.set_xlabel("CDC data", fontsize=20)
ax.set_ylabel("Model prediction", fontsize=20)

ax = plt.scatter(Y,clf.predict(X))
ax2 = plt.plot(np.linspace(Y.min(),Y.max(),10),np.linspace(Y.min(),Y.max(),10),color='red')

plt.savefig('../graphics/data_vs_model_gonorrhea.png', bbox_inches='tight', dpi=150)









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [828]:

    
from collections import OrderedDict
RANDOM_STATE = 123
ensemble_clfs = [
    ("RandomForestRegressor, max_features='sqrt'",
        RandomForestRegressor(warm_start=False, oob_score=True,
                               max_features="sqrt",
                               random_state=RANDOM_STATE)),
    ("RandomForestRegressor, max_features='log2'",
        RandomForestRegressor(warm_start=False, max_features='log2',
                               oob_score=True,
                               random_state=RANDOM_STATE)),
    ("RandomForestRegressor, max_features=None",
        RandomForestRegressor(warm_start=False, max_features=None,
                               oob_score=True,
                               random_state=RANDOM_STATE))
]

# Map a classifier name to a list of (<n_estimators>, <error rate>) pairs.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)

# Range of `n_estimators` values to explore.
min_estimators = 10
max_estimators = 250

for label, clf in ensemble_clfs:
    for i in range(min_estimators, max_estimators,10):
        clf.set_params(n_estimators=i)
        clf.fit(X_train, Y_train)

        # Record the OOB error for each `n_estimators=i` setting.
        oob_error = clf.oob_score_
        error_rate[label].append((i, oob_error))

# Generate the "OOB error rate" vs. "n_estimators" plot.
for label, clf_err in error_rate.items():
    xs, ys = zip(*clf_err)
    plt.plot(xs, ys, label=label)

plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB score")
plt.legend(loc="lower right")

plt.savefig('../graphics/features_estimators_gonorrhea.png', bbox_inches='tight', dpi=150)









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/ensemble/forest.py:687: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/ensemble/forest.py:687: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/ensemble/forest.py:687: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "



In [111]:

    
feature_importance = (np.vstack((np.arange(len(clf.feature_importances_)), clf.feature_importances_)).T)
ranking = feature_importance[feature_importance[:,1].argsort()[::-1]]



In [112]:

    
for rank, importance in ranking: 
    print(df_new.columns[rank], importance)









    



hd02s078 0.131555315591
hd02s079 0.131042669253
hd02s153 0.077605262343
hd02s133 0.0724125131037
hd02s136 0.056609677503
hd02s026 0.0381425939706
hd02s181 0.0350510997217
d019 0.0235904527678
Population 0.0232923192023
lnd110210d 0.0196776197322
hd02s154 0.0195579171815
hd02s002 0.0183205296601
d029 0.0180897502352
hd02s006 0.0178635196094
hd02s011 0.017195152399
hd02s007 0.0161211203852
hd01vd01 0.0157915705716
hd02s095 0.0156136762701
hd01s020 0.015395242309
hd02s107 0.0153760048708
d024 0.0150625900875
d014 0.015013875519
hd02s134 0.0144646508373
hd02s132 0.0143730075973
hd02s151 0.0142209752962
hd01s168 0.0138617799277
hd02s135 0.012932051565
hd02s080 0.0126747567138
hd02s008 0.0123131135207
hd02s010 0.012283776033
hd02s131 0.0117112470565
hd01s167 0.0114938183702
hd02s152 0.0108361262187
hd02s015 0.0100120041693
hd02s009 0.00969465542139
hd02s013 0.00955429544546
hd02s005 0.00896322359447
hd02s081 0.00893532837002
hd02s089 0.00329471757626



In [113]:

    
labels_dict = {
    "Population": "Population",
    "hd01s001":  "log10(Population)",
    "hd02s002":  "0-14",
    "hd02s005":  "15-19",
    "hd02s006":  "20-24",
    "hd02s007":  "25-29",
    "hd02s008":  "30-34",
    "hd02s009":  "35-39",
    "hd02s010":  "40-44",
    "hd02s011":  "45-54",
    "hd02s013":  "55-64",
    "hd02s015":  "65+",
    "hd01s020":  "Median age",
    "hd02s026":  "Male percent",
    "hd02s051":  "Female percent",
    "hd02s078":  "White",
    "hd02s079":  "Black",
    "hd02s080":  "Native",
    "hd02s081":  "Asian",
    "hd02s089":  "Pacific/Hawaiian",
    "hd02s095":  "Two or more races",
    "hd02s107":  "Hispanic",
    "hd02s131":  "In households",
    "hd02s132":  "Householder",
    "hd02s133":  "Spouse",
    "hd02s134":  "Child",
    "hd02s135":  "Child w own child under 18",
    "hd02s136":  "Other relatives",
    "hd02s143":  "In group quarters",
    "hd02s151":  "Family households",
    "hd02s152":  "Family households w own child under 18",
    "hd02s153":  "Husband-wife family",
    "hd02s154":  "Husband-wife family w own child under 18",
    "hd02s159":  "Nonfamily households",
    "hd01s167":  "Average household size",
    "hd01s168":  "Average family size",
    "hd02s181":  "Owner occupied housing units",
    "hd02s184":  "Renter occupied housing units",
    "hd01vd01":  "Median income",
    "d001":      "Total households",
    "d002":      "Husband-wife households",
    "d014":      "Unmarried-partner households: - Male householder and male partner",
    "d019":      "Unmarried-partner households: - Male householder and female partner",
    "d024":      "Unmarried-partner households: - Female householder and female partner",
    "d029":      "Unmarried-partner households: - Female householder and male partner",
    "lnd110210d": "Population density"}



In [114]:

    
ranked_list = []
ranked_labels = []
for rank, importance in ranking: 
    print(labels_dict[df_new.columns[rank]], importance)
    ranked_list.append(importance)
    ranked_labels.append(labels_dict[df_new.columns[rank]])









    



White 0.131555315591
Black 0.131042669253
Husband-wife family 0.077605262343
Spouse 0.0724125131037
Other relatives 0.056609677503
Male percent 0.0381425939706
Owner occupied housing units 0.0350510997217
Unmarried-partner households: - Male householder and female partner 0.0235904527678
Population 0.0232923192023
Population density 0.0196776197322
Husband-wife family w own child under 18 0.0195579171815
0-14 0.0183205296601
Unmarried-partner households: - Female householder and male partner 0.0180897502352
20-24 0.0178635196094
45-54 0.017195152399
25-29 0.0161211203852
Median income 0.0157915705716
Two or more races 0.0156136762701
Median age 0.015395242309
Hispanic 0.0153760048708
Unmarried-partner households: - Female householder and female partner 0.0150625900875
Unmarried-partner households: - Male householder and male partner 0.015013875519
Child 0.0144646508373
Householder 0.0143730075973
Family households 0.0142209752962
Average family size 0.0138617799277
Child w own child under 18 0.012932051565
Native 0.0126747567138
30-34 0.0123131135207
40-44 0.012283776033
In households 0.0117112470565
Average household size 0.0114938183702
Family households w own child under 18 0.0108361262187
65+ 0.0100120041693
35-39 0.00969465542139
55-64 0.00955429544546
15-19 0.00896322359447
Asian 0.00893532837002
Pacific/Hawaiian 0.00329471757626



In [115]:

    
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

ax.set_xlabel("Feature importance", fontsize=20)
ax.set_ylabel("Feature", fontsize=20)

ax = sns.barplot(x=ranked_list, y=ranked_labels)
plt.savefig('../graphics/feature_importance_ranking_gonorrhea.png', bbox_inches='tight', dpi=150)



In [116]:

    
len(ranked_list)









    Out[116]:





39

Save model parameters for use in web app:



In [110]:

    
import pickle



In [111]:

    
with open("../data/randomforest_params_gonorrhea.pickle", "wb") as myfile:
    pickle.dump(clf, myfile)



In [112]:

    
with open("../data/Ymean_gonorrhea.pickle", "wb") as myfile:
    pickle.dump(Ymean, myfile)



In [113]:

    
with open("../data/Ystd_gonorrhea.pickle", "wb") as myfile:
    pickle.dump(Ystd, myfile)



In [114]:

    
deployed_model = pickle.load(open('../data/randomforest_params_gonorrhea.pickle', "rb" ))



In [115]:

    
print('Variance score: %.5f\t(%.5f)' % (deployed_model.score(X_test, Y_test), deployed_model.score(X_full, Y_full)))









    



Variance score: 0.64313	(0.77777)

Suport Vector Regression



In [123]:

    
from sklearn.svm import SVR



In [124]:

    
svr_rbf = SVR(kernel='rbf', C=15, gamma=0.0001, epsilon=0.0005, tol=0.00001)
#svr_lin = SVR(kernel='linear', C=1, epsilon=0.001)
#svr_poly = SVR(kernel='poly', C=1, degree=2, epsilon=0.001)
svr_rbf.fit(X_train, Y_train)
#svr_lin.fit(X_train, Y_train)
#svr_poly.fit(X_train, Y_train)
#print('Variance score:\n\t%.5f\t(rbf)\n\t%.5f\t(lin)\n\t%.5f\t(poly)' % (svr_rbf.score(X_train, Y_train), svr_lin.score(X_train, Y_train),svr_poly.score(X_train, Y_train)))
print('Variance score:\n\t%.5f\t(rbf)' % (svr_rbf.score(X_test, Y_test)))









    



Variance score:
	0.49249	(rbf)



In [125]:

    
scores_svm = cross_validation.cross_val_score(svr_rbf, X_train, Y_train, cv=cv, n_jobs=1)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_svm.mean(), scores_svm.std() * 2))









    



Accuracy: 0.56 (+/- 0.05)

Model comparison



In [126]:

    
model_scores = []
model_errors = []
model_names = []
model_scores.append(np.mean(scores_regression))
model_errors.append(np.std(scores_regression)*2)
model_names.append("Linear")
model_scores.append(np.mean(scores_rregression))
model_errors.append(np.std(scores_rregression)*2)
model_names.append("Ridge")
model_scores.append(np.mean(scores_etregression))
model_errors.append(np.std(scores_etregression)*2)
model_names.append("Extra trees")
model_scores.append(np.mean(scores_adaregression))
model_errors.append(np.std(scores_adaregression)*2)
model_names.append("ADA boost")
model_scores.append(np.mean(scores_bagregression))
model_errors.append(np.std(scores_bagregression)*2)
model_names.append("Bagging")
model_scores.append(np.mean(scores_gradboostregression))
model_errors.append(np.std(scores_gradboostregression)*2)
model_names.append("Gradient boost")
model_scores.append(np.mean(scores_svm))
model_errors.append(np.std(scores_svm)*2)
model_names.append("Suport vector")
model_scores.append(np.mean(scores_randomforest))
model_errors.append(np.std(scores_randomforest)*2)
model_names.append("Random forest")



In [127]:

    
model_names, model_scores, model_errors









    Out[127]:





(['Linear',
  'Ridge',
  'Extra trees',
  'ADA boost',
  'Bagging',
  'Gradient boost',
  'Suport vector',
  'Random forest'],
 [0.58456807197975214,
  0.58724151623728538,
  0.62247664219371257,
  0.55762532394194042,
  0.63789431535633934,
  0.64292067288243726,
  0.55546802562597519,
  0.63547396612249241],
 [0.020748703236739038,
  0.022872012731342393,
  0.046991141818972265,
  0.061103844940426887,
  0.048239678568851994,
  0.045742974864002715,
  0.045432117257869865,
  0.040558816555148106])



In [128]:

    
# Set up the matplotlib figure
sns.set_context("poster", font_scale=1)
f, ax = plt.subplots(figsize=(11, 9))

ax.set_xlabel("Model", fontsize=20)
ax.set_ylabel("Cross validation score", fontsize=20)

ax = sns.barplot(x=model_names, y=model_scores, yerr=model_errors)
plt.xticks(rotation=90)
plt.savefig('../graphics/model_performance_gonorrhea.png', bbox_inches='tight', dpi=150)









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [ ]:



In [ ]:

	Disease	Area	State Abbreviation	FIPS	Year	Race	Sex	Age group	Transmission Category	Population	Cases	Rate
0	Gonorrhea	Autauga County	AL	1001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	NaN	48	86.9
1	Gonorrhea	Baldwin County	AL	1003	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	NaN	153	78.2
2	Gonorrhea	Barbour County	AL	1005	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	NaN	52	192.1
3	Gonorrhea	Bibb County	AL	1007	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	NaN	22	97.7
4	Gonorrhea	Blount County	AL	1009	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	NaN	6	10.4

	Disease	Area	State Abbreviation	FIPS	Year	Race	Sex	Age group	Transmission Category	Population	Cases	Rate
66	Gonorrhea	Winston County	AL	1133	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	24146	2	8.3
72	Gonorrhea	Denali Borough	AK	2068	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	1867	0	0.0
75	Gonorrhea	Haines Borough	AK	2100	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	2592	0	0.0
88	Gonorrhea	Prince of Wales - Outer Ketchikan	AK	2201	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	5786	0	0.0
93	Gonorrhea	Valdez-Cordova Census Area	AK	2261	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	9763	0	0.0
96	Gonorrhea	Wrangell-Petersburg Census Area	AK	2280	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	6174	0	0.0
97	Gonorrhea	Yakutat City and Borough	AK	2282	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	642	0	0.0
125	Gonorrhea	Cleburne County	AR	5023	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	25686	1	3.9
138	Gonorrhea	Fulton County	AR	5049	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	12304	0	0.0
157	Gonorrhea	Madison County	AR	5087	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	15701	0	0.0
158	Gonorrhea	Marion County	AR	5089	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	16430	0	0.0
164	Gonorrhea	Newton County	AR	5101	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	8064	0	0.0
166	Gonorrhea	Perry County	AR	5105	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	10345	0	0.0
170	Gonorrhea	Polk County	AR	5113	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	20406	2	9.8
190	Gonorrhea	Alpine County	CA	6003	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	1159	0	0.0
251	Gonorrhea	Baca County	CO	8009	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	3682	0	0.0
256	Gonorrhea	Cheyenne County	CO	8017	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	1890	0	0.0
259	Gonorrhea	Costilla County	CO	8023	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	3518	0	0.0
262	Gonorrhea	Delta County	CO	8029	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	30483	1	3.3
264	Gonorrhea	Dolores County	CO	8033	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	2029	0	0.0
266	Gonorrhea	Eagle County	CO	8037	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	52460	5	9.5
273	Gonorrhea	Gunnison County	CO	8051	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	15507	1	6.4
274	Gonorrhea	Hinsdale County	CO	8053	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	813	0	0.0
276	Gonorrhea	Jackson County	CO	8057	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	1365	0	0.0
278	Gonorrhea	Kiowa County	CO	8061	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	1423	0	0.0
279	Gonorrhea	Kit Carson County	CO	8063	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	8037	0	0.0
285	Gonorrhea	Logan County	CO	8075	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	22450	1	4.5
287	Gonorrhea	Mineral County	CO	8079	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	721	0	0.0
290	Gonorrhea	Montrose County	CO	8085	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	40713	1	2.5
292	Gonorrhea	Otero County	CO	8089	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	18703	1	5.3
...	...	...	...	...	...	...	...	...	...	...	...	...
3177	Gonorrhea	Guànica Municipio	PR	72055	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	17852	0	0.0
3178	Gonorrhea	Guayama Municipio	PR	72057	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	43467	2	4.6
3179	Gonorrhea	Guayanilla Municipio	PR	72059	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	20148	1	5.0
3180	Gonorrhea	Guaynabo Municipio	PR	72061	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	92799	6	6.5
3182	Gonorrhea	Hatillo Municipio	PR	72065	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	41618	2	4.8
3186	Gonorrhea	Jayuya Municipio	PR	72073	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	15693	0	0.0
3189	Gonorrhea	Lajas Municipio	PR	72079	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	24465	0	0.0
3190	Gonorrhea	Lares Municipio	PR	72081	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	28208	0	0.0
3191	Gonorrhea	Las MarÕas Municipio	PR	72083	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	9158	0	0.0
3196	Gonorrhea	Maricao Municipio	PR	72093	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	6022	0	0.0
3197	Gonorrhea	Maunabo Municipio	PR	72095	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	11565	0	0.0
3199	Gonorrhea	Moca Municipio	PR	72099	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	38461	0	0.0
3200	Gonorrhea	Morovis Municipio	PR	72101	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	32194	1	3.1
3202	Gonorrhea	Naranjito Municipio	PR	72105	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	29602	1	3.4
3203	Gonorrhea	Orocovis Municipio	PR	72107	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	22392	1	4.5
3204	Gonorrhea	Patillas Municipio	PR	72109	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	18261	0	0.0
3205	Gonorrhea	PeÐuelas Municipio	PR	72111	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	22365	1	4.5
3207	Gonorrhea	Quebradillas Municipio	PR	72115	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	25042	0	0.0
3210	Gonorrhea	Sabana Grande Municipio	PR	72121	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	24121	0	0.0
3211	Gonorrhea	Salinas Municipio	PR	72123	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	29881	2	6.7
3212	Gonorrhea	San Germàn Municipio	PR	72125	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	33725	1	3.0
3215	Gonorrhea	San Sebastiàn Municipio	PR	72131	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	39969	1	2.5
3216	Gonorrhea	Santa Isabel Municipio	PR	72133	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	22860	1	4.4
3217	Gonorrhea	Toa Alta Municipio	PR	72135	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	74837	7	9.4
3220	Gonorrhea	Utuado Municipio	PR	72141	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	31050	0	0.0
3221	Gonorrhea	Vega Alta Municipio	PR	72143	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	39236	3	7.6
3222	Gonorrhea	Vega Baja Municipio	PR	72145	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	56166	2	3.6
3224	Gonorrhea	Villalba Municipio	PR	72149	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	24389	0	0.0
3225	Gonorrhea	Yabucoa Municipio	PR	72151	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	35879	3	8.4
3226	Gonorrhea	Yauco Municipio	PR	72153	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	38782	0	0.0

	Disease	Area	State Abbreviation	FIPS	Year	Race	Sex	Age group	Transmission Category	Population	Cases	Rate
2319	Gonorrhea	Abbeville County	SC	45001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	25007	43	172.0
1116	Gonorrhea	Acadia Parish	LA	22001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	62204	106	170.4
2823	Gonorrhea	Accomack County	VA	51001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	33148	28	84.5
554	Gonorrhea	Ada County	ID	16001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	416464	205	49.2
996	Gonorrhea	Adair County	KY	21001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	18732	6	32.0
1486	Gonorrhea	Adair County	MO	29001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	25572	8	31.3
792	Gonorrhea	Adair County	IA	19001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	7472	2	26.8
2134	Gonorrhea	Adair County	OK	40001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	22194	11	49.6
1657	Gonorrhea	Adams County	NE	31001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	31610	8	25.3
700	Gonorrhea	Adams County	IN	18001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	34614	6	17.3
555	Gonorrhea	Adams County	ID	16003	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	3828	0	0.0
1404	Gonorrhea	Adams County	MS	28001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	32090	56	174.5
793	Gonorrhea	Adams County	IA	19003	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	3894	0	0.0
598	Gonorrhea	Adams County	IL	17001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	67130	33	49.2
2046	Gonorrhea	Adams County	OH	39001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	28105	2	7.1
1993	Gonorrhea	Adams County	ND	38001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	2360	1	42.4
247	Gonorrhea	Adams County	CO	8001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	469193	233	49.7
3051	Gonorrhea	Adams County	WI	55001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	20480	3	14.6
2957	Gonorrhea	Adams County	WA	53001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	19067	10	52.4
2247	Gonorrhea	Adams County	PA	42001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	101546	18	17.7
2809	Gonorrhea	Addison County	VT	50001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	36791	4	10.9
3149	Gonorrhea	Adjuntas Municipio	PR	72001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	18900	2	10.6
3150	Gonorrhea	Aguada Municipio	PR	72003	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	40329	3	7.4
3151	Gonorrhea	Aguadilla Municipio	PR	72005	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	57290	7	12.2
3152	Gonorrhea	Aguas Buenas Municipio	PR	72007	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	27473	2	7.3
3153	Gonorrhea	Aibonito Municipio	PR	72009	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	24561	3	12.2
2320	Gonorrhea	Aiken County	SC	45003	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	164176	280	170.5
1317	Gonorrhea	Aitkin County	MN	27001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	15742	5	31.8
323	Gonorrhea	Alachua County	FL	12001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	253451	411	162.2
1893	Gonorrhea	Alamance County	NC	37001	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	154378	262	169.7
...	...	...	...	...	...	...	...	...	...	...	...	...
3225	Gonorrhea	Yabucoa Municipio	PR	72151	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	35879	3	8.4
1991	Gonorrhea	Yadkin County	NC	37197	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	38038	13	34.2
2995	Gonorrhea	Yakima County	WA	53077	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	247044	409	165.6
97	Gonorrhea	Yakutat City and Borough	AK	2282	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	642	0	0.0
1484	Gonorrhea	Yalobusha County	MS	28161	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	12373	18	145.5
2246	Gonorrhea	Yamhill County	OR	41071	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	100725	18	17.9
1992	Gonorrhea	Yancey County	NC	37199	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	17566	2	11.4
2429	Gonorrhea	Yankton County	SD	46135	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	22696	8	35.2
1892	Gonorrhea	Yates County	NY	36123	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	25156	2	8.0
3226	Gonorrhea	Yauco Municipio	PR	72153	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	38782	0	0.0
112	Gonorrhea	Yavapai County	AZ	4025	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	215133	51	23.7
1485	Gonorrhea	Yazoo County	MS	28163	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	27883	103	369.4
188	Gonorrhea	Yell County	AR	5149	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	21893	8	36.5
1403	Gonorrhea	Yellow Medicine County	MN	27173	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	10143	1	9.9
1656	Gonorrhea	Yellowstone County	MT	30111	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	154162	101	65.5
2776	Gonorrhea	Yoakum County	TX	48501	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	8184	5	61.1
245	Gonorrhea	Yolo County	CA	6113	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	204593	194	94.8
2917	Gonorrhea	York County	VA	51199	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	66269	34	51.3
2364	Gonorrhea	York County	SC	45091	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	239363	311	129.9
1749	Gonorrhea	York County	NE	31185	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	13883	5	36.0
1195	Gonorrhea	York County	ME	23031	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	199431	35	17.5
2313	Gonorrhea	York County	PA	42133	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	438965	294	67.0
2777	Gonorrhea	Young County	TX	48503	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	18341	11	60.0
246	Gonorrhea	Yuba County	CA	6115	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	73340	86	117.3
98	Gonorrhea	Yukon-Koyukuk Census Area	AK	2290	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	5695	20	351.2
113	Gonorrhea	Yuma County	AZ	4027	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	201201	124	61.6
310	Gonorrhea	Yuma County	CO	8125	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	10151	0	0.0
2778	Gonorrhea	Zapata County	TX	48505	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	14390	1	6.9
2779	Gonorrhea	Zavala County	TX	48507	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	12156	5	41.1
2430	Gonorrhea	Ziebach County	SD	46137	2014	All races/ethnicities	Both sexes	All age groups	All transmission categories	2834	11	388.1

	FIPS	Population	Cases	hd01s001	hd02s002	hd02s005	hd02s006	hd02s007	hd02s008	hd02s009	hd02s010	hd02s011	hd02s013	hd02s015	hd01s020	hd02s026	hd02s051	hd02s078	hd02s079	hd02s080	hd02s081	hd02s089	hd02s095	hd02s107	hd02s131	hd02s132	hd02s133	hd02s134	hd02s135	hd02s136	hd02s143	hd02s151	hd02s152	hd02s153	hd02s154	hd02s159	hd01s167	hd01s168	hd02s181	hd02s184	hd01vd01	d002	d014	d019	d024	d029	lnd110210d
0	1001	55246	48	4.736962	21.8	7.9	5.6	5.8	6.1	7.6	7.5	15.0	10.7	12.0	37.0	48.7	51.3	78.5	17.7	0.4	0.9	0.1	1.6	2.4	99.2	37.1	20.8	31.8	23.6	6.1	0.8	74.5	34.9	56.2	25.3	25.5	2.68	3.13	75.4	24.6	52475	0.562138	0.003017	0.020029	0.002868	0.017704	92.781808
1	1003	195540	153	5.260703	19.0	6.4	5.2	5.6	5.9	6.3	6.6	14.8	13.5	16.9	41.1	48.9	51.1	85.7	9.4	0.7	0.7	0.0	1.5	4.4	98.7	40.2	21.9	26.8	20.1	5.6	1.3	69.9	28.0	54.5	19.9	30.1	2.46	2.93	72.5	27.5	50183	0.545409	0.002747	0.023886	0.003444	0.020292	122.920831
2	1005	27076	52	4.438653	18.0	6.3	6.5	7.3	6.6	6.6	6.6	14.7	13.2	14.3	39.0	53.1	46.9	48.0	46.9	0.4	0.4	0.1	0.9	5.1	88.4	35.8	15.6	25.7	17.7	7.8	11.6	68.4	27.4	43.7	14.4	31.6	2.47	3.01	66.8	33.2	35634	0.437169	0.002342	0.019348	0.003666	0.022200	30.563959
3	1007	22512	22	4.360120	18.4	6.7	6.5	7.0	7.2	7.6	7.1	14.8	11.9	12.6	37.8	53.7	46.3	75.8	22.0	0.3	0.1	0.1	0.9	1.8	90.3	34.7	18.2	26.8	18.7	7.5	9.7	72.3	29.5	52.5	20.1	27.7	2.60	3.09	75.6	24.4	37984	0.524582	0.001886	0.020244	0.002012	0.020370	36.101222
4	1009	57872	6	4.758321	20.2	7.0	5.4	6.0	6.0	6.8	7.0	14.1	12.6	14.6	39.0	49.5	50.5	92.6	1.3	0.5	0.2	0.1	1.2	8.1	99.1	37.6	22.8	29.2	21.3	6.4	0.9	75.0	31.1	60.6	24.1	25.0	2.63	3.07	80.6	19.4	44409	0.606034	0.001946	0.017981	0.003707	0.013440	89.615659

	FIPS	Population	Cases	hd01s001	hd02s002	hd02s005	hd02s006	hd02s007	hd02s008	hd02s009	hd02s010	hd02s011	hd02s013	hd02s015	hd01s020	hd02s026	hd02s051	hd02s078	hd02s079	hd02s080	hd02s081	hd02s089	hd02s095	hd02s107	hd02s131	hd02s132	hd02s133	hd02s134	hd02s135	hd02s136	hd02s143	hd02s151	hd02s152	hd02s153	hd02s154	hd02s159	hd01s167	hd01s168	hd02s181	hd02s184	hd01vd01	d002	d014	d019	d024	d029	lnd110210d
count	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000	3060.000000
mean	30492.802614	100423.684967	103.278758	4.460522	19.163007	6.908072	5.977222	5.819314	5.695294	5.899739	6.297484	15.002190	13.250327	15.985556	40.491667	50.003856	49.996144	84.076340	8.121307	1.550719	1.165033	0.082418	1.966765	8.417353	96.604379	39.254118	20.403791	27.047647	20.409281	5.134248	3.395654	67.828627	27.814542	52.057712	19.173268	32.171438	2.476042	2.988451	72.576307	27.423791	46800.265359	0.520577	0.002492	0.030613	0.003221	0.024522	239.640924
std	15071.256920	323873.098080	492.184847	0.634640	2.846254	1.135197	2.546279	1.223101	0.950620	0.878120	0.830619	1.506377	2.129781	4.166765	4.959052	2.204346	2.204346	15.045667	13.038226	5.071524	2.533382	0.967483	1.549707	13.303866	4.451890	3.484761	2.711350	3.484574	3.001770	2.163983	4.451875	4.990887	4.766401	5.948292	4.289568	4.990872	0.203247	0.181387	7.350396	7.350283	12025.134705	0.059487	0.001291	0.007625	0.001295	0.007228	1610.657866
min	1001.000000	90.000000	0.000000	1.913814	0.000000	0.000000	1.300000	2.300000	2.400000	1.200000	2.800000	6.300000	4.000000	3.500000	22.600000	43.200000	27.900000	14.200000	0.000000	0.000000	0.000000	0.000000	0.100000	0.000000	45.000000	17.600000	7.500000	4.400000	0.000000	0.700000	0.000000	18.800000	0.000000	11.600000	0.000000	13.400000	1.260000	2.000000	1.400000	10.200000	19146.000000	0.115942	0.000000	0.000000	0.000000	0.000000	0.069674
25%	19006.500000	11208.000000	2.000000	4.051722	17.400000	6.300000	4.700000	5.100000	5.100000	5.400000	5.800000	14.200000	12.000000	13.300000	37.700000	48.900000	49.600000	76.575000	0.500000	0.200000	0.300000	0.000000	1.100000	1.600000	96.300000	37.400000	18.800000	25.000000	18.600000	3.500000	1.100000	65.200000	25.000000	48.800000	16.500000	29.300000	2.350000	2.880000	69.200000	22.600000	38810.000000	0.488003	0.001797	0.025435	0.002469	0.020005	17.078483
50%	29188.000000	25987.000000	10.000000	4.414815	19.100000	6.800000	5.500000	5.600000	5.600000	5.900000	6.300000	15.100000	13.100000	15.700000	40.500000	49.500000	50.500000	89.450000	1.900000	0.400000	0.500000	0.000000	1.600000	3.300000	98.200000	39.500000	20.700000	27.100000	20.400000	4.900000	1.800000	68.100000	27.450000	52.500000	18.700000	31.900000	2.450000	2.970000	73.800000	26.200000	45009.500000	0.524803	0.002367	0.030222	0.003160	0.024365	45.094109
75%	45085.500000	67582.250000	43.000000	4.825413	20.700000	7.300000	6.500000	6.400000	6.200000	6.400000	6.800000	15.800000	14.300000	18.300000	43.400000	50.400000	51.100000	95.625000	9.400000	0.800000	1.000000	0.100000	2.300000	8.400000	98.900000	41.300000	22.225000	29.000000	22.000000	6.500000	3.700000	70.700000	30.100000	55.700000	21.000000	34.800000	2.570000	3.070000	77.400000	30.800000	52013.500000	0.556712	0.002993	0.035313	0.003841	0.028793	114.277352
max	56045.000000	10017068.000000	15316.000000	6.992050	33.600000	18.300000	28.100000	16.100000	11.700000	9.700000	11.900000	24.500000	28.100000	43.400000	62.700000	72.100000	56.800000	99.200000	84.400000	75.500000	43.900000	48.900000	29.500000	95.700000	100.000000	76.700000	28.800000	42.800000	34.400000	16.000000	55.000000	86.600000	50.300000	79.200000	41.700000	81.200000	3.680000	4.050000	89.800000	98.600000	123966.000000	0.792199	0.022064	0.080335	0.017631	0.077751	68239.991607