A first simple model



In [1]:

    
import pandas as pd
import pickle
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [2]:

    
# Plain Seaborn figures with matplotlib color codes mapped to the default seaborn palette 
sns.set(style="white", color_codes=True)



In [3]:

    
df_merged = pd.read_csv("../data/chlamydia_cdc_census.csv")
df_merged.drop("Cases", axis=1, inplace=True)
df_merged.head()









    Out[3]:






  
    
      
      FIPS
      Population
      hd01s001
      hd02s002
      hd02s005
      hd02s006
      hd02s007
      hd02s008
      hd02s009
      hd02s010
      ...
      hd01s168
      hd02s181
      hd02s184
      hd01vd01
      d002
      d014
      d019
      d024
      d029
      lnd110210d
    
  
  
    
      0
      1001
      55246
      4.736962
      21.8
      7.9
      5.6
      5.8
      6.1
      7.6
      7.5
      ...
      3.13
      75.4
      24.6
      52475
      0.562138
      0.003017
      0.020029
      0.002868
      0.017704
      92.781808
    
    
      1
      1003
      195540
      5.260703
      19.0
      6.4
      5.2
      5.6
      5.9
      6.3
      6.6
      ...
      2.93
      72.5
      27.5
      50183
      0.545409
      0.002747
      0.023886
      0.003444
      0.020292
      122.920831
    
    
      2
      1005
      27076
      4.438653
      18.0
      6.3
      6.5
      7.3
      6.6
      6.6
      6.6
      ...
      3.01
      66.8
      33.2
      35634
      0.437169
      0.002342
      0.019348
      0.003666
      0.022200
      30.563959
    
    
      3
      1007
      22512
      4.360120
      18.4
      6.7
      6.5
      7.0
      7.2
      7.6
      7.1
      ...
      3.09
      75.6
      24.4
      37984
      0.524582
      0.001886
      0.020244
      0.002012
      0.020370
      36.101222
    
    
      4
      1009
      57872
      4.758321
      20.2
      7.0
      5.4
      6.0
      6.0
      6.8
      7.0
      ...
      3.07
      80.6
      19.4
      44409
      0.606034
      0.001946
      0.017981
      0.003707
      0.013440
      89.615659
    
  

5 rows × 46 columns



In [4]:

    
df_zipfips= pd.read_csv("../data/ZIP_COUNTY_122014.csv", usecols={0,1})



In [5]:

    
zip2fips = dict(zip(df_zipfips["ZIP"], df_zipfips["COUNTY"]))



In [6]:

    
fips = zip2fips[10027]



In [7]:

    
target = df_merged[df_merged['FIPS']==fips]
target.shape









    Out[7]:





(1, 46)



In [8]:

    
target_params = target.values[0]



In [9]:

    
model = pickle.load(open('../data/randomforest_params.pickle', "rb" ))



In [10]:

    
model2 = pickle.load(open('../data/gradientboosting_params.pickle', "rb" ))



In [11]:

    
Ymean = pickle.load(open('../data/Ymean.pickle', "rb"))



In [12]:

    
Ystd = pickle.load(open('../data/Ystd.pickle', "rb"))



In [13]:

    
chlamydia_rate = model.predict(target_params[1:])









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)



In [14]:

    
gender_rate = {}
gender_factor = {}
gender_number = {}
gender_rate["Male"] = 278.4e-5
gender_rate["Female"] = 627.2e-5
gender_number["Male"] = 155651602
gender_number["Female"] = 160477237
rate_average = (gender_rate["Male"]*gender_number["Male"]+gender_rate["Female"]*gender_number["Male"])/(gender_number["Male"]+gender_number["Female"])
gender_factor["Male"] = gender_rate["Male"]/rate_average
gender_factor["Female"] = gender_rate["Female"]/rate_average
gender_factor["Female"], gender_factor["Male"]
race_rate = {}
race_factor = {}
race_number = {}
race_number["Native"] = 1942876.0
race_number["Asian"] = 12721721.0
race_number["Black"] = 29489649.0
race_number["Hispanic"] = 46407173.0
race_number["Multiple"] = 5145135.0
race_number["Pacific"] = 473703.0
race_number["White"] = 161443167.0
race_rate["Native"] = 689.1e-5
race_rate["Asian"] = 115.8e-5
race_rate["Black"] = 1152.6e-5
race_rate["Hispanic"] = 376.2e-5
race_rate["Multiple"] = 116.1e-5
race_rate["Pacific"] = 641.5e-5
race_rate["White"] = 187.0e-5
US_number = race_number["Native"] + race_number["Asian"] + race_number["Black"] + race_number["Hispanic"] + race_number["Multiple"] + race_number["Pacific"] + race_number["White"]
rate_average = (race_rate["Native"]*race_number["Native"]+race_rate["Asian"]*race_number["Asian"]+race_rate["Black"]*race_number["Black"]+race_rate["Hispanic"]*race_number["Hispanic"]+race_rate["Multiple"]*race_number["Multiple"]+race_rate["Pacific"]*race_number["Multiple"]+race_rate["White"]*race_number["White"])/US_number  
race_factor["Native"] = race_rate["Native"]/rate_average
race_factor["Asian"] = race_rate["Asian"]/rate_average
race_factor["Black"] = race_rate["Black"]/rate_average
race_factor["Hispanic"] = race_rate["Hispanic"]/rate_average
race_factor["Multiple"] = race_rate["Multiple"]/rate_average
race_factor["Pacific"] = race_rate["Pacific"]/rate_average
race_factor["White"] = race_rate["White"]/rate_average

age_rate = {}
age_factor = {}
age_number = {}
age_number["0-14"] = 61089123.0
age_number["15-19"] = 21158964.0
age_number["20-24"] = 22795438.0
age_number["25-29"] = 21580198.0
age_number["30-34"] = 21264389.0
age_number["35-39"] = 19603770.0
age_number["40-44"] = 20848920.0
age_number["45-54"] = 43767532.0
age_number["55-64"] = 39316431.0
age_number["65+"] = 44704074.0

age_rate["0-14"] = 20.0e-5
age_rate["15-19"] = 1804.0e-5
age_rate["20-24"] = 2484.6e-5
age_rate["25-29"] = 1176.2e-5
age_rate["30-34"] = 532.4e-5
age_rate["35-39"] = 268.0e-5
age_rate["40-44"] = 131.5e-5
age_rate["45-54"] = 56.6e-5
age_rate["55-64"] = 16.6e-5
age_rate["65+"] = 3.2e-5

US_age_number = age_number["0-14"] + age_number["15-19"] + age_number["20-24"] + age_number["25-29"] + age_number["30-34"] + age_number["35-39"] + age_number["40-44"] + age_number["45-54"] + age_number["55-64"] + age_number["65+"]
rate_average = (age_rate["0-14"]*age_number["0-14"]+age_rate["15-19"]*age_number["15-19"]+age_rate["20-24"]*age_number["20-24"]+age_rate["25-29"]*age_number["25-29"]+age_rate["30-34"]*age_number["30-34"]+age_rate["35-39"]*age_number["35-39"]+age_rate["40-44"]*age_number["40-44"]+age_rate["45-54"]*age_number["45-54"]+age_rate["55-64"]*age_number["55-64"]+age_rate["65+"]*age_number["65+"])/US_age_number  
age_factor["0-14"] = age_rate["0-14"]/rate_average
age_factor["15-19"] = age_rate["15-19"]/rate_average
age_factor["20-24"] = age_rate["20-24"]/rate_average
age_factor["25-29"] = age_rate["25-29"]/rate_average
age_factor["30-34"] = age_rate["30-34"]/rate_average
age_factor["35-39"] = age_rate["35-39"]/rate_average
age_factor["40-44"] = age_rate["40-44"]/rate_average
age_factor["45-54"] = age_rate["45-54"]/rate_average
age_factor["55-64"] = age_rate["55-64"]/rate_average
age_factor["65+"] = age_rate["65+"]/rate_average

race_factor["Native"], race_factor["Asian"], race_factor["Black"], race_factor["Hispanic"], race_factor["Multiple"], race_factor["Pacific"], race_factor["White"]
age_factor["0-14"], age_factor["15-19"], age_factor["20-24"], age_factor["25-29"], age_factor["30-34"], age_factor["35-39"], age_factor["40-44"], age_factor["45-54"], age_factor["55-64"], age_factor["65+"]









    Out[14]:





(0.04390608807429472,
 3.960329144301384,
 5.454453321469633,
 2.5821170396492725,
 1.1687800645377253,
 0.5883415801955493,
 0.2886825290884878,
 0.12425422925025406,
 0.036442053101664616,
 0.007024974091887155)



In [15]:

    
def calculate_rate(Zipcode, Race, Gender, Age):
    fips = zip2fips[int(Zipcode)]
    target = df_merged[df_merged['FIPS']==fips]
    target_params = target.values[0]
    chlamydia_rate = model.predict(target_params[1:])
    return chlamydia_rate*gender_factor[Gender]*race_factor[Race]*age_factor[Age]

Race = "White"
Gender = "Male"
Age = "35-39"
Zipcode = "02139"
print("Your individual chance of having Chlamydia is %.2f percent"%(calculate_rate(Zipcode, Race, Gender, Age)*100))









    



Your individual chance of having Chlamydia is 4.74 percent






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)

ZIP code census data base CSV file



In [16]:

    
df_zipcode = pd.read_csv("../data/census_zipcode.csv")
df_zipcode_unnormalized = pd.read_csv("../data/census_zipcode_unnormalized.csv")



In [17]:

    
df_zipcode.head()









    Out[17]:






  
    
      
      geoid2
      Population
      hd01s001
      hd02s002
      hd02s005
      hd02s006
      hd02s007
      hd02s008
      hd02s009
      hd02s010
      ...
      hd01s168
      hd02s181
      hd02s184
      hd01vd01
      d002
      d014
      d019
      d024
      d029
      landsqmi
    
  
  
    
      0
      602
      -0.187985
      0.202049
      -0.140291
      0.868417
      0.458393
      0.369554
      1.049363
      0.316382
      0.966525
      ...
      1.155430
      0.683548
      -0.683573
      -2.522258
      0.550821
      -0.475777
      -1.064918
      -0.417037
      -1.132623
      0.653718
    
    
      1
      603
      -0.147833
      0.401021
      0.001670
      0.241583
      0.222605
      0.452316
      0.835698
      0.668510
      -0.023418
      ...
      0.376440
      -1.061493
      1.061497
      -2.524746
      -1.301043
      -0.781474
      -1.519258
      -0.905455
      0.099090
      0.876386
    
    
      2
      606
      -0.294408
      -1.124613
      0.356574
      0.689322
      0.694182
      0.866131
      0.408369
      0.081630
      0.100325
      ...
      1.377999
      -0.036628
      0.036615
      -2.705957
      -0.339241
      -0.015301
      0.107207
      -1.278090
      -0.523061
      -0.057227
    
    
      3
      610
      -0.226109
      -0.056754
      0.072651
      0.510226
      0.144009
      0.286791
      1.049363
      0.903262
      0.966525
      ...
      0.821577
      0.683548
      -0.683573
      -2.446622
      -0.395512
      -0.547272
      -0.622365
      -0.760020
      0.785830
      0.330580
    
    
      4
      612
      -0.110268
      0.547769
      -0.317743
      0.420679
      0.183307
      0.121265
      0.622034
      0.433758
      0.224068
      ...
      0.320798
      0.032620
      -0.032634
      -2.406234
      -1.282855
      -0.831928
      -1.189636
      -1.009735
      0.851580
      0.447418
    
  

5 rows × 46 columns



In [18]:

    
df_zipcode_unnormalized[df_zipcode_unnormalized["geoid2"]==int(Zipcode)]









    Out[18]:






  
    
      
      geoid2
      Population
      hd01s001
      hd02s002
      hd02s005
      hd02s006
      hd02s007
      hd02s008
      hd02s009
      hd02s010
      ...
      hd01s168
      hd02s181
      hd02s184
      hd01vd01
      d002
      d014
      d019
      d024
      d029
      landsqmi
    
  
  
    
      487
      2139
      36349
      4.560492
      9.6
      6.7
      16.8
      19.1
      12.1
      7
      4.9
      ...
      2.81
      32.2
      67.8
      73819
      0.258902
      0.007714
      0.039328
      0.009229
      0.038983
      14198.828125
    
  

1 rows × 46 columns



In [19]:

    
def calculate_rate(Zipcode):
    target = df_zipcode[df_zipcode["geoid2"]==int(Zipcode)]
    target_params = target.values[0]
    chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
    return chlamydia_rate

Race = "White"
Gender = "Male"
Age = "35-39"
Zipcode = "02139"
zipcoderate = calculate_rate(Zipcode)

target_unnormalized = df_zipcode_unnormalized[df_zipcode_unnormalized["geoid2"]==int(Zipcode)]

TOTALNR = target_unnormalized["Population"]

if Gender == "Male":
    gender_table = "hd02s026"
else:
    gender_table = "hd02s051"

GENDERNR = TOTALNR*target_unnormalized[gender_table]/100.0

if Race == "White":
    race_table = "hd02s078"
elif Race == "Black":
    race_table = "hd02s079"
elif Race == "Native":
    race_table = "hd02s080"
elif Race == "Asian":
    race_table = "hd02s081"
elif Race == "Pacific":
    race_table = "hd02s089"
elif Race == "Multiple":
    race_table = "hd02s095"
elif Race == "Hispanic":
    race_table = "hd02s107"

RACENR = TOTALNR*target_unnormalized[race_table]/100.0

if Age == "0-14":
    age_table = "hd02s002"
elif Age == "15-19":
    age_table = "hd02s005"
elif Age == "20-24":
    age_table = "hd02s006"
elif Age == "25-29":
    age_table = "hd02s007"
elif Age == "30-34":
    age_table = "hd02s008"
elif Age == "35-39":
    age_table = "hd02s009"
elif Age == "40-44":
    age_table = "hd02s010"
elif Age == "45-54":
    age_table = "hd02s011"
elif Age == "55-64":
    age_table = "hd02s013"
elif Age == "65+":
    age_table = "hd02s015"

AGENR = TOTALNR*target_unnormalized[age_table]/100.0

the_result = 100*(zipcoderate/TOTALNR + gender_rate[Gender]/GENDERNR + race_rate[Race]/RACENR + age_rate[Age]/AGENR)/(1.0/TOTALNR+1.0/GENDERNR+1.0/RACENR+1.0/AGENR)

print("Your individual chance of having Chlamydia is %.2f percent"%(the_result))
TOTALNR, GENDERNR, RACENR, AGENR









    



Your individual chance of having Chlamydia is 0.28 percent






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)






    Out[19]:





(487    36349
 Name: Population, dtype: int64, 487    18283.547
 dtype: float64, 487    22645.427
 dtype: float64, 487    2544.43
 dtype: float64)

Model validation with Boston Public Health Commission data



In [20]:

    
df_boston = np.genfromtxt("../data/zipcodes_boston.txt", dtype=[('Neighborhood','O'),('zip1','i8'),('zip2','i8'),('zip3','i8'),('zip4','i8'),('zip5','i8'),('zip6','i8')], delimiter=",")



In [21]:

    
df_boston, len(df_boston)









    Out[21]:





(array([(b'HydePark', 2136, 0, 0, 0, 0, 0),
        (b'WestRoxbury', 2132, 0, 0, 0, 0, 0),
        (b'Roslindale', 2131, 0, 0, 0, 0, 0),
        (b'Mattapan', 2126, 0, 0, 0, 0, 0),
        (b'JamaicaPlain', 2130, 0, 0, 0, 0, 0),
        (b'SouthDorchester', 2124, 2122, 0, 0, 0, 0),
        (b'NorthDorchester', 2121, 2125, 0, 0, 0, 0),
        (b'Roxbury', 2119, 2120, 0, 0, 0, 0),
        (b'Fenway', 2215, 2115, 0, 0, 0, 0),
        (b'SouthEnd', 2118, 2111, 0, 0, 0, 0),
        (b'SouthBoston', 2127, 2210, 0, 0, 0, 0),
        (b'BackBay', 2199, 2116, 2110, 2113, 2114, 2109),
        (b'Charlestown', 2129, 0, 0, 0, 0, 0),
        (b'Allston', 2134, 2135, 2163, 0, 0, 0),
        (b'EastBoston', 2128, 0, 0, 0, 0, 0)], 
       dtype=[('Neighborhood', 'O'), ('zip1', '<i8'), ('zip2', '<i8'), ('zip3', '<i8'), ('zip4', '<i8'), ('zip5', '<i8'), ('zip6', '<i8')]),
 15)



In [22]:

    
predictions = np.zeros(len(df_boston))
i = 0
for hood in df_boston:
    average = 0.0
    count = 0
    if (hood["zip1"]):
        target = df_zipcode[df_zipcode["geoid2"]==int(hood["zip1"])]
        target_params = target.values[0]
        chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
        average += chlamydia_rate[0]*1e5
        count += 1
        print(hood["Neighborhood"], hood["zip1"],round(chlamydia_rate[0]*1e5))
    if (hood["zip2"]):
        target = df_zipcode[df_zipcode["geoid2"]==int(hood["zip2"])]
        target_params = target.values[0]
        chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
        average += chlamydia_rate[0]*1e5
        count += 1
        print(hood["Neighborhood"], hood["zip2"],round(chlamydia_rate[0]*1e5))
    if (hood["zip3"]):
        target = df_zipcode[df_zipcode["geoid2"]==int(hood["zip3"])]
        target_params = target.values[0]
        chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
        average += chlamydia_rate[0]*1e5
        count += 1
        print(hood["Neighborhood"], hood["zip3"],round(chlamydia_rate[0]*1e5))
    if (hood["zip4"]):
        target = df_zipcode[df_zipcode["geoid2"]==int(hood["zip4"])]
        target_params = target.values[0]
        chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
        average += chlamydia_rate[0]*1e5
        count += 1
        print(hood["Neighborhood"], hood["zip4"],round(chlamydia_rate[0]*1e5))
    if (hood["zip5"]):
        target = df_zipcode[df_zipcode["geoid2"]==int(hood["zip5"])]
        target_params = target.values[0]
        chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
        average += chlamydia_rate[0]*1e5
        count += 1
        print(hood["Neighborhood"], hood["zip5"],round(chlamydia_rate[0]*1e5))
    if (hood["zip6"]):
        target = df_zipcode[df_zipcode["geoid2"]==int(hood["zip6"])]
        target_params = target.values[0]
        chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
        average += chlamydia_rate[0]*1e5
        count += 1
        print(hood["Neighborhood"], hood["zip6"],round(chlamydia_rate[0]*1e5))
    average /= count
    predictions[i] = average
    i+=1









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)






    



b'HydePark' 2136 715.0
b'WestRoxbury' 2132 343.0
b'Roslindale' 2131 639.0
b'Mattapan' 2126 749.0
b'JamaicaPlain' 2130 551.0
b'SouthDorchester' 2124 729.0
b'SouthDorchester' 2122 705.0
b'NorthDorchester' 2121 698.0
b'NorthDorchester' 2125 665.0
b'Roxbury' 2119 696.0
b'Roxbury' 2120 586.0
b'Fenway' 2215 470.0
b'Fenway' 2115 487.0






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)






    



b'SouthEnd' 2118 429.0
b'SouthEnd' 2111 515.0
b'SouthBoston' 2127 452.0
b'SouthBoston' 2210 411.0
b'BackBay' 2199 401.0
b'BackBay' 2116 480.0
b'BackBay' 2110 324.0
b'BackBay' 2113 405.0
b'BackBay' 2114 458.0
b'BackBay' 2109 420.0
b'Charlestown' 2129 458.0
b'Allston' 2134 497.0
b'Allston' 2135 501.0
b'Allston' 2163 497.0
b'EastBoston' 2128 605.0






    



/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)



In [23]:

    
len(predictions),









    Out[23]:





(15,)



In [24]:

    
df_boston_rates = np.genfromtxt("../data/rate_boston.txt", dtype=[('Neighborhood','S20'),('mean','i8'),('error','i8')], delimiter=",")



In [25]:

    
labels = df_boston["Neighborhood"]



In [26]:

    
R2 = 1.0-np.sum((predictions-df_boston_rates["mean"])**2)/np.sum((df_boston_rates["mean"]-np.mean(df_boston_rates["mean"]))**2)
R2









    Out[26]:





0.50231504327178955



In [27]:

    
fig = plt.figure(figsize=(10, 6))
data = np.arange(len(df_boston_rates['Neighborhood']))
ax1 = plt.scatter(data,predictions)
#plt.xlabel("Neighborhood")
plt.ylabel("Chlamydia cases [per 100,000]")
ax2 = plt.errorbar(data, df_boston_rates["mean"], yerr=df_boston_rates["error"],fmt='o', color='red')
plt.xlim([-1,15])
plt.ylim([0,1400])

x = np.arange(15)
labels = ['Hyde Park', 'West Roxbury', 'Roslindale', 'Mattapan','Jamaica Plain', 'South Dorchester', 'North Dorchester', 'Roxbury','Fenway', 'South End', 'South Boston', 'Back Bay','Charlestown', 'Allston', 'East Boston']
plt.xticks(x, labels, rotation='vertical')

plt.text(12, 1200, r'$R^2 = $%.2f'%(R2), fontsize=20)

plt.savefig('../graphics/boston_comparison.png', bbox_inches='tight', dpi=150)









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Plots for website



In [28]:

    
def calculate_rate(Zipcode):
    target = df_zipcode[df_zipcode["geoid2"]==int(Zipcode)]
    target_params = target.values[0]
    chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
    return chlamydia_rate

Race = "Black"
Gender = "Male"
Age = "30-34"
Zipcode = "02474"


target_unnormalized = df_zipcode_unnormalized[df_zipcode_unnormalized["geoid2"]==int(Zipcode)]

TOTALNR = target_unnormalized["Population"]

if Gender == "Male":
    gender_table = "hd02s026"
else:
    gender_table = "hd02s051"

GENDERNR = TOTALNR*target_unnormalized[gender_table]/100.0

if Race == "White":
    race_table = "hd02s078"
elif Race == "Black":
    race_table = "hd02s079"
elif Race == "Native":
    race_table = "hd02s080"
elif Race == "Asian":
    race_table = "hd02s081"
elif Race == "Pacific":
    race_table = "hd02s089"
elif Race == "Multiple":
    race_table = "hd02s095"
elif Race == "Hispanic":
    race_table = "hd02s107"

RACENR = TOTALNR*target_unnormalized[race_table]/100.0

if Age == "0-14":
    age_table = "hd02s002"
elif Age == "15-19":
    age_table = "hd02s005"
elif Age == "20-24":
    age_table = "hd02s006"
elif Age == "25-29":
    age_table = "hd02s007"
elif Age == "30-34":
    age_table = "hd02s008"
elif Age == "35-39":
    age_table = "hd02s009"
elif Age == "40-44":
    age_table = "hd02s010"
elif Age == "45-54":
    age_table = "hd02s011"
elif Age == "55-64":
    age_table = "hd02s013"
elif Age == "65+":
    age_table = "hd02s015"

AGENR = TOTALNR*target_unnormalized[age_table]/100.0



zipcoderate = calculate_rate(Zipcode)*100
genderrate = gender_rate[Gender]*100
agerate = age_rate[Age]*100
racerate = race_rate[Race]*100

the_result = (zipcoderate/TOTALNR.values + genderrate/GENDERNR.values + racerate/RACENR.values + agerate/AGENR.values)/(1.0/TOTALNR.values+1.0/GENDERNR.values+1.0/RACENR.values+1.0/AGENR.values)

d = np.array([the_result[0], genderrate, agerate, racerate, zipcoderate[0]])
d_label = np.array(["You", "Your gender", "Your age group", "Your race / ethnicity", "Your location"])
d_label









    



/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)






    Out[28]:





array(['You', 'Your gender', 'Your age group', 'Your race / ethnicity',
       'Your location'], 
      dtype='<U21')



In [1]:

    
sns.set(style="white", context="talk")

fig, ax = plt.subplots(1, 1, figsize=(10, 6), sharex=True)
sns.barplot(d_label, d, palette="RdBu_r", ax=ax)
ax.set_ylabel("Risk", fontsize=20)
plt.title(r'Chlamydia', fontsize=20)
ax.plot([-1, len(d)], [0,0], "k-", linewidth=1.0)
sns.despine(bottom=True)
plt.setp(fig.axes, yticks=[])
plt.tight_layout(h_pad=3)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-2a9e1e284dae> in <module>()
----> 1 sns.set(style="white", context="talk")
      2 
      3 fig, ax = plt.subplots(1, 1, figsize=(10, 6), sharex=True)
      4 sns.barplot(d_label, d, palette="RdBu_r", ax=ax)
      5 ax.set_ylabel("Risk", fontsize=20)

NameError: name 'sns' is not defined



In [ ]:

	FIPS	Population	hd01s001	hd02s002	hd02s005	hd02s006	hd02s007	hd02s008	hd02s009	hd02s010	...	hd01s168	hd02s181	hd02s184	hd01vd01	d002	d014	d019	d024	d029	lnd110210d
0	1001	55246	4.736962	21.8	7.9	5.6	5.8	6.1	7.6	7.5	...	3.13	75.4	24.6	52475	0.562138	0.003017	0.020029	0.002868	0.017704	92.781808
1	1003	195540	5.260703	19.0	6.4	5.2	5.6	5.9	6.3	6.6	...	2.93	72.5	27.5	50183	0.545409	0.002747	0.023886	0.003444	0.020292	122.920831
2	1005	27076	4.438653	18.0	6.3	6.5	7.3	6.6	6.6	6.6	...	3.01	66.8	33.2	35634	0.437169	0.002342	0.019348	0.003666	0.022200	30.563959
3	1007	22512	4.360120	18.4	6.7	6.5	7.0	7.2	7.6	7.1	...	3.09	75.6	24.4	37984	0.524582	0.001886	0.020244	0.002012	0.020370	36.101222
4	1009	57872	4.758321	20.2	7.0	5.4	6.0	6.0	6.8	7.0	...	3.07	80.6	19.4	44409	0.606034	0.001946	0.017981	0.003707	0.013440	89.615659

	geoid2	Population	hd01s001	hd02s002	hd02s005	hd02s006	hd02s007	hd02s008	hd02s009	hd02s010	...	hd01s168	hd02s181	hd02s184	hd01vd01	d002	d014	d019	d024	d029	landsqmi
0	602	-0.187985	0.202049	-0.140291	0.868417	0.458393	0.369554	1.049363	0.316382	0.966525	...	1.155430	0.683548	-0.683573	-2.522258	0.550821	-0.475777	-1.064918	-0.417037	-1.132623	0.653718
1	603	-0.147833	0.401021	0.001670	0.241583	0.222605	0.452316	0.835698	0.668510	-0.023418	...	0.376440	-1.061493	1.061497	-2.524746	-1.301043	-0.781474	-1.519258	-0.905455	0.099090	0.876386
2	606	-0.294408	-1.124613	0.356574	0.689322	0.694182	0.866131	0.408369	0.081630	0.100325	...	1.377999	-0.036628	0.036615	-2.705957	-0.339241	-0.015301	0.107207	-1.278090	-0.523061	-0.057227
3	610	-0.226109	-0.056754	0.072651	0.510226	0.144009	0.286791	1.049363	0.903262	0.966525	...	0.821577	0.683548	-0.683573	-2.446622	-0.395512	-0.547272	-0.622365	-0.760020	0.785830	0.330580
4	612	-0.110268	0.547769	-0.317743	0.420679	0.183307	0.121265	0.622034	0.433758	0.224068	...	0.320798	0.032620	-0.032634	-2.406234	-1.282855	-0.831928	-1.189636	-1.009735	0.851580	0.447418