A first simple model


In [1]:
import pandas as pd
import pickle
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
%matplotlib inline

In [2]:
# Plain Seaborn figures with matplotlib color codes mapped to the default seaborn palette 
sns.set(style="white", color_codes=True)

In [3]:
gender_rate = {}
gender_factor = {}
gender_number = {}
gender_rate["Male"] = 278.4e-5
gender_rate["Female"] = 627.2e-5
gender_number["Male"] = 155651602
gender_number["Female"] = 160477237
rate_average = (gender_rate["Male"]*gender_number["Male"]+gender_rate["Female"]*gender_number["Male"])/(gender_number["Male"]+gender_number["Female"])
gender_factor["Male"] = gender_rate["Male"]/rate_average
gender_factor["Female"] = gender_rate["Female"]/rate_average
gender_factor["Female"], gender_factor["Male"]
race_rate = {}
race_factor = {}
race_number = {}
race_number["Native"] = 1942876.0
race_number["Asian"] = 12721721.0
race_number["Black"] = 29489649.0
race_number["Hispanic"] = 46407173.0
race_number["Multiple"] = 5145135.0
race_number["Pacific"] = 473703.0
race_number["White"] = 161443167.0
race_rate["Native"] = 689.1e-5
race_rate["Asian"] = 115.8e-5
race_rate["Black"] = 1152.6e-5
race_rate["Hispanic"] = 376.2e-5
race_rate["Multiple"] = 116.1e-5
race_rate["Pacific"] = 641.5e-5
race_rate["White"] = 187.0e-5
US_number = race_number["Native"] + race_number["Asian"] + race_number["Black"] + race_number["Hispanic"] + race_number["Multiple"] + race_number["Pacific"] + race_number["White"]
rate_average = (race_rate["Native"]*race_number["Native"]+race_rate["Asian"]*race_number["Asian"]+race_rate["Black"]*race_number["Black"]+race_rate["Hispanic"]*race_number["Hispanic"]+race_rate["Multiple"]*race_number["Multiple"]+race_rate["Pacific"]*race_number["Multiple"]+race_rate["White"]*race_number["White"])/US_number  
race_factor["Native"] = race_rate["Native"]/rate_average
race_factor["Asian"] = race_rate["Asian"]/rate_average
race_factor["Black"] = race_rate["Black"]/rate_average
race_factor["Hispanic"] = race_rate["Hispanic"]/rate_average
race_factor["Multiple"] = race_rate["Multiple"]/rate_average
race_factor["Pacific"] = race_rate["Pacific"]/rate_average
race_factor["White"] = race_rate["White"]/rate_average

age_rate = {}
age_factor = {}
age_number = {}
age_number["0-14"] = 61089123.0
age_number["15-19"] = 21158964.0
age_number["20-24"] = 22795438.0
age_number["25-29"] = 21580198.0
age_number["30-34"] = 21264389.0
age_number["35-39"] = 19603770.0
age_number["40-44"] = 20848920.0
age_number["45-54"] = 43767532.0
age_number["55-64"] = 39316431.0
age_number["65+"] = 44704074.0

age_rate["0-14"] = 20.0e-5
age_rate["15-19"] = 1804.0e-5
age_rate["20-24"] = 2484.6e-5
age_rate["25-29"] = 1176.2e-5
age_rate["30-34"] = 532.4e-5
age_rate["35-39"] = 268.0e-5
age_rate["40-44"] = 131.5e-5
age_rate["45-54"] = 56.6e-5
age_rate["55-64"] = 16.6e-5
age_rate["65+"] = 3.2e-5

US_age_number = age_number["0-14"] + age_number["15-19"] + age_number["20-24"] + age_number["25-29"] + age_number["30-34"] + age_number["35-39"] + age_number["40-44"] + age_number["45-54"] + age_number["55-64"] + age_number["65+"]
rate_average = (age_rate["0-14"]*age_number["0-14"]+age_rate["15-19"]*age_number["15-19"]+age_rate["20-24"]*age_number["20-24"]+age_rate["25-29"]*age_number["25-29"]+age_rate["30-34"]*age_number["30-34"]+age_rate["35-39"]*age_number["35-39"]+age_rate["40-44"]*age_number["40-44"]+age_rate["45-54"]*age_number["45-54"]+age_rate["55-64"]*age_number["55-64"]+age_rate["65+"]*age_number["65+"])/US_age_number  
age_factor["0-14"] = age_rate["0-14"]/rate_average
age_factor["15-19"] = age_rate["15-19"]/rate_average
age_factor["20-24"] = age_rate["20-24"]/rate_average
age_factor["25-29"] = age_rate["25-29"]/rate_average
age_factor["30-34"] = age_rate["30-34"]/rate_average
age_factor["35-39"] = age_rate["35-39"]/rate_average
age_factor["40-44"] = age_rate["40-44"]/rate_average
age_factor["45-54"] = age_rate["45-54"]/rate_average
age_factor["55-64"] = age_rate["55-64"]/rate_average
age_factor["65+"] = age_rate["65+"]/rate_average

race_factor["Native"], race_factor["Asian"], race_factor["Black"], race_factor["Hispanic"], race_factor["Multiple"], race_factor["Pacific"], race_factor["White"]
age_factor["0-14"], age_factor["15-19"], age_factor["20-24"], age_factor["25-29"], age_factor["30-34"], age_factor["35-39"], age_factor["40-44"], age_factor["45-54"], age_factor["55-64"], age_factor["65+"]


Out[3]:
(0.04390608807429472,
 3.960329144301384,
 5.454453321469633,
 2.5821170396492725,
 1.1687800645377253,
 0.5883415801955493,
 0.2886825290884878,
 0.12425422925025406,
 0.036442053101664616,
 0.007024974091887155)

In [4]:
model = pickle.load(open('../data/randomforest_params.pickle', "rb" ))

In [5]:
Ymean = pickle.load(open('../data/Ymean.pickle', "rb"))

In [6]:
Ystd = pickle.load(open('../data/Ystd.pickle', "rb"))

Connect to ZIP code census SQL data base


In [12]:
dbname = 'census_zipcode_db'
username = 'akuepper'
pswd = 'FLP@nd'

In [13]:
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print(engine.url)


postgresql://akuepper:FLP%40nd@localhost/census_zipcode_db

In [9]:
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

# query:
sql_query = """
SELECT * FROM zip_census_db WHERE geoid2='602';
"""

data_from_sql = pd.read_sql_query(sql_query,con)

data_from_sql.Population[0]


Out[9]:
-0.18798483379424599

In [10]:
sql_query = """
SELECT * FROM zip_census_db;
"""
data_from_sql = pd.read_sql_query(sql_query,con)
data_from_sql.head()


Out[10]:
geoid2 Population hd01s001 hd02s002 hd02s005 hd02s006 hd02s007 hd02s008 hd02s009 hd02s010 ... hd01s168 hd02s181 hd02s184 hd01vd01 d002 d014 d019 d024 d029 landsqmi
0 602 -0.187985 0.202049 -0.140291 0.868417 0.458393 0.369554 1.049363 0.316382 0.966525 ... 1.155430 0.683548 -0.683573 -2.522258 0.550821 -0.475777 -1.064918 -0.417037 -1.132623 0.653718
1 603 -0.147833 0.401021 0.001670 0.241583 0.222605 0.452316 0.835698 0.668510 -0.023418 ... 0.376440 -1.061493 1.061497 -2.524746 -1.301043 -0.781474 -1.519258 -0.905455 0.099090 0.876386
2 606 -0.294408 -1.124613 0.356574 0.689322 0.694182 0.866131 0.408369 0.081630 0.100325 ... 1.377999 -0.036628 0.036615 -2.705957 -0.339241 -0.015301 0.107207 -1.278090 -0.523061 -0.057227
3 610 -0.226109 -0.056754 0.072651 0.510226 0.144009 0.286791 1.049363 0.903262 0.966525 ... 0.821577 0.683548 -0.683573 -2.446622 -0.395512 -0.547272 -0.622365 -0.760020 0.785830 0.330580
4 612 -0.110268 0.547769 -0.317743 0.420679 0.183307 0.121265 0.622034 0.433758 0.224068 ... 0.320798 0.032620 -0.032634 -2.406234 -1.282855 -0.831928 -1.189636 -1.009735 0.851580 0.447418

5 rows × 46 columns


In [11]:
def calculate_rate(sql_query, con):
    target = pd.read_sql_query(sql_query,con)
    target_params = target.values[0]
    chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
    return chlamydia_rate

Race = "White"
Gender = "Male"
Age = "15-19"
Zipcode = "02139"

sql_query = "SELECT * FROM zip_census_db WHERE geoid2=%i;"%(int(Zipcode))

calculate_rate(sql_query, con)


/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-2ce88992e27f> in <module>()
     12 sql_query = "SELECT * FROM zip_census_db WHERE geoid2=%i;"%(int(Zipcode))
     13 
---> 14 calculate_rate(sql_query, con)

<ipython-input-11-2ce88992e27f> in calculate_rate(sql_query, con)
      2     target = pd.read_sql_query(sql_query,con)
      3     target_params = target.values[0]
----> 4     chlamydia_rate = model.predict(target_params[1:])*Ystd+Ymean
      5     return chlamydia_rate
      6 

/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/ensemble/forest.py in predict(self, X)
    647         """
    648         # Check data
--> 649         X = self._validate_X_predict(X)
    650 
    651         # Assign chunk of trees to jobs

/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/ensemble/forest.py in _validate_X_predict(self, X)
    317                                  "call `fit` before exploiting the model.")
    318 
--> 319         return self.estimators_[0]._validate_X_predict(X, check_input=True)
    320 
    321     @property

/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input)
    374                              " match the input. Model n_features is %s and "
    375                              " input n_features is %s "
--> 376                              % (self.n_features_, n_features))
    377 
    378         return X

ValueError: Number of features of the model must  match the input. Model n_features is 39 and  input n_features is 45 

In [33]:
# query:
sql_query = "SELECT * FROM zip_census_unnormalized_db WHERE geoid2=%i;"%(int(Zipcode))

target_unnormalized = pd.read_sql_query(sql_query,con)

TOTALNR = target_unnormalized["Population"]

if Gender == "Male":
    gender_table = "hd02s026"
else:
    gender_table = "hd02s051"

GENDERNR = TOTALNR*target_unnormalized[gender_table]/100.0

if Race == "White":
    race_table = "hd02s078"
elif Race == "Black":
    race_table = "hd02s079"
elif Race == "Native":
    race_table = "hd02s080"
elif Race == "Asian":
    race_table = "hd02s081"
elif Race == "Pacific":
    race_table = "hd02s089"
elif Race == "Multiple":
    race_table = "hd02s095"
elif Race == "Hispanic":
    race_table = "hd02s107"

RACENR = TOTALNR*target_unnormalized[race_table]/100.0

if Age == "0-14":
    age_table = "hd02s002"
elif Age == "15-19":
    age_table = "hd02s005"
elif Age == "20-24":
    age_table = "hd02s006"
elif Age == "25-29":
    age_table = "hd02s007"
elif Age == "30-34":
    age_table = "hd02s008"
elif Age == "35-39":
    age_table = "hd02s009"
elif Age == "40-44":
    age_table = "hd02s010"
elif Age == "45-54":
    age_table = "hd02s011"
elif Age == "55-64":
    age_table = "hd02s013"
elif Age == "65+":
    age_table = "hd02s015"

AGENR = TOTALNR*target_unnormalized[age_table]/100.0

sql_query = "SELECT * FROM zip_census_db WHERE geoid2=%i;"%(int(Zipcode))

zipcoderate = calculate_rate(sql_query, con)*100
genderrate = gender_rate[Gender]*100
agerate = age_rate[Age]*100
racerate = race_rate[Race]*100

the_result = (zipcoderate/TOTALNR.values + genderrate/GENDERNR.values + racerate/RACENR.values + agerate/AGENR.values)/(1.0/TOTALNR.values+1.0/GENDERNR.values+1.0/RACENR.values+1.0/AGENR.values)

d = np.array([the_result[0], genderrate, agerate, racerate, zipcoderate[0]])
d_label = np.array(["You", "Your gender", "Your age group", "Your race / ethnicity", "Your location"])
d_label


/Users/akuepper/anaconda/lib/python3.5/site-packages/sklearn/utils/validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[33]:
array(['You', 'Your gender', 'Your age group', 'Your race / ethnicity',
       'Your location'], 
      dtype='<U21')

In [35]:
sns.set(style="white", context="talk")

fig, ax = plt.subplots(1, 1, figsize=(10, 6), sharex=True)
sns.barplot(d_label, d, palette="RdBu_r", ax=ax)
ax.set_ylabel("Risk", fontsize=20)
plt.title(r'Chlamydia', fontsize=20)
ax.plot([-1, len(d)], [0,0], "k-", linewidth=1.0)
sns.despine(bottom=True)
plt.setp(fig.axes, yticks=[])
plt.tight_layout(h_pad=3)



In [ ]: