notebook.community

Edit and run



In [24]:

    
import pickle
%load_ext autoreload
%autoreload 1



In [25]:

    
import os
import yaml
import sys

os.chdir('..')



In [26]:

    
%aimport src.ingest.get_bills
%aimport src.wrangle.create_features
%aimport src.analyze.run_model
%aimport src.report.store_db



In [27]:

    
from src.ingest.get_bills import get_us_bills
from src.ingest.get_bills import get_ny_bills
from src.ingest.get_bills import get_subjects
from src.wrangle.create_features import make_feat_union
from src.analyze.run_model import create_model
from src.analyze.run_model import run_model
from src.wrangle.create_features import make_x_values
from src.wrangle.create_features import make_y_values
from src.analyze.run_model import get_y_probs
from src.report.store_db import store_us_db
from src.report.store_db import store_ny_db



In [28]:

    
from src.ingest.setup_database import US_Score
from src.ingest.setup_database import NY_Score
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine



In [29]:

    
import psycopg2
import pandas as pd
# connect:
con = None
con = psycopg2.connect(database = 'bills_db', user = 'Joel')

# query:
sql_query = """
SELECT * FROM us_bills;
"""
us_bills = pd.read_sql_query(sql_query,con)



In [30]:

    
X = make_x_values(us_bills)



In [31]:

    
import psycopg2
import pandas as pd
# connect:
con = None
con = psycopg2.connect(database = 'bills_db', user = 'Joel')

# query:
sql_query = """
SELECT * FROM ny_bills;
"""
ny_bills = pd.read_sql_query(sql_query,con)



In [32]:

    
X_ny = make_x_values(ny_bills)



In [175]:

    
model = pickle.load(open('/Users/Joel/Desktop/Insight/bill_taxonomy/models/model_Securities_2016-09-23-01-51-17.p'))



In [176]:

    
subject_name = 'Securities'



In [177]:

    
us_y_probs = model.predict_proba(X)[:, 1]



In [178]:

    
ny_y_probs = model.predict_proba(X_ny)[:, 1]



In [179]:

    
engine = create_engine('postgres://%s@localhost/%s' % ('Joel', 'bills_db'))

# Open a session and connect to the database engine
Session = sessionmaker(bind=engine)
session = Session()

for i, bill in enumerate(ny_y_probs):

    one_bill = NY_Score(subject=subject_name, bill_num=ny_bills.ix[i, 'bill_num'],
                        score=ny_y_probs[i])
    session.add(one_bill)
session.commit()
session.close()



In [180]:

    
engine = create_engine('postgres://%s@localhost/%s' % ('Joel', 'bills_db'))

# Open a session and connect to the database engine
Session = sessionmaker(bind=engine)
session = Session()

for i, bill in enumerate(us_y_probs):

    one_bill = US_Score(subject=subject_name, bill_num=us_bills.ix[i, 'bill_num'],
                        actual=bool(y[i]), score=us_y_probs[i])
    session.add(one_bill)
session.commit()
session.close()



In [33]:

    
import pickle
%load_ext autoreload
%autoreload 1

import os
import yaml
import sys

os.chdir('..')

from src.ingest.setup_database import US_Score
from src.ingest.setup_database import NY_Score
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from src.ingest.get_bills import get_us_bills
from src.ingest.get_bills import get_ny_bills
from src.ingest.get_bills import get_subjects
from src.wrangle.create_features import make_feat_union
from src.analyze.run_model import create_model
from src.analyze.run_model import run_model
from src.wrangle.create_features import make_x_values
from src.wrangle.create_features import make_y_values
from src.analyze.run_model import get_y_probs
from src.report.store_db import store_us_db
from src.report.store_db import store_ny_db









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [46]:

    
ymlfile = open("configs.yml", 'r')
cfg = yaml.load(ymlfile)
ymlfile.close()
us_bills_subset = cfg['us_bills_subset']



In [47]:

    
username = cfg['username']
dbname = cfg['dbname']



In [48]:

    
us_bills = get_us_bills(dbname, username, us_bills_subset)



In [49]:

    
subject_list = cfg['subjects']



In [50]:

    
subject_list









    Out[50]:





['Employee benefits and pensions']



In [51]:

    
X = make_x_values(us_bills)
pipe_feats = cfg['pipe_feats']



In [52]:

    
feat_un, feat_params = make_feat_union(pipe_feats, cfg)



In [53]:

    
model_type = cfg['model_type']
model = create_model(feat_un, model_type, feat_params, cfg)
        
subjects = get_subjects(dbname, username, subject_list)



In [54]:

    
y = make_y_values(us_bills, subjects, 'Employee benefits and pensions')



In [55]:

    
len(y[y==1])









    Out[55]:





14



In [58]:

    
reload(src.report.store_db)
results = []
for sub in subject_list:
    
    y = make_y_values(us_bills, subjects, sub)
    fit_mod = run_model(model, X, y, sub, cfg)
    results.append(fit_mod)

    if(cfg['store_us']):
        y_probs_us = get_y_probs(fit_mod, X)
        src.report.store_db.store_us_db(dbname, username, us_bills, sub, y_probs_us, y, cfg)

    if(cfg['store_ny']):
        ny_bills_subset = cfg['ny_bills_subset']
        ny_bills = get_ny_bills(dbname, username, ny_bills_subset)
        X_ny = make_x_values(ny_bills)
        y_probs_ny = get_y_probs(fit_mod, X_ny)
        src.report.store_db.store_ny_db(dbname, username, ny_bills, sub, y_probs_ny, cfg)









    



Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] model__C=10, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 
[CV]  model__C=10, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.943210 -  11.1s
[CV] model__C=10, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 






    



[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   11.2s






    



[CV]  model__C=10, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.786420 -  12.8s
[CV] model__C=10, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 
[CV]  model__C=10, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.953704 -  14.0s
[CV] model__C=100, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 
[CV]  model__C=100, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.922222 -  10.8s
[CV] model__C=100, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 






    



[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   49.0s






    



[CV]  model__C=100, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.793827 -  12.7s
[CV] model__C=100, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 
[CV]  model__C=100, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.919753 -  11.8s
[CV] model__C=1000000000, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 
[CV]  model__C=1000000000, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.711111 -  11.8s
[CV] model__C=1000000000, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 






    



[Parallel(n_jobs=1)]: Done   7 tasks       | elapsed:  1.4min






    



[CV]  model__C=1000000000, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.813580 -  11.2s
[CV] model__C=1000000000, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 
[CV]  model__C=1000000000, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__tfidf_title__max_features=None, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4, score=0.716049 -  11.7s






    



[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.8min finished



In [22]:

    
from sqlalchemy import create_engine
import pandas as pd
import psycopg2
dbwrite_user = 'ubuntu'
host =  'ec2-52-45-31-100.compute-1.amazonaws.com'
#host = '52-45-31-100'
dbname = 'bills_db'



In [23]:

    
engine = create_engine('postgres://%s@%s/%s' % (dbwrite_user, host, dbname))
con = psycopg2.connect(host=host, database=dbname, user=dbwrite_user)
sql_query = """
            SELECT nyb.bill_num, nyb.bill_name, ts.score
            FROM table_score as ts
            INNER JOIN ny_bills as nyb
            ON nyb.bill_num=ts.bill_num
            WHERE ts.subject='Health'
            ORDER BY ts.score DESC
            LIMIT 100;
            """
query_results = pd.read_sql_query(sql_query, con)
print(query_results[:100])
con.close()









    



   bill_num                                          bill_name  score
0     A501E  Requires health care plans and insurers to pro...    1.0
1    S3651D  Requires specific Medicaid insurance codes rel...    1.0
2     S3398  Relates to requirements for the transfer of pa...    1.0
3     K1518  Memorializing Governor Andrew M. Cuomo to proc...    1.0
4    A3072A  Creates an education and outreach program for ...    1.0
5      J128  Memorializing Governor Andrew M. Cuomo to proc...    1.0
6    S5216A  Creates an education and outreach program for ...    1.0
7     S3900  Relates to the administration of certain immun...    1.0
8      J400  Memorializing Governor Andrew M. Cuomo to proc...    1.0
9     J6425  Memorializing Governor Andrew M. Cuomo to proc...    1.0
10    J1186  Memorializing Governor Andrew M. Cuomo to proc...    1.0
11     K560  Memorializing Governor Andrew M. Cuomo to proc...    1.0
12    J1334  Memorializing Governor Andrew M. Cuomo to proc...    1.0
13   A2552A  Relates to telehealth delivery of services; re...    1.0
14    K1577  Memorializing Governor Andrew M. Cuomo to proc...    1.0
15    J3535  Memorializing Governor Andrew M. Cuomo to proc...    1.0
16    A7616  Prohibits requiring health care providers to p...    1.0
17     J525  Congratulating Genesee Cancer Assistance, Inc....    1.0
18     K580  Memorializing Governor Andrew M. Cuomo to proc...    1.0
19     J387  Memorializing Governor Andrew M. Cuomo to proc...    1.0
20    S2405  Relates to telehealth delivery of services; re...    1.0
21     K299  Memorializing Governor Andrew M. Cuomo to proc...    1.0
22    A4938  Allows reimbursements for certain services pro...    1.0
23   S4857A  Authorizes pharmacists to perform collaborativ...    1.0
24     K570  Memorializing Governor Andrew M. Cuomo to proc...    1.0
25   A1327A  Provides for the expedited utilization review ...    1.0
26   S5170A  Limits the substitution of abuse-deterrent ana...    1.0
27    J5051  Memorializing Governor Andrew M. Cuomo to proc...    1.0
28    A1567  Establishes the clinical trial access and educ...    1.0
29    J4190  Memorializing Governor Andrew M. Cuomo to proc...    1.0
..      ...                                                ...    ...
70   A10718  Relates to the reimbursement rates for complex...    1.0
71    J4431  Memorializing Governor Andrew M. Cuomo to proc...    1.0
72   A10470  Relates to establishing protections from exces...    1.0
73    K1562  Memorializing Governor Andrew M. Cuomo to proc...    1.0
74    K1520  Memorializing Governor Andrew M. Cuomo to proc...    1.0
75    K1471  Memorializing Governor Andrew M. Cuomo to proc...    1.0
76    A355A  Authorizes the commissioner of health to estab...    1.0
77   S6392A  Requires reimbursement for surgical first assi...    1.0
78    A9487  Provides for patient access to FDA approved ab...    1.0
79    J5435  Memorializing Governor Andrew M. Cuomo to proc...    1.0
80     S280  Includes nurse practitioners as a provider of ...    1.0
81    K1116  Memorializing Governor Andrew M. Cuomo to proc...    1.0
82    K1382  Memorializing Governor Andrew M. Cuomo to proc...    1.0
83    K1296  Memorializing Governor Andrew M. Cuomo to proc...    1.0
84    K1112  Recognizing April 2016, as National Donate Lif...    1.0
85   A2211A  Relates to conforming the definition of pallia...    1.0
86   S7067A  Makes care and services provided by licensed m...    1.0
87   S2809A  Provides for the insurance coverage of the syn...    1.0
88    J4772  Memorializing Governor Andrew M. Cuomo to proc...    1.0
89   A4036A  Provides for the insurance coverage of the syn...    1.0
90   A10202  Relates to the sale and possession of hypoderm...    1.0
91    S378A  Extends the protections of the medical exempti...    1.0
92    K1284  Memorializing Governor Andrew M. Cuomo to proc...    1.0
93    S7319  Relates to requiring health insurance coverage...    1.0
94    A448A  Requires managed care programs to establish pr...    1.0
95   A10476  Ensures reproductive health care during auto e...    1.0
96   A10066  Relates to coverage for maternal depression sc...    1.0
97    A9301  Requires a utilization review of a request for...    1.0
98    K1499  Memorializing Governor Andrew M. Cuomo to proc...    1.0
99    K1496  Memorializing Governor Andrew M. Cuomo to proc...    1.0

[100 rows x 3 columns]



In [59]:

    
con.close()



In [129]:

    
from sqlalchemy.orm import sessionmaker
from src.ingest.setup_database import NY_Score

Session = sessionmaker(bind=engine)
session = Session()



In [ ]:



In [133]:

    
result = session.query(NY_Score).filter(NY_Score.subject == 'Health')
session.commit()



In [134]:

    
result









    Out[134]:





<sqlalchemy.orm.query.Query at 0x116a5ed50>



In [ ]: