In [1]:
import psycopg2
import pandas as pd
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
import string
import re
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy
import pickle
import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
wordnet_lemmatizer = WordNetLemmatizer()
dbname = 'bills_db'
username = 'Joel'
In [2]:
import os
import yaml
import sys
os.chdir('..')
In [3]:
from src.ingest.get_bills import get_us_bills
from src.ingest.get_bills import get_ny_bills
from src.ingest.get_bills import get_subjects
from src.wrangle.create_features import make_feat_union
from src.analyze.run_model import create_model
from src.analyze.run_model import run_model
from src.wrangle.create_features import make_x_values
from src.wrangle.create_features import make_y_values
from src.wrangle.create_features import lemmatize_tokens
from src.wrangle.create_features import tokenize
from src.wrangle.create_features import my_preproc_text
from src.wrangle.create_features import my_preproc_title
from src.analyze.run_model import get_y_probs
from src.report.store_db import store_us_db
from src.report.store_db import store_ny_db
from src.report.make_roc_curve import make_roc_curve
from src.utils.get_time_stamp import get_time_stamp
In [9]:
con = psycopg2.connect(database = dbname, user = username)
sql_query = """
SELECT * FROM us_bills;
"""
us_bills = pd.read_sql_query(sql_query,con)
us_X = make_x_values(us_bills)
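# Raw term counts over the full vocabulary; only English stop words are removed
# (the custom tokenizer and preprocessor come from src.wrangle.create_features)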
us_tf_vect_raw = CountVectorizer(stop_words='english', tokenizer=tokenize, preprocessor=my_preproc_text)
us_tf_text_raw = us_tf_vect_raw.fit_transform(us_X)
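# Pruned vocabulary: drop terms that appear in fewer than 10 bills or in more than 40% of bills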
us_tf_vect_clean = CountVectorizer(stop_words='english', tokenizer=tokenize, preprocessor=my_preproc_text,
min_df=10, max_df=0.4)
us_tf_text_clean = us_tf_vect_clean.fit_transform(us_X)
pickle.dump((us_bills, us_X), open('../presentations/data/us_data.p', 'wb'))
pickle.dump((us_tf_vect_raw, us_tf_text_raw, us_tf_vect_clean, us_tf_text_clean),
open('../presentations/data/us_tf.p', 'wb'))
con = psycopg2.connect(database = dbname, user = username)
sql_query = """
SELECT * FROM ny_bills;
"""
ny_bills = pd.read_sql_query(sql_query,con)
ny_X = make_x_values(ny_bills)
ny_tf_vect_raw = CountVectorizer(stop_words='english', tokenizer=tokenize, preprocessor=my_preproc_text)
ny_tf_text_raw = ny_tf_vect_raw.fit_transform(ny_X)
ny_tf_vect_clean = CountVectorizer(stop_words='english', tokenizer=tokenize, preprocessor=my_preproc_text,
min_df=10, max_df=0.4)
ny_tf_text_clean = ny_tf_vect_clean.fit_transform(ny_X)
pickle.dump((ny_bills, ny_X), open('../presentations/data/ny_data.p', 'wb'))
pickle.dump((ny_tf_vect_raw, ny_tf_text_raw, ny_tf_vect_clean, ny_tf_text_clean),
open('../presentations/data/ny_tf.p', 'wb'))
In [8]:
us_bills, us_x = pickle.load(open('../presentations/data/us_data.p', 'rb'))
us_tf_vect_raw, us_tf_text_raw, us_tf_vect_clean, us_tf_text_clean = pickle.load(
open('../presentations/data/us_tf.p', 'rb'))
In [5]:
ny_bills, ny_x = pickle.load(open('../presentations/data/ny_data.p', 'rb'))
ny_tf_vect_raw, ny_tf_text_raw, ny_tf_vect_clean, ny_tf_text_clean = pickle.load(
open('../presentations/data/ny_tf.p', 'rb'))
In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [82]:
#plt.rcParams.keys()
In [17]:
column_sums = us_tf_text_raw.sum(axis=0)
In [137]:
label_size = 11
figsize = (10, 3)
In [18]:
sum_df = pd.DataFrame(column_sums.transpose(), index=us_tf_vect_raw.get_feature_names(), columns=['word_counts'])
In [50]:
us_top_20 = sum_df.sort_values(by='word_counts', ascending=False)[0:20]
In [39]:
plt.figure(figsize=(3,4))
plt.hist(sum_df['word_counts'], 20, log=True)
plt.ylabel("Unique Words", size=15)
plt.xlabel("Word Count", size=15)
plt.ylim(0.1)
plt.xticks(size=15)
plt.yticks(size=15)
plt.title("U.S. Word Frequency", size=15)
plt.locator_params(axis='x', nbins=3)
In [269]:
us_top_20.sort_values(by='word_counts').plot(kind='barh', legend=None, figsize=figsize)
plt.ylabel("Unique Words", size=label_size)
plt.xlabel("Word Count", size=label_size)
plt.yticks(size=label_size)
plt.xticks(size=label_size)
plt.title("Word Counts for Top 20 Words in Bills for 114th U.S. Congress", size=label_size)
Out[269]:
In [24]:
ny_column_sums = ny_tf_text_raw.sum(axis=0)
In [25]:
ny_sum_df = pd.DataFrame(ny_column_sums.transpose(), index=ny_tf_vect_raw.get_feature_names(), columns=['word_counts'])
In [34]:
ny_top_20 = ny_sum_df.sort_values(by='word_counts', ascending=False)[0:20]
plt.hist(ny_sum_df['word_counts'], 50, log=True)
plt.ylabel("Number of Unique Words with Given Word Count")
plt.xlabel("Word Count of Unique Words")
plt.ylim(0.1)
plt.title("Histogram of Word Frequency in Bills for 2015 Session of New York Legislature")
In [270]:
ny_top_20.sort_values(by='word_counts').plot(kind='barh', legend=None, figsize=figsize)
plt.ylabel("Unique Words", size=label_size)
plt.xlabel("Word Count", size=label_size)
plt.yticks(size=label_size)
plt.xticks(size=label_size)
plt.title("Word Counts for Top 20 Words in Bills for 2015 Session of New York Legislature", size=label_size)
Out[270]:
ny_all_words = [word for word in tqdm.tqdm(ny_tf_vect_raw.get_feature_names())
                for i in range(0, ny_sum_df.ix[word, 0])]
ny_one_text = " ".join(ny_all_words)
wordcloud = WordCloud().generate(ny_one_text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [40]:
us_clean_column_sums = us_tf_text_clean.sum(axis=0)
In [41]:
us_clean_sum_df = pd.DataFrame(us_clean_column_sums.transpose(), index=us_tf_vect_clean.get_feature_names(), columns=['word_counts'])
In [121]:
us_clean_top_20 = us_clean_sum_df.sort_values(by='word_counts', ascending=False)[0:20]
In [58]:
plt.figure(figsize=(3,4))
plt.hist(us_clean_sum_df['word_counts'], 20, log=True)
plt.ylabel("Unique Words", size=15)
plt.xlabel("Word Count", size=15)
plt.ylim(0.1)
plt.xticks(size=15)
plt.yticks(size=15)
plt.title("U.S. Reduced Frequency", size=15)
plt.locator_params(axis='x', nbins=3)
In [271]:
us_clean_top_20.sort_values(by='word_counts').plot(kind='barh', legend=None, figsize=figsize)
plt.ylabel("Unique Words", size=label_size)
plt.xlabel("Word Count", size=label_size)
plt.yticks(size=label_size)
plt.xticks(size=label_size)
plt.title("Cleaned Word Counts for Top 20 Words in Bills for 114th U.S. Congress", size=label_size)
Out[271]:
us_clean_all_words = [word for word in tqdm.tqdm(us_tf_vect_clean.get_feature_names())
                      for i in range(0, us_clean_sum_df.ix[word, 0])]
us_clean_one_text = " ".join(us_clean_all_words)
wordcloud = WordCloud().generate(us_clean_one_text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [125]:
ny_clean_column_sums = ny_tf_text_clean.sum(axis=0)
In [126]:
ny_clean_sum_df = pd.DataFrame(ny_clean_column_sums.transpose(), index=ny_tf_vect_clean.get_feature_names(), columns=['word_counts'])
In [127]:
ny_clean_top_20 = ny_clean_sum_df.sort_values(by='word_counts', ascending=False)[0:20]
plt.hist(ny_clean_sum_df['word_counts'], 50, log=True)
plt.ylabel("Number of Unique Words with Given Word Count")
plt.xlabel("Word Count of Unique Words")
plt.ylim(0.1)
plt.title("Histogram of Word Frequency in Bills for 2015 Session of New York Legislature")
In [272]:
ny_clean_top_20.sort_values(by='word_counts').plot(kind='barh', legend=None, figsize=figsize)
plt.ylabel("Unique Words", size=label_size)
plt.xlabel("Word Count", size=label_size)
plt.yticks(size=label_size)
plt.xticks(size=label_size)
plt.title("Cleaned Word Counts for Top 20 Words in Bills for 2015 Session of New York Legislature", size=label_size)
Out[272]:
ny_clean_all_words = [word for word in tqdm.tqdm(ny_tf_vect_clean.get_feature_names())
                      for i in range(0, ny_clean_sum_df.ix[word, 0])]
ny_clean_one_text = " ".join(ny_clean_all_words)
wordcloud = WordCloud().generate(ny_clean_one_text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
In [144]:
con = psycopg2.connect(database = dbname, user = username)
sql_str = """
SELECT bill_num, subject FROM bill_subject
WHERE subject='Health'
"""
sub_bills = pd.read_sql_query(sql_str, con)
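# Label each U.S. bill by whether it was assigned the 'Health' subject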
y_health = make_y_values(us_bills, sub_bills, 'Health')
In [189]:
sql_str = """
SELECT bill_num, subject FROM bill_subject
WHERE subject='Intellectual property'
"""
sub_bills = pd.read_sql_query(sql_str, con)
y_ip = make_y_values(us_bills, sub_bills, 'Intellectual property')
In [162]:
lr = LogisticRegression(penalty='l2', C=10)
In [164]:
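# Chain the pruned count vectorizer and the logistic regression into a single estimator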
pipeline = Pipeline(steps=[("tf", us_tf_vect_clean), ('lr', lr)])
In [11]:
ymlfile = open("configs.yml", 'r')
cfg = yaml.load(ymlfile)
ymlfile.close()
In [191]:
import src.report.make_roc_curve
reload(src.report.make_roc_curve)
make_roc_curve(pipeline, us_x, y_ip, 0.9, 'Intellectual Property', cfg)
In [175]:
results_health = pickle.load(open('../presentations/figures/roc_health_tf_2016-09-24-13-52-01.p', 'rb'))
In [179]:
results_health[4]
Out[179]:
In [193]:
results_ip = pickle.load(open('../presentations/figures/split_data_intellectual property_2016-09-24-14-49-24.p', 'rb'))
In [194]:
results_ip[4]
Out[194]:
Next, we move from raw counts to TF-IDF vectors for each bill, so that terms appearing in nearly every bill are down-weighted relative to their raw counts.
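As a quick sanity check, here is a toy example (made-up sentences, not the bill data) of how TF-IDF weighting differs from raw counts: terms that appear in most documents are down-weighted, so the summed "densities" plotted below emphasize distinctive vocabulary rather than boilerplate.
In [ ]:
# Toy illustration only -- the sentences are invented, not drawn from the bills
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
toy_docs = ["the act amends the law",
            "the act repeals the law",
            "the budget act"]
print(CountVectorizer().fit_transform(toy_docs).toarray())   # raw term counts per document
print(TfidfVectorizer().fit_transform(toy_docs).toarray())   # L2-normalized TF-IDF weights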
us_tfidf_vect = TfidfVectorizer(stop_words='english', tokenizer=tokenize, preprocessor=my_preproc_text,
                                min_df=10, max_df=0.4)
us_tfidf_text = us_tfidf_vect.fit_transform(us_x)
pickle.dump((us_tfidf_vect, us_tfidf_text), open('../presentations/data/us_tfidf.p', 'wb'))
ny_tfidf_vect = TfidfVectorizer(stop_words='english', tokenizer=tokenize, preprocessor=my_preproc_text,
                                min_df=10, max_df=0.4)
ny_tfidf_text = ny_tfidf_vect.fit_transform(ny_x)
pickle.dump((ny_tfidf_vect, ny_tfidf_text), open('../presentations/data/ny_tfidf.p', 'wb'))
In [63]:
us_tfidf_vect, us_tfidf_text = pickle.load(open('../presentations/data/us_tfidf.p', 'rb'))
In [200]:
ny_tfidf_vect, ny_tfidf_text = pickle.load(open('../presentations/data/ny_tfidf.p', 'rb'))
In [64]:
tfidf_us_column_sums = us_tfidf_text.sum(axis=0)
In [65]:
tfidf_us_sum_df = pd.DataFrame(tfidf_us_column_sums.transpose(), index=us_tfidf_vect.get_feature_names(), columns=['word_counts'])
In [204]:
tfidf_us_top_20 = tfidf_us_sum_df.sort_values(by='word_counts', ascending=False)[0:20]
In [69]:
plt.figure(figsize=(3,4))
plt.hist(tfidf_us_sum_df['word_counts'], 20, log=True)
plt.ylabel("Word Count", size=15)
plt.xlabel("Densities", size=15)
plt.ylim(0.1)
plt.xticks(size=15)
plt.yticks(size=15)
plt.title("U.S. Word Densities", size=15)
plt.locator_params(axis='x', nbins=3)
In [274]:
tfidf_us_top_20.sort_values(by='word_counts').plot(kind='barh', legend=None, figsize=(6,5))
plt.ylabel("Unique Words", size=label_size+2)
plt.xlabel("Word Density", size=label_size+2)
plt.yticks(size=label_size+2)
plt.xticks(size=label_size+2)
plt.title("Top 20 Word Densities in Bills for 114th U.S. Congress", size=label_size+2)
Out[274]:
In [209]:
tfidf_ny_column_sums = ny_tfidf_text.sum(axis=0)
In [210]:
tfidf_ny_sum_df = pd.DataFrame(tfidf_ny_column_sums.transpose(), index=ny_tfidf_vect.get_feature_names(), columns=['word_counts'])
In [211]:
tfidf_ny_top_20 = tfidf_ny_sum_df.sort_values(by='word_counts', ascending=False)[0:20]
plt.hist(tfidf_ny_sum_df['word_counts'], 50, log=True)
plt.ylabel("Count of Words with Given Density")
plt.xlabel("Densities of Unique Words")
plt.ylim(0.1)
plt.title("Histogram of Word Densities in Bills for 2015 Session of New York Legislature")
In [275]:
tfidf_ny_top_20.plot(kind='barh', legend=None, figsize=figsize)
plt.ylabel("Unique Words", size=label_size)
plt.xlabel("Word Density", size=label_size)
plt.title("Top Word Densities in Bills for 2015 New York Legislative Session", size=label_size)
Out[275]:
import numpy as np
import matplotlib.pyplot as plt

N = 10
data = np.random.random((N, 4))
labels = ['point{0}'.format(i) for i in range(N)]
plt.subplots_adjust(bottom=0.1)
plt.scatter(data[:, 0], data[:, 1], marker='o', c=data[:, 2],
            s=data[:, 3] * 1500, cmap=plt.get_cmap('Spectral'))
for label, x, y in zip(labels, data[:, 0], data[:, 1]):
    plt.annotate(
        label, xy=(x, y), xytext=(-20, 20), textcoords='offset points',
        ha='right', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
plt.show()
tfidf_ny_top_10 = tfidf_ny_sum_df.sort_values(by='word_counts', ascending=False)[0:10]
ny_tfs = ny_clean_sum_df[ny_clean_sum_df.index.isin(tfidf_ny_top_10.index)]
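# Rough per-word IDF factor: summed TF-IDF weight divided by the raw term count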
ny_idfs = tfidf_ny_top_10/ny_tfs
labels = ny_tfs.sort_index().index
y = ny_tfs.sort_index()['word_counts']
x = ny_idfs.sort_index()['word_counts']
plt.scatter(x, y, marker='o',
            s=tfidf_ny_top_10.sort_index()['word_counts'] * 0.5,
            c=tfidf_ny_top_10.sort_index()['word_counts'],
            cmap=plt.get_cmap('Spectral_r'))
for label, xi, yi in zip(labels, x, y):
    plt.annotate(
        label, xy=(xi, yi), xytext=(-40, 40), textcoords='offset points',
        ha='left', va='bottom',
        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
plt.xlim(0.005, 0.02)
plt.show()
In [227]:
tfidf_health = pickle.load(open('/Users/Joel/Desktop/Insight/bill_taxonomy/models/model_Health_2016-09-23-13-22-32.p'))
In [229]:
make_roc_curve(tfidf_health.best_estimator_, us_x, y_health, 0.9, 'Health', cfg)
In [230]:
final_health = pickle.load(open('../presentations/figures/split_data_health_2016-09-24-16-25-03.p'))
In [231]:
final_health[4]
Out[231]:
In [75]:
tfidf_ip = pickle.load(open('/Users/Joel/Desktop/Insight/bill_taxonomy/models/presentation_models/model_Intellectual property_2016-09-23-15-07-14.p'))
In [76]:
tfidf_ip.best_score_
Out[76]:
In [234]:
make_roc_curve(tfidf_ip.best_estimator_, us_x, y_ip, 0.8, 'Intellectual Property', cfg)
In [71]:
final_ip = pickle.load(open('../presentations/figures/split_data_intellectual property_2016-09-24-17-00-41.p'))
In [236]:
final_ip[4]
Out[236]:
In [4]:
final_tax = pickle.load(open('models/model_Taxation_2016-09-26-08-30-51.p'))
In [6]:
final_tax.best_params_
Out[6]:
In [10]:
sql_str = """
SELECT bill_num, subject FROM bill_subject
WHERE subject='Taxation'
"""
sub_bills = pd.read_sql_query(sql_str, con)
y_tax = make_y_values(us_bills, sub_bills, 'Taxation')
In [12]:
make_roc_curve(final_tax.best_estimator_, us_x, y_tax, 0.8, 'Taxation', cfg)
In [14]:
subject = "Bank accounts, deposits, capital"
if subject.split(' ')[0] == 'Bank':
    subject = subject.replace('capital', 'and capital')
subject = subject.replace(' ', '_')
subject = subject.replace(',', '')
print subject
In [77]:
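# Second step of the best pipeline is the fitted logistic regression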
best_est_lr = tfidf_ip.best_estimator_.steps[1][1]
In [82]:
feats = tfidf_ip.best_estimator_.steps[0][1]
In [83]:
feat_names = feats.get_feature_names()
In [84]:
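# Pair each vocabulary term with its logistic regression coefficient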
weights = [(feat_names[i], best_est_lr.coef_[0][i]) for i in tqdm.tqdm(range(0, len(best_est_lr.coef_[0])))]
In [90]:
sort_weights = sorted(weights, key=lambda (a,b): abs(b), reverse=True)[0:10]
In [93]:
# Don't think I need this anymore but afraid to get rid of it
# feat_vect = [s[0].split('_')[1] + ': ' + s[1] for s in top20_df['feature'].str.split('__')]
In [91]:
top10_df = pd.DataFrame(sort_weights, columns=['feature', 'coefficient'])
feat_vect = [s[0].split('_')[1] + ': ' + s[1] for s in top10_df['feature'].str.split('__')]
top10_df.ix[:, 'feature'] = feat_vect
top10_df.set_index('feature', inplace=True)
In [101]:
top10_df.sort_values(by='coefficient').plot(kind='barh', legend=None, figsize=(8,6))
plt.ylabel("Feature", size=25)
plt.xlabel("Coefficient", size=25)
plt.xticks(size=25)
plt.yticks(size=25)
plt.title("Coefficient Weights for Intellectual Property", size=25)
Out[101]:
In [ ]:
svc_model = pickle.load(open('/Users/Joel/Desktop/Insight/bill_taxonomy/models/tfidf_models2/model_health_svc.p'))
In [ ]:
svc_model.best_score_
In [ ]:
X_svc = make_x_values(us_bills)
In [ ]:
len(X_svc)
In [ ]:
sql_str = """
SELECT bill_num, subject FROM bill_subject
WHERE subject='Health'
"""
In [ ]:
sub_bills = pd.read_sql_query(sql_str, con)
In [ ]:
y = make_y_values(us_bills, sub_bills, 'Health' )
In [ ]:
svc_model.best_estimator_.predict
In [ ]:
svc_model = pickle.load(open('/Users/Joel/Desktop/Insight/bill_taxonomy/models/tfidf_models2/model_health_svc.p'))
In [ ]:
nb_model = svc_model
In [299]:
get_time_stamp()
Out[299]:
In [62]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer.lemmatize('striking')
Out[62]: