In [7]:
%load_ext watermark
%watermark -a "Joel Piper" -d -t -v -p numpy,pandas,nltk,sklearn,gensim -g
In [ ]:
import psycopg2
con = psycopg2.connect(dbname='bills_db', user='Joel')
In [ ]:
import pandas as pd
# query:
sql_query = """
SELECT *
FROM us_bills
LIMIT 1000;
"""
us_bills = pd.read_sql_query(sql_query, con)
In [ ]:
# query:
sql_query = """
SELECT *
FROM bill_subject
WHERE bill_num IN ('{0}');
"""
revised = sql_query.format("','".join(us_bills['bill_num']))
subjects = pd.read_sql_query(revised, con)
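The formatted IN clause above works because the bill numbers come from our own table, but the same query can be written with the driver binding the values, which sidesteps quoting mistakes. A minimal alternative sketch (psycopg2 expands a Python tuple into a SQL value list):
In [ ]:
# Equivalent to the cell above, with psycopg2 binding the bill numbers.
sql_query = """
SELECT *
FROM bill_subject
WHERE bill_num IN %s;
"""
subjects = pd.read_sql_query(sql_query, con, params=(tuple(us_bills['bill_num']),))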
In [ ]:
bill_subset = us_bills[['bill_name', 'bill_text']].iloc[:1000]
bill_tuples = [tuple(x) for x in bill_subset.values]
In [ ]:
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Use a lemmatizer rather than just a stemmer, so tokens stay real words
# (e.g. 'committees' -> 'committee').
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens, lemma):
    lemmatized = []
    for item in tokens:
        lemmatized.append(lemma.lemmatize(item))
    return lemmatized

def tokenize(text):
    # Strip punctuation and digits, then tokenize and lemmatize.
    text = "".join([ch for ch in text if ch not in string.punctuation])
    text = "".join([ch for ch in text if ch not in string.digits])
    tokens = word_tokenize(text)
    lemmas = lemmatize_tokens(tokens, wordnet_lemmatizer)
    return lemmas

def my_preproc_text(bill_tuple):
    # (bill_name, bill_text) -> lower-cased text, keeping only words longer
    # than three characters.
    text = bill_tuple[1].lower()
    revised = " ".join([t for t in text.split() if len(t) > 3])
    return revised

def my_preproc_title(bill_tuple):
    # Same preprocessing, applied to the bill title.
    title = bill_tuple[0].lower()
    revised = " ".join([t for t in title.split() if len(t) > 3])
    return revised
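word_tokenize and WordNetLemmatizer rely on NLTK corpora that ship separately; a quick sanity check of the tokenizer, assuming the punkt and wordnet data are installed:
In [ ]:
# One-time setup if the corpora are missing:
# import nltk; nltk.download('punkt'); nltk.download('wordnet')
tokenize("committees amending several sections of the act")
# plural nouns come back lemmatized, e.g. 'committees' -> 'committee'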
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# token_pattern is ignored when a custom tokenizer is supplied, so the
# vectorizers rely entirely on tokenize() plus the preprocessors above.
tf_text = CountVectorizer(stop_words='english', tokenizer=tokenize,
                          preprocessor=my_preproc_text, ngram_range=(1, 2),
                          min_df=10, max_df=0.4)
In [ ]:
tf_title = CountVectorizer(stop_words='english', tokenizer=tokenize,
                           preprocessor=my_preproc_title, ngram_range=(1, 3),
                           min_df=10, max_df=0.4)
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_text = TfidfVectorizer(stop_words='english', tokenizer=tokenize,
                             preprocessor=my_preproc_text, ngram_range=(1, 2),
                             min_df=10, max_df=0.4)
tfidf_title = TfidfVectorizer(stop_words='english', tokenizer=tokenize,
                              preprocessor=my_preproc_title, ngram_range=(1, 3),
                              min_df=10, max_df=0.4)
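Before wiring the vectorizers into pipelines, it can help to fit one on a handful of bills to confirm the tuple-based preprocessing works end to end. A sketch with min_df relaxed, since ten documents cannot satisfy min_df=10:
In [ ]:
# Sanity check: fit an unfitted copy of the text vectorizer on ten bills
# and inspect the document-term matrix shape.
from sklearn.base import clone
tf_check = clone(tf_text).set_params(min_df=1)
dtm = tf_check.fit_transform(bill_tuples[:10])
dtm.shape  # (n_documents, n_terms)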
In [ ]:
from sklearn.decomposition import LatentDirichletAllocation

# Note: this notebook targets the pre-0.18 scikit-learn API; in newer
# releases n_topics is called n_components.
lda_text = LatentDirichletAllocation(n_topics=100, max_iter=5, learning_method='online',
                                     learning_offset=50., random_state=0)
lda_title = LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online',
                                      learning_offset=50., random_state=0)
In [ ]:
def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted terms for each topic.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

# Note: tf_title and lda_title must be fitted first (they are fitted inside
# the pipelines/grid search below); otherwise this cell raises NotFittedError.
n_top_words = 20
tf_feature_names = tf_title.get_feature_names()
print_top_words(lda_title, tf_feature_names, n_top_words)
In [ ]:
health_bills = subjects[subjects['subject'] == 'Health']
In [ ]:
us_bills['health'] = 0
In [ ]:
us_bills.loc[us_bills['bill_num'].isin(health_bills['bill_num']), 'health'] = 1
In [ ]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in >= 0.18
In [ ]:
lda_text_model = Pipeline(steps=[('tf_text', tf_text), ('lda_text', lda_text)])
lda_title_model = Pipeline(steps=[('tf_title', tf_title), ('lda_title', lda_title)])
In [ ]:
combined_features = FeatureUnion([("tfidf_text", tfidf_text), ("lda_text_model", lda_text_model),
                                  ("tfidf_title", tfidf_title), ("lda_title_model", lda_title_model)])
In [ ]:
from sklearn.linear_model import LogisticRegression
# L1 penalty encourages sparse coefficients; C is tuned in the grid search below.
logistic = LogisticRegression(C=1e9, penalty='l1')
In [ ]:
pipe = Pipeline(steps=[('features', combined_features), ('logistic', logistic)])
In [ ]:
param_grid = dict(features__lda_text_model__lda_text__n_topics=[100],
                  features__lda_title_model__lda_title__n_topics=[10],
                  features__tfidf_text__max_features=[None, 100],
                  features__tfidf_title__max_features=[None],
                  logistic__C=[0.1, 1, 10, 1e9])
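Each key walks the nesting with double underscores: features__lda_text_model__lda_text__n_topics addresses n_topics on the lda_text step, inside the lda_text_model pipeline, inside the features union. The full set of valid names can be listed from the pipeline itself:
In [ ]:
# Every tunable name usable as a param_grid key:
sorted(pipe.get_params().keys())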
In [ ]:
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='roc_auc', verbose=10)
In [ ]:
grid_search.fit(bill_tuples, us_bills['health'])
In [ ]:
grid_search.best_score_
In [ ]:
grid_search.best_params_
In [ ]:
grid_search.best_estimator_
In [ ]:
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in >= 0.18
from sklearn import metrics
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
fig_dir = '/Users/Joel/Desktop/Insight/data/'

def make_roc_curve(pipeline, X, y, train_frac, subject, fig_dir):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_frac,
                                                        random_state=1, stratify=y)
    grid_search = GridSearchCV(pipeline, {}, scoring='roc_auc', verbose=10)
    grid_search.fit(X_train, y_train)
    y_pred_class = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    print(metrics.accuracy_score(y_test, y_pred_class))
    print(metrics.confusion_matrix(y_test, y_pred_class))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title(subject + '\nReceiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    # Save before show(); otherwise the written file is a blank figure.
    plt.savefig(fig_dir + 'roc_curve_' + subject.lower() + '.png')
    plt.show()
    results_save = (grid_search, X_test, y_test)
    pickle.dump(results_save, open(fig_dir + 'plot_info_nb.p', 'wb'))
In [ ]:
# Reload the results pickled by make_roc_curve above.
grid_search, X_test, y_test = pickle.load(open(fig_dir + 'plot_info_nb.p', 'rb'))
In [ ]:
# Note: pipe_nb is defined in the MultinomialNB cell below; run that cell first.
make_roc_curve(pipe_nb, bill_tuples, us_bills['health'], 0.9, 'Health', fig_dir)
In [ ]:
from sklearn.svm import SVC
svc = SVC()
pipe_svc = Pipeline(steps=[('features', combined_features), ('svc', svc)])
param_grid_svc = dict(features__lda_text_model__lda_text__n_topics=[100],
                      features__lda_title_model__lda_title__n_topics=[10],
                      features__tfidf_text__max_features=[None],
                      features__tfidf_title__max_features=[None],
                      svc__C=[1])
grid_search_svc = GridSearchCV(pipe_svc, param_grid=param_grid_svc, scoring='roc_auc', verbose=10)
grid_search_svc.fit(bill_tuples, us_bills['health'])
In [ ]:
grid_search_svc.best_score_
In [ ]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
pipe_nb = Pipeline(steps=[('features', combined_features), ('nb', nb)])
param_grid_nb = dict(features__lda_text_model__lda_text__n_topics=[100],
                     features__lda_title_model__lda_title__n_topics=[10],
                     features__tfidf_text__max_features=[None],
                     features__tfidf_title__max_features=[None],
                     nb__alpha=[1])
grid_search_nb = GridSearchCV(pipe_nb, param_grid=param_grid_nb, scoring='roc_auc', verbose=10)
grid_search_nb.fit(bill_tuples, us_bills['health'])
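As with the SVC run, the cross-validated AUC for Naive Bayes is available on the fitted search object:
In [ ]:
grid_search_nb.best_score_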