Setup


In [7]:
%load_ext watermark
%watermark -a "Joel Piper" -d -t -v -p numpy,pandas,nltk,sklearn,gensim -g


The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Joel Piper 2016-09-17 20:10:01 

CPython 2.7.12
IPython 4.2.0

numpy 1.11.1
pandas 0.18.1
nltk 3.2.1
sklearn 0.17.1
gensim 0.13.2
Git hash: 2e718645ec0e62dd529a4b2784c93c884eff7694

Tokenize and Lemmatize the Word List in the Count Vectorizer


In [ ]:
import psycopg2
# Connect to the local Postgres database of bills.
con = psycopg2.connect(dbname='bills_db', user='Joel')

Pull the first 1,000 US bills from the database


In [ ]:
import pandas as pd
# query:
sql_query = """
SELECT *
FROM us_bills
LIMIT 1000;
"""
us_bills = pd.read_sql_query(sql_query, con)

Now get the subjects for those 1,000 bills


In [ ]:
# query: pull the subjects for the bills fetched above
sql_query = """
SELECT *
FROM bill_subject
WHERE bill_num IN ('{0}');
"""

revised = sql_query.format("','".join(us_bills['bill_num']))
subjects = pd.read_sql_query(revised, con)
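
As a quick sanity check, peek at the most common subject labels; 'Health', which becomes the target class below, should be among them.


In [ ]:
# Most frequent subjects among the bills pulled above.
subjects['subject'].value_counts().head(10)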

In [ ]:
# Keep (bill_name, bill_text) tuples; each vectorizer's preprocessor below
# selects the field it needs.
bill_subset = us_bills[['bill_name', 'bill_text']]
bill_tuples = [tuple(x) for x in bill_subset.values]

In [ ]:
import string
from nltk import word_tokenize
#from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
#######
# Use a lemmatizer rather than just a stemmer, so tokens stay real words:
#stemmer = PorterStemmer()
#def stem_tokens(tokens, stemmer):
#    return [stemmer.stem(item) for item in tokens]
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens, lemma):
    return [lemma.lemmatize(item) for item in tokens]

def tokenize(text):
    # Strip punctuation and digits, tokenize, then lemmatize.
    text = "".join([ch for ch in text if ch not in string.punctuation])
    text = "".join([ch for ch in text if ch not in string.digits])
    tokens = word_tokenize(text)
    return lemmatize_tokens(tokens, wordnet_lemmatizer)

def my_preproc_text(bill_tuple):
    # bill_tuple is (bill_name, bill_text): lowercase the text and keep
    # only words longer than three characters.
    text = bill_tuple[1].lower()
    return " ".join([t for t in text.split() if len(t) > 3])

def my_preproc_title(bill_tuple):
    # Same preprocessing, applied to the title field.
    title = bill_tuple[0].lower()
    return " ".join([t for t in title.split() if len(t) > 3])
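
A quick illustration of what tokenize() does on a made-up snippet (assumes the NLTK 'punkt' and 'wordnet' data have been downloaded; in the pipeline the text reaches tokenize() already lowercased by the preprocessor):


In [ ]:
# Punctuation and digits are stripped before tokenizing; lemmatization then
# maps 'titles' -> 'title' and 'states' -> 'state'.
print(tokenize('amends titles of the united states code, sec. 101.'))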

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# token_pattern is ignored whenever a custom tokenizer is supplied, so it is
# dropped here; the 'english' stop-word filter still runs on the tokens.
tf_text = CountVectorizer(stop_words='english', tokenizer=tokenize,
                          preprocessor=my_preproc_text, ngram_range=(1, 2),
                          min_df=10, max_df=0.4)

In [ ]:
tf_title = CountVectorizer(stop_words='english', tokenizer=tokenize,
                           preprocessor=my_preproc_title, ngram_range=(1, 3),
                           min_df=10, max_df=0.4)

Create the TF/IDF and LDA Vectors


In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_text = TfidfVectorizer(stop_words='english', tokenizer=tokenize,
                             preprocessor=my_preproc_text, ngram_range=(1, 2),
                             min_df=10, max_df=0.4)
tfidf_title = TfidfVectorizer(stop_words='english', tokenizer=tokenize,
                              preprocessor=my_preproc_title, ngram_range=(1, 3),
                              min_df=10, max_df=0.4)

In [ ]:
from sklearn.decomposition import LatentDirichletAllocation
# n_topics is the parameter name in this scikit-learn release (0.17.x);
# newer versions call it n_components.
lda_text = LatentDirichletAllocation(n_topics=100, max_iter=5, learning_method='online',
                                     learning_offset=50., random_state=0)
lda_title = LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online',
                                      learning_offset=50., random_state=0)
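
To inspect the title topics directly, fit the title vectorizer and LDA standalone; the grid search below clones its estimators, so its fits would not populate these objects:


In [ ]:
# Fit outside the grid search so the topic/term matrices are inspectable
# in the next cell.
title_counts = tf_title.fit_transform(bill_tuples)
lda_title.fit(title_counts)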

In [ ]:
from __future__ import print_function

def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weight terms for each topic.
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

# Uses the tf_title / lda_title fits from the previous cell.
n_top_words = 20
tf_feature_names = tf_title.get_feature_names()
print_top_words(lda_title, tf_feature_names, n_top_words)

Create the logistic regression model using GridSearchCV from scikit-learn


In [ ]:
health_bills = subjects[subjects['subject'] == 'Health']

In [ ]:
us_bills['health'] = 0

In [ ]:
us_bills.loc[us_bills['bill_num'].isin(health_bills['bill_num']), 'health'] = 1
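
Check the resulting class balance; with only 1,000 bills the Health class is likely imbalanced, which is one reason ROC AUC is used for scoring below.


In [ ]:
# Counts of non-health (0) vs. health (1) bills.
us_bills['health'].value_counts()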

In [ ]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV

In [ ]:
lda_text_model = Pipeline(steps=[('tf_text', tf_text), ('lda_text', lda_text)])
lda_title_model = Pipeline(steps=[('tf_title', tf_title), ('lda_title', lda_title)])

In [ ]:
combined_features = FeatureUnion([("tfidf_text", tfidf_text), ("lda_text_model", lda_text_model), 
                                  ("tfidf_title", tfidf_title), ("lda_title_model", lda_title_model)])

In [ ]:
from sklearn.linear_model import LogisticRegression
# C=1e9 is effectively unregularized; the grid below sweeps logistic__C anyway.
logistic = LogisticRegression(C=1e9, penalty='l1')

In [ ]:
pipe = Pipeline(steps=[('features', combined_features), ('logistic', logistic)])

In [ ]:
param_grid = dict(features__lda_text_model__lda_text__n_topics=[100],
                  features__lda_title_model__lda_title__n_topics=[10],
                  features__tfidf_text__max_features=[None, 100],
                  features__tfidf_title__max_features=[None],
                  logistic__C=[0.1, 1, 10, 1e9])

In [ ]:
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='roc_auc', verbose=10)

In [ ]:
grid_search.fit(bill_tuples, us_bills['health'])

In [ ]:
grid_search.best_score_

In [ ]:
grid_search.best_params_

In [ ]:
grid_search.best_estimator_
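
The per-combination results are worth a look as well; in this scikit-learn release they live in grid_scores_ (replaced by cv_results_ in later versions):


In [ ]:
# Mean ROC AUC for each parameter combination tried.
for score in grid_search.grid_scores_:
    print(score)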

For visualization and reporting purposes only, hold out 10% of the data and plot an ROC curve


In [ ]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
fig_dir = '/Users/Joel/Desktop/Insight/data/'

def make_roc_curve(pipeline, X, y, train_frac, subject, fig_dir):
    # Stratified hold-out split so the test set keeps the class balance.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_frac, random_state=1, stratify=y)
    # Empty param grid: GridSearchCV is used here only to refit the pipeline
    # with cross-validated ROC AUC scoring.
    grid_search = GridSearchCV(pipeline, {}, scoring='roc_auc', verbose=10)
    grid_search.fit(X_train, y_train)
    y_pred_class = grid_search.predict(X_test)
    y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
    print(metrics.accuracy_score(y_test, y_pred_class))
    print(metrics.confusion_matrix(y_test, y_pred_class))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
    roc_auc = metrics.auc(fpr, tpr)

    plt.title(subject + '\nReceiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    # Save before show(): the inline backend clears the figure on show(),
    # so saving afterwards would write an empty image.
    plt.savefig(fig_dir + '/roc_curve_' + subject.lower() + '.png')
    plt.show()
    results_save = (grid_search, X_test, y_test)
    pickle.dump(results_save, open(fig_dir + '/plot_info_nb.p', 'wb'))

In [ ]:
# Reload previously pickled hold-out results for re-plotting.
grid_search, X_test, y_test = pickle.load(open(fig_dir + '/plot_info.p', 'rb'))

In [ ]:
# pipe_nb is defined in the "Test Naive Bayes" section below; run that cell
# before this one.
make_roc_curve(pipe_nb, bill_tuples, us_bills['health'], 0.9, 'Health', fig_dir)

Test SVC


In [ ]:
from sklearn.svm import SVC
# The roc_auc scorer falls back to SVC's decision_function, so
# probability=True is not required here.
svc = SVC()
pipe_svc = Pipeline(steps=[('features', combined_features), ('svc', svc)])
param_grid_svc = dict(features__lda_text_model__lda_text__n_topics=[100],
                      features__lda_title_model__lda_title__n_topics=[10],
                      features__tfidf_text__max_features=[None],
                      features__tfidf_title__max_features=[None],
                      svc__C=[1])
grid_search_svc = GridSearchCV(pipe_svc, param_grid=param_grid_svc, scoring='roc_auc', verbose=10)
grid_search_svc.fit(bill_tuples, us_bills['health'])

In [ ]:
grid_search_svc.best_score_

Test Naive Bayes


In [ ]:
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB needs non-negative features; both the tf-idf weights and the
# LDA topic proportions satisfy this.
nb = MultinomialNB()
pipe_nb = Pipeline(steps=[('features', combined_features), ('nb', nb)])
param_grid_nb = dict(features__lda_text_model__lda_text__n_topics=[100],
                     features__lda_title_model__lda_title__n_topics=[10],
                     features__tfidf_text__max_features=[None],
                     features__tfidf_title__max_features=[None],
                     nb__alpha=[1])
grid_search_nb = GridSearchCV(pipe_nb, param_grid=param_grid_nb, scoring='roc_auc', verbose=10)
grid_search_nb.fit(bill_tuples, us_bills['health'])
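
For parity with the SVC run, report the cross-validated score:


In [ ]:
grid_search_nb.best_score_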