In [1]:
%load_ext watermark
%watermark -a "Joel Piper" -d -t -v -p numpy,pandas,nltk,sklearn,gensim -g


Joel Piper 2016-09-19 18:05:12 

CPython 2.7.12
IPython 4.2.0

numpy 1.11.1
pandas 0.18.1
nltk 3.2.1
sklearn 0.17.1
gensim 0.13.2
Git hash: 2e718645ec0e62dd529a4b2784c93c884eff7694

In [3]:
import os
import yaml
import sys

os.chdir('..')

In [4]:
PROJ_ROOT = "/Users/Joel/Desktop/Insight/bill_taxonomy/"
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)

In [5]:
%load_ext autoreload
%autoreload 1

In [7]:
%aimport src.ingest.get_bills
%aimport src.wrangle.create_features
%aimport src.analyze.run_model
%aimport src.report.store_db

In [23]:
from src.ingest.get_bills import get_us_bills
from src.ingest.get_bills import get_ny_bills
from src.ingest.get_bills import get_subjects
from src.wrangle.create_features import make_feat_union
from src.analyze.run_model import create_model
from src.analyze.run_model import run_model
from src.wrangle.create_features import make_x_values
from src.wrangle.create_features import make_y_values
from src.analyze.run_model import get_y_probs
from src.report.store_db import store_us_db
from src.report.store_db import store_ny_db

with open("configs.yml", 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)

        dbname = cfg['dbname']
        username = cfg['username']
        us_bills_subset = cfg['us_bills_subset']
        subject_list = cfg['subjects']
        pipe_feats = cfg['pipe_feats']
        model_type = cfg['model_type']
        us_bills = get_us_bills(dbname, username, us_bills_subset)
        subjects = get_subjects(dbname, username, subject_list)
        X = make_x_values(us_bills)
        feat_un, feat_params = make_feat_union(pipe_feats, cfg)
        model = create_model(feat_un, model_type, feat_params, cfg)

        results = []
        for sub in subject_list:
            y = make_y_values(us_bills, subjects, sub)
            fit_mod = run_model(model, X, y, sub, cfg)
            results.append(fit_mod)

            if cfg['store_us']:
                y_probs_us = get_y_probs(fit_mod, X)
                store_us_db(dbname, username, us_bills, sub, y_probs_us, y, cfg)

            if cfg['store_ny']:
                ny_bills_subset = cfg['ny_bills_subset']
                ny_bills = get_ny_bills(dbname, username, ny_bills_subset)
                X_ny = make_x_values(ny_bills)
                y_probs_ny = get_y_probs(fit_mod, X_ny)
                store_ny_db(dbname, username, ny_bills, sub, y_probs_ny, cfg)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] features__lda_text__lda_tf_text__min_df=10, model__C=10, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__lda_text__lda_model_text__n_topics=100, features__lda_text__lda_tf_text__ngram_range=(1, 1), features__tfidf_title__max_features=None, features__lda_text__lda_tf_text__max_df=0.4, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4 
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-23-86f06729fcfd> in <module>()
     29         for sub in subject_list:
     30             y = make_y_values(us_bills, subjects, sub)
---> 31             fit_mod = run_model(model, X, y, sub, cfg)
     32             results.append(fit_mod)
     33 

/Users/Joel/Desktop/Insight/bill_taxonomy/src/analyze/run_model.py in run_model(model, X, y, sub, cfg)
     41 def run_model(model, X, y, sub, cfg):
     42 
---> 43     model.fit(X, y)
     44 
     45     if cfg['save_model']:

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
    805 
    806 

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
    554                 for train, test in cv)
    555 

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    798             # was dispatched. In particular this covers the edge
    799             # case of Parallel used with an exhausted iterator.
--> 800             while self.dispatch_one_batch(iterator):
    801                 self._iterating = True
    802             else:

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    656                 return False
    657             else:
--> 658                 self._dispatch(tasks)
    659                 return True
    660 

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    564 
    565         if self._pool is None:
--> 566             job = ImmediateComputeBatch(batch)
    567             self._jobs.append(job)
    568             self.n_dispatched_batches += 1

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
    178         # Don't delay the application, to avoid keeping the input
    179         # arguments in memory
--> 180         self.results = batch()
    181 
    182     def get(self):

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
   1529             estimator.fit(X_train, **fit_params)
   1530         else:
-> 1531             estimator.fit(X_train, y_train, **fit_params)
   1532 
   1533     except Exception as e:

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
    162             the pipeline.
    163         """
--> 164         Xt, fit_params = self._pre_transform(X, y, **fit_params)
    165         self.steps[-1][-1].fit(Xt, y, **fit_params)
    166         return self

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/pipeline.pyc in _pre_transform(self, X, y, **fit_params)
    143         for name, transform in self.steps[:-1]:
    144             if hasattr(transform, "fit_transform"):
--> 145                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    146             else:
    147                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
    495             delayed(_fit_transform_one)(trans, name, X, y,
    496                                         self.transformer_weights, **fit_params)
--> 497             for name, trans in self.transformer_list)
    498 
    499         Xs, transformers = zip(*result)

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    798             # was dispatched. In particular this covers the edge
    799             # case of Parallel used with an exhausted iterator.
--> 800             while self.dispatch_one_batch(iterator):
    801                 self._iterating = True
    802             else:

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    656                 return False
    657             else:
--> 658                 self._dispatch(tasks)
    659                 return True
    660 

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    564 
    565         if self._pool is None:
--> 566             job = ImmediateComputeBatch(batch)
    567             self._jobs.append(job)
    568             self.n_dispatched_batches += 1

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
    178         # Don't delay the application, to avoid keeping the input
    179         # arguments in memory
--> 180         self.results = batch()
    181 
    182     def get(self):

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/pipeline.pyc in _fit_transform_one(transformer, name, X, y, transformer_weights, **fit_params)
    411             return X_transformed * transformer_weights[name], transformer
    412     if hasattr(transformer, 'fit_transform'):
--> 413         X_transformed = transformer.fit_transform(X, y, **fit_params)
    414         return X_transformed, transformer
    415     else:

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
   1303             Tf-idf-weighted document-term matrix.
   1304         """
-> 1305         X = super(TfidfVectorizer, self).fit_transform(raw_documents)
   1306         self._tfidf.fit(X)
   1307         # X is already a transformed view of raw_documents so

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
    815 
    816         vocabulary, X = self._count_vocab(raw_documents,
--> 817                                           self.fixed_vocabulary_)
    818 
    819         if self.binary:

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
    750         indptr.append(0)
    751         for doc in raw_documents:
--> 752             for feature in analyze(doc):
    753                 try:
    754                     j_indices.append(vocabulary[feature])

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in <lambda>(doc)
    236 
    237             return lambda doc: self._word_ngrams(
--> 238                 tokenize(preprocess(self.decode(doc))), stop_words)
    239 
    240         else:

/Users/Joel/Desktop/Insight/bill_taxonomy/src/wrangle/create_features.pyc in tokenize(text)
     23     text = "".join([ch for ch in text if ch not in string.digits])
     24     tokens = word_tokenize(text)
---> 25     lemmas = lemmatize_tokens(tokens, wordnet_lemmatizer)
     26     return lemmas
     27 

/Users/Joel/Desktop/Insight/bill_taxonomy/src/wrangle/create_features.pyc in lemmatize_tokens(tokens, lemma)
     15     lemmatized = []
     16     for item in tokens:
---> 17         lemmatized.append(lemma.lemmatize(item))
     18     return lemmatized
     19 

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/stem/wordnet.pyc in lemmatize(self, word, pos)
     38 
     39     def lemmatize(self, word, pos=NOUN):
---> 40         lemmas = wordnet._morphy(word, pos)
     41         return min(lemmas, key=len) if lemmas else word
     42 

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.pyc in _morphy(self, form, pos)
   1710 
   1711         # 1. Apply rules once to the input to get y1, y2, y3, etc.
-> 1712         forms = apply_rules([form])
   1713 
   1714         # 2. Return all that are in the database (and check the original too)

/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.pyc in apply_rules(forms)
   1692                     for form in forms
   1693                     for old, new in substitutions
-> 1694                     if form.endswith(old)]
   1695 
   1696         def filter_forms(forms):

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 24: ordinal not in range(128)

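The failing frame is `form.endswith(old)` inside NLTK's WordNet morphy rules, and the message names Python 2's default ascii codec: somewhere a byte string containing non-ASCII bytes is being implicitly coerced to unicode. A hypothetical minimal reproduction of that failure mode (not output from this run):

In [ ]:
# Hypothetical repro: under Python 2, calling .endswith() with a unicode
# argument on a non-ASCII byte string forces an implicit ascii decode.
word = 'caf\xc3\xa9'  # UTF-8 bytes, type str
try:
    word.endswith(u'e')
except UnicodeDecodeError as e:
    print(e)  # 'ascii' codec can't decode byte 0xc3 in position 3 ...

%debug lets us walk up the stack to find the actual offending string:
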
In [ ]:
%debug


> /Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.py(1694)apply_rules()
   1692                     for form in forms
   1693                     for old, new in substitutions
-> 1694                     if form.endswith(old)]
   1695 
   1696         def filter_forms(forms):

ipdb> u
> /Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.py(1712)_morphy()
   1710 
   1711         # 1. Apply rules once to the input to get y1, y2, y3, etc.
-> 1712         forms = apply_rules([form])
   1713 
   1714         # 2. Return all that are in the database (and check the original too)

ipdb> u
> /Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/stem/wordnet.py(40)lemmatize()
     38 
     39     def lemmatize(self, word, pos=NOUN):
---> 40         lemmas = wordnet._morphy(word, pos)
     41         return min(lemmas, key=len) if lemmas else word
     42 

ipdb> u
> /Users/Joel/Desktop/Insight/bill_taxonomy/src/wrangle/create_features.py(17)lemmatize_tokens()
     15     lemmatized = []
     16     for item in tokens:
---> 17         lemmatized.append(lemma.lemmatize(item))
     18     return lemmatized
     19 

ipdb> item
'methylenedioxymethamphet\xc2\xadamine'
ipdb> ll
     14 def lemmatize_tokens(tokens, lemma):
     15     lemmatized = []
     16     for item in tokens:
---> 17         lemmatized.append(lemma.lemmatize(item))
     18     return lemmatized
     19 


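ipdb shows the offending token is a plain byte string containing `\xc2\xad`, the UTF-8 encoding of U+00AD (a soft hyphen, presumably left over from the bill's typesetting). When the lemmatizer compares it against unicode suffixes, the implicit ascii decode blows up. One possible fix, sketched below (an assumption, not the project's actual patch), is to make the tokenizer in src/wrangle/create_features.py decode defensively and strip soft hyphens before anything reaches NLTK:

In [ ]:
# Sketch of a defensive tokenizer for src/wrangle/create_features.py
# (an assumption, not the project's actual patch).
import string

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens, lemma):
    # Unchanged from the module (see the traceback above).
    lemmatized = []
    for item in tokens:
        lemmatized.append(lemma.lemmatize(item))
    return lemmatized

def tokenize(text):
    if isinstance(text, str):                  # Python 2 byte string
        text = text.decode('utf-8', 'ignore')  # normalize to unicode up front
    text = text.replace(u'\xad', u'')          # drop soft hyphens (U+00AD)
    text = u''.join(ch for ch in text if ch not in string.digits)
    tokens = word_tokenize(text)
    return lemmatize_tokens(tokens, wordnet_lemmatizer)

Since sklearn's vectorizers call self.decode(doc) before the tokenizer (visible in the traceback), another option is to guarantee that make_x_values returns unicode documents, so no byte strings survive to the NLTK layer.
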
In [ ]:
import psycopg2
import pandas as pd

# Pull a small subset of NY bills to inspect their raw text:
con = psycopg2.connect(database='bills_db', user='Joel')
ny_subset = 10
sql_str = """
SELECT bill_num, bill_name, bill_text FROM ny_bills
LIMIT {0}
"""
sql_query = sql_str.format(ny_subset)
ny_bills = pd.read_sql_query(sql_query, con)
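
As an aside, the LIMIT value can be bound as a query parameter instead of going through str.format, which is the safer habit for anything user-supplied (a sketch, not a fix for the unicode bug):

In [ ]:
# Same query using psycopg2-style parameter binding via pandas.
sql_str = """
SELECT bill_num, bill_name, bill_text FROM ny_bills
LIMIT %(n)s
"""
ny_bills = pd.read_sql_query(sql_str, con, params={'n': ny_subset})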

In [ ]:
ny_bills.loc[0, 'bill_name'].lower()

In [ ]:
bill_cols = ny_bills[['bill_name', 'bill_text']]
test_tuple = [tuple(x) for x in bill_cols.values]

In [ ]:
test_tuple[0][0].lower()

In [ ]:
us_bill_cols = us_bills[['bill_name', 'bill_text']]
us_test_tuple = [tuple(x) for x in us_bill_cols.values]

In [ ]:
us_test_tuple[0][0]
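
These spot checks only touch one row at a time. To find every document carrying stray multi-byte sequences, a quick scan of the corpus helps (a hypothetical helper, assuming the text columns hold Python 2 byte strings):

In [ ]:
# Hypothetical helper: indices of rows whose text contains non-ASCII bytes.
def non_ascii_rows(df, col='bill_text'):
    return [i for i, text in enumerate(df[col])
            if any(ord(ch) > 127 for ch in text)]

non_ascii_rows(us_bills)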