Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] features__lda_text__lda_tf_text__min_df=10, model__C=10, features__tfidf_title__max_df=0.4, features__tfidf_title__ngram_range=(1, 3), features__tfidf_title__min_df=10, features__tfidf_text__max_features=None, features__tfidf_text__ngram_range=(1, 2), model__penalty=l1, features__lda_text__lda_model_text__n_topics=100, features__lda_text__lda_tf_text__ngram_range=(1, 1), features__tfidf_title__max_features=None, features__lda_text__lda_tf_text__max_df=0.4, features__tfidf_text__min_df=10, features__tfidf_text__max_df=0.4
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-23-86f06729fcfd> in <module>()
29 for sub in subject_list:
30 y = make_y_values(us_bills, subjects, sub)
---> 31 fit_mod = run_model(model, X, y, sub, cfg)
32 results.append(fit_mod)
33
/Users/Joel/Desktop/Insight/bill_taxonomy/src/analyze/run_model.py in run_model(model, X, y, sub, cfg)
41 def run_model(model, X, y, sub, cfg):
42
---> 43 model.fit(X, y)
44
45 if cfg['save_model']:
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit(self, X, y, **fit_params)
162 the pipeline.
163 """
--> 164 Xt, fit_params = self._pre_transform(X, y, **fit_params)
165 self.steps[-1][-1].fit(Xt, y, **fit_params)
166 return self
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/pipeline.pyc in _pre_transform(self, X, y, **fit_params)
143 for name, transform in self.steps[:-1]:
144 if hasattr(transform, "fit_transform"):
--> 145 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
146 else:
147 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params)
495 delayed(_fit_transform_one)(trans, name, X, y,
496 self.transformer_weights, **fit_params)
--> 497 for name, trans in self.transformer_list)
498
499 Xs, transformers = zip(*result)
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/pipeline.pyc in _fit_transform_one(transformer, name, X, y, transformer_weights, **fit_params)
411 return X_transformed * transformer_weights[name], transformer
412 if hasattr(transformer, 'fit_transform'):
--> 413 X_transformed = transformer.fit_transform(X, y, **fit_params)
414 return X_transformed, transformer
415 else:
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
1303 Tf-idf-weighted document-term matrix.
1304 """
-> 1305 X = super(TfidfVectorizer, self).fit_transform(raw_documents)
1306 self._tfidf.fit(X)
1307 # X is already a transformed view of raw_documents so
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
815
816 vocabulary, X = self._count_vocab(raw_documents,
--> 817 self.fixed_vocabulary_)
818
819 if self.binary:
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
750 indptr.append(0)
751 for doc in raw_documents:
--> 752 for feature in analyze(doc):
753 try:
754 j_indices.append(vocabulary[feature])
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in <lambda>(doc)
236
237 return lambda doc: self._word_ngrams(
--> 238 tokenize(preprocess(self.decode(doc))), stop_words)
239
240 else:
/Users/Joel/Desktop/Insight/bill_taxonomy/src/wrangle/create_features.pyc in tokenize(text)
23 text = "".join([ch for ch in text if ch not in string.digits])
24 tokens = word_tokenize(text)
---> 25 lemmas = lemmatize_tokens(tokens, wordnet_lemmatizer)
26 return lemmas
27
/Users/Joel/Desktop/Insight/bill_taxonomy/src/wrangle/create_features.pyc in lemmatize_tokens(tokens, lemma)
15 lemmatized = []
16 for item in tokens:
---> 17 lemmatized.append(lemma.lemmatize(item))
18 return lemmatized
19
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/stem/wordnet.pyc in lemmatize(self, word, pos)
38
39 def lemmatize(self, word, pos=NOUN):
---> 40 lemmas = wordnet._morphy(word, pos)
41 return min(lemmas, key=len) if lemmas else word
42
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.pyc in _morphy(self, form, pos)
1710
1711 # 1. Apply rules once to the input to get y1, y2, y3, etc.
-> 1712 forms = apply_rules([form])
1713
1714 # 2. Return all that are in the database (and check the original too)
/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/nltk/corpus/reader/wordnet.pyc in apply_rules(forms)
1692 for form in forms
1693 for old, new in substitutions
-> 1694 if form.endswith(old)]
1695
1696 def filter_forms(forms):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 24: ordinal not in range(128)