In [16]:
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import OneHotEncoder, scale, MinMaxScaler, binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA  # RandomizedPCA was removed; PCA(svd_solver='randomized') replaces it
from sklearn import svm
from sklearn.neural_network import BernoulliRBM
from sklearn.model_selection import GridSearchCV, ParameterGrid  # was sklearn.grid_search in older scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
In [2]:
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization
Out[2]:
In [3]:
dataset = nltk.corpus.reuters
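Before splitting, it helps to see what the corpus API returns; a small inspection cell (output omitted here):
In [ ]:
# A quick look at the corpus: fileids carry the ModApte train/test
# prefix, and categories() with a fileid argument returns that
# document's topic labels.
print(len(dataset.fileids()), "documents")
print(len(dataset.categories()), "distinct topic labels")
print(dataset.fileids()[:3])
print(dataset.categories(dataset.fileids()[0]))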
In [5]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
corpus_train = []
corpus_test = []
for fileid in dataset.fileids():
    document = dataset.raw(fileid)
    # the ModApte split encodes train/test membership in the fileid prefix
    if re.match('training/', fileid):
        corpus_train.append(document)
    else:
        corpus_test.append(document)
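A quick sanity check on the split; the counts themselves are not used anywhere below:
In [ ]:
# The two halves should add back up to the full corpus.
print(len(corpus_train), len(corpus_test))
assert len(corpus_train) + len(corpus_test) == len(dataset.fileids())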
In [6]:
def preprocessor(string):
    # strip '<' characters from the raw Reuters text, then lowercase
    repl = re.sub('<', '', string)
    return repl.lower()
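To see the effect of the preprocessor, a made-up sample string (note only '<' is removed; '>' is left alone):
In [ ]:
# Illustration on a made-up string: '<' is stripped, '>' survives,
# and the result is lowercased.
preprocessor('<AMERICAN EXPRESS CO> Quarterly Report')
# -> 'american express co> quarterly report'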
In [7]:
%%time
Y_train = []
Y_test = []
for fileid in dataset.fileids():
    # join a document's categories into one string so that pandas'
    # get_dummies can split them back out below
    categories = '*'.join(dataset.categories(fileid))
    if re.match('training/', fileid):
        Y_train.append(categories)
    else:
        Y_test.append(categories)
series_train = pd.Series(Y_train)
Y_train_df = series_train.str.get_dummies(sep='*')
series_test = pd.Series(Y_test)
# align the test columns to the training columns so the two indicator
# matrices agree even if a category is missing from one side
Y_test_df = series_test.str.get_dummies(sep='*').reindex(
    columns=Y_train_df.columns, fill_value=0)
Y_train = Y_train_df.values
Y_test = Y_test_df.values
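The '*'-joined strings plus `str.get_dummies` implement a multilabel binarization; a toy illustration with hand-picked labels:
In [ ]:
# Toy illustration of the encoding above: each row becomes an indicator
# vector over the union of labels, columns sorted alphabetically.
pd.Series(['earn*acq', 'acq', 'grain*wheat']).str.get_dummies(sep='*')
#    acq  earn  grain  wheat
# 0    1     1      0      0
# 1    1     0      0      0
# 2    0     0      1      1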
In [17]:
class DenseTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that converts a scipy sparse matrix to a dense matrix."""
    def transform(self, X, y=None, **fit_params):
        return X.todense()
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    def fit(self, X, y=None, **fit_params):
        return self
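On scikit-learn versions that ship FunctionTransformer, the same densifying step can be written without a custom class; a sketch (the lambda makes it non-picklable, which is fine for in-notebook use):
In [ ]:
# Equivalent densifying step via FunctionTransformer (assumes a
# scikit-learn version that provides it); accept_sparse=True lets the
# sparse tf-idf matrix through to the lambda.
from sklearn.preprocessing import FunctionTransformer
to_dense = FunctionTransformer(lambda X: X.todense(), accept_sparse=True)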
In [18]:
clf = OneVsRestClassifier(Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('to_dense', DenseTransformer()),
    ('minmax', MinMaxScaler()),
    ('rbm', BernoulliRBM()),
    ('clf', svm.LinearSVC()),
]))
parameters = [
    {
        "estimator__vect__min_df": [5],
        "estimator__vect__preprocessor": [preprocessor],
        "estimator__vect__stop_words": ['english'],
        "estimator__vect__strip_accents": ['ascii'],
        "estimator__minmax__copy": [False],
        "estimator__clf__penalty": ["l1"],
        "estimator__clf__dual": [False],
        "estimator__clf__multi_class": ["crammer_singer"],
        "estimator__clf__tol": [0.001],
    }
]
# An RBM grid to explore later; note these names would need the same
# "estimator__" prefix to route through the OneVsRestClassifier:
# parameters = {
#     'estimator__rbm__n_components': [2, 5, 10, 25, 30, 50],
#     'estimator__rbm__n_iter': [5, 10, 20, 50, 100],
#     'estimator__rbm__batch_size': [10, 50, 100, 500],
#     'estimator__rbm__learning_rate': [0.1, 0.2, 0.3, 0.6]}
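A note on the double-underscore names: set_params routes `estimator__` through the OneVsRestClassifier to the wrapped Pipeline, then through the step name to the step's own parameter, e.g.:
In [ ]:
# Parameter routing: 'estimator' is OneVsRestClassifier's name for the
# wrapped pipeline, 'clf' is the pipeline step, 'tol' the LinearSVC param.
clf.set_params(estimator__clf__tol=0.001)
clf.get_params()['estimator__clf__tol']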
In [ ]:
best_score = float("-inf")
# I had to manually search over the parameter grid because, since we have a mod-apte split
# we cannot do any cross-validations selecting random train/test sets.
# GridSearchCV does not let one do grid search *without* also doing cross validation so we need to do this
for g in ParameterGrid(parameters):
clf.set_params(**g)
clf.fit(corpus_train,Y_train)
Y_pred = clf.predict(corpus_test)
current_score = f1_score(Y_test,Y_pred,average='micro')
print("current_score was {} and the current grid was {}".format(current_score,g))
if current_score > best_score:
best_score = current_score
best_grid = g
In [ ]:
best_score
In [ ]:
best_grid
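As an aside, a fixed split like ModApte can also be expressed for GridSearchCV via PredefinedSplit, which turns the held-out documents into a single predefined "fold"; a hedged sketch, untested against this notebook's data:
In [ ]:
# Alternative to the manual loop: encode the fixed ModApte split as a
# PredefinedSplit (-1 = always train, 0 = the single evaluation fold)
# so GridSearchCV scores each grid point on the fixed test documents.
from sklearn.model_selection import GridSearchCV, PredefinedSplit
test_fold = [-1] * len(corpus_train) + [0] * len(corpus_test)
search = GridSearchCV(clf, parameters, scoring='f1_micro',
                      cv=PredefinedSplit(test_fold), refit=False)
search.fit(corpus_train + corpus_test, np.vstack([Y_train, Y_test]))
search.best_score_, search.best_params_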