In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
In [42]:
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target
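A quick look at what was loaded (a sketch added here, not an original cell):

# sanity check: raw documents and their labels
print(len(X_train), len(X_test))        # number of training / test documents
print(newsgroups_train.target_names)    # ['alt.atheism', 'sci.space']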
In [54]:
class DenseTransformer(BaseEstimator, TransformerMixin):
    # converts a scipy sparse matrix to a dense ndarray,
    # since PCA cannot operate on sparse input
    def transform(self, X, y=None, **fit_params):
        return X.toarray()
    def fit(self, X, y=None, **fit_params):
        return self
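This transformer is needed because TfidfVectorizer emits a scipy sparse matrix, which PCA cannot consume directly. A minimal sanity check (not part of the original notebook):

# toy check: the transformer turns a sparse matrix into a dense ndarray
from scipy.sparse import csr_matrix
toy = csr_matrix([[1.0, 0.0], [0.0, 2.0]])
print(DenseTransformer().fit_transform(toy))  # dense array, safe for PCA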
In [55]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('to_dense', DenseTransformer()),
    ('pca', PCA()),
    ('clf', DecisionTreeClassifier()),
])
# this is where you define the values for
# GridSearchCV to iterate over
param_grid = [
    {
        'tfidf__max_df': [0.8, 0.9, 1.0],
    }
]
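# (illustrative sketch, not part of the original search) the grid can be
# extended to parameters of the other steps using the step__param naming
# convention; the values below are assumptions for demonstration only
param_grid_wider = [
    {
        'tfidf__max_df': [0.8, 0.9, 1.0],
        'pca__n_components': [50, 100],  # hypothetical PCA dimensionalities
        'clf__max_depth': [None, 10],    # hypothetical tree depths
    }
]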
# do 3-fold cross-validation for each of the 3
# candidate values of max_df above
grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid, scoring='f1_micro')
grid.fit(X_train, y_train)
Out[55]:
In [56]:
# summarize results
print("Best: %f using %s" % (grid.best_score_,
grid.best_params_))
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))
In [5]:
# now train and predict test instances
# using the best config found by the grid search
pipeline.set_params(**grid.best_params_)
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)
In [6]:
# calculate f1
f1_score(y_test, y_preds, average='micro')
Out[6]:
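For a per-class breakdown beyond the single micro-averaged score, classification_report prints precision, recall, and F1 per category (a sketch, not part of the original run):

# per-class precision/recall/F1 for the two newsgroups
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds, target_names=newsgroups_test.target_names))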