In [18]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
# Only these two newsgroups are loaded, which makes this a two-class problem.
cats = ['alt.atheism', 'sci.space']
# Fetch the 'train' split first, then the 'test' split, with the same categories.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=cats)
    for split in ('train', 'test')
)
In [7]:
# Unpack each split into its inputs (`data`) and its labels (`target`).
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target
In [21]:
# Fixed seed: LinearSVC shuffles the data internally when it solves the
# dual problem, so without a seed the scores can vary from run to run.
RANDOM_STATE = 42

# turns each document into a vector of raw term counts
vect = CountVectorizer()
# re-weights those counts by inverse document frequency, so that terms
# occurring in many documents contribute less
tfidf = TfidfTransformer()
# this is a linear SVM classifier
clf = LinearSVC(random_state=RANDOM_STATE)
pipeline = Pipeline([
    ('vect', vect),
    ('tfidf', tfidf),
    ('clf', clf),
])
# 3-fold cross-validation on the training split only. The whole pipeline
# is passed in, so the vectorizer and tf-idf steps are fitted per fold
# together with the classifier.
# NOTE: each document carries exactly one label, so micro-averaged F1 is
# numerically identical to plain accuracy here.
scores = cross_val_score(pipeline, X_train, y_train, cv=3,
                         scoring='f1_micro')
In [22]:
# show the array of scores returned by cross_val_score (one per CV fold)
scores
Out[22]:
In [23]:
# average of the cross-validation scores over all folds
scores.mean()
Out[23]:
In [26]:
# Fit the pipeline on the full training split, then predict labels for
# the test documents (Pipeline.fit returns the pipeline itself, so the
# two steps can be chained).
y_preds = pipeline.fit(X_train, y_train).predict(X_test)
In [27]:
# micro-averaged F1 of the predictions against the true test labels
f1_score(y_true=y_test, y_pred=y_preds, average='micro')
Out[27]: