notebook.community

Edit and run



In [18]:

    
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to a two-category problem.
cats = ['alt.atheism', 'sci.space']

# Fetch the 'train' and 'test' subsets (in that order) for those categories.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=cats)
    for split in ('train', 'test')
)



In [7]:

    
# Unpack each subset into its documents (X) and its targets (y).
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target



In [21]:

    
# Seed for the classifier so that repeated runs of this notebook produce
# the same cross-validation and test scores.
RANDOM_STATE = 42

# Turns each document into a vector of raw term counts.
vect = CountVectorizer()

# Re-weights the term counts by inverse document frequency, so that terms
# appearing in many documents contribute less than rarer ones.
tfidf = TfidfTransformer()

# Linear SVM classifier. The seed is pinned because the solver may shuffle
# the data internally, which would otherwise make the scores vary per run.
clf = LinearSVC(random_state=RANDOM_STATE)

# Chain the steps so that the vectorizer and the TF-IDF weights are fitted
# together with the classifier on whatever data the pipeline is fitted on.
pipeline = Pipeline([
    ('vect', vect),
    ('tfidf', tfidf),
    ('clf', clf),
])

# 3-fold cross-validation on the training set, scored with micro-averaged F1.
scores = cross_val_score(pipeline, X_train, y_train, cv=3,
                         scoring='f1_micro')



In [22]:

    
# one micro-averaged F1 score per cross-validation fold
scores









    Out[22]:





array([ 0.99162011,  0.98882682,  0.99159664])



In [23]:

    
# average of the per-fold cross-validation scores
scores.mean()









    Out[23]:





0.99068118867658794



In [26]:

    
# Fit the whole pipeline on the full training set, then label the test
# documents with it (fit() hands back the fitted pipeline, so the two
# calls can be chained).
y_preds = pipeline.fit(X_train, y_train).predict(X_test)



In [27]:

    
# Micro-averaged F1 of the test-set predictions against the true targets.
f1_score(y_true=y_test, y_pred=y_preds, average='micro')









    Out[27]:





0.97475455820476853