notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA, RandomizedPCA



In [2]:

    
nltk.download('reuters')
nltk.download('punkt') # needed for tokenization









    



[nltk_data] Downloading package reuters to /home/felipe/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/felipe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!






    Out[2]:





True



In [3]:

    
dataset = nltk.corpus.reuters



In [4]:

    
fileids = dataset.fileids()



In [5]:

    
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
corpus_train = []
corpus_test = []
for fileid in dataset.fileids():
    document = dataset.raw(fileid)
    if re.match('training/',fileid):
        corpus_train.append(document)
    else:
        corpus_test.append(document)



In [6]:

    
def preprocessor(string):
    repl = re.sub('&lt;','',string)
    return repl.lower()



In [7]:

    
vectorizer = CountVectorizer(
                min_df=10, # tweaking this parameter reduces the length of the feature vector
                strip_accents='ascii',
                preprocessor=preprocessor,
                stop_words='english')



In [8]:

    
# need to use both corpuses for fitting because otherwise there may be words that only occur in the
# training set or in the test set
full_corpus = corpus_train + corpus_test
vectorizer.fit(full_corpus)

X_train_counts = vectorizer.transform(corpus_train)
X_test_counts = vectorizer.transform(corpus_test)
X_full_counts = vectorizer.transform(full_corpus)

X_train_counts.shape,X_test_counts.shape, X_full_counts.shape









    Out[8]:





((7769, 6462), (3019, 6462), (10788, 6462))



In [9]:

    
transformer = TfidfTransformer()
# again, we need to fit the transformer to all documents (train and test)
transformer.fit(X_full_counts)

X_train_tfidf = transformer.transform(X_train_counts)
X_test_tfidf = transformer.transform(X_test_counts)
X_full_tfidf = transformer.transform(X_full_counts)



In [10]:

    
Y_train = []
Y_test = []

for (idx,fileid) in enumerate(dataset.fileids()):    
    categories = '*'.join(dataset.categories(fileid))

    if re.match('training/',fileid):
        Y_train.append(categories)
    else:
        Y_test.append(categories)

series_train = pd.Series(Y_train)
Y_train_df = series_train.str.get_dummies(sep='*')

series_test = pd.Series(Y_test)
Y_test_df = series_test.str.get_dummies(sep='*')

Y_train = Y_train_df.values
Y_test = Y_test_df.values

Y_train.shape,Y_test.shape









    Out[10]:





((7769, 90), (3019, 90))



In [15]:

    
%%time

clf = LogisticRegression()

meta_clf = OneVsRestClassifier(clf)

pca = RandomizedPCA(n_components=140)
pca.fit(X_train_tfidf.toarray())

X_train_reduced = pca.transform(X_train_tfidf.toarray())
X_test_reduced = pca.transform(X_test_tfidf.toarray())









    



CPU times: user 13.9 s, sys: 3.24 s, total: 17.2 s
Wall time: 5.44 s



In [16]:

    
meta_clf.fit(X_train_reduced,Y_train)









    Out[16]:





OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)



In [17]:

    
Y_pred = meta_clf.predict(X_test_reduced)



In [18]:

    
f1_score(Y_test,Y_pred,average='micro')









    Out[18]:





0.74623444662737393



In [ ]: