notebook.community

Edit and run



In [1]:

    
# Code to generate sample multiclass one-vs-all data sets.
# The data set is very small and needs to be replaced with RCV (presumably RCV1).
#
# This example uses the 20 newsgroups data set.
#
# see: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html#example-document-classification-20newsgroups-py
# 
import sys
from time import time
from pprint import pprint

import numpy as np
import scipy
import scipy.sparse as sp
import joblib

import io
import os.path

import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation


from sklearn.externals.six import u, b

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above.
warnings.filterwarnings('ignore')

# IPython magic: per the cell output, it populates the interactive namespace
# from numpy and matplotlib.
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
# Parts of each post that are passed as `remove=` when the data is fetched.
remove = ('headers', 'footers', 'quotes')

# Category filter passed as `categories=` when the data is fetched;
# None means no explicit category list is requested.
categories = None



In [10]:

    
print("Loading 20 newsgroups dataset for categories:")

# Both subsets are fetched with identical options; only `subset` differs.
_fetch_kwargs = dict(categories=categories,
                     shuffle=True,
                     random_state=42,
                     remove=remove)

data_train = fetch_20newsgroups(subset='train', **_fetch_kwargs)
data_test = fetch_20newsgroups(subset='test', **_fetch_kwargs)

print('data loaded')









    



Loading 20 newsgroups dataset for categories:
data loaded



In [11]:

    
# `categories` may still be None from the configuration cell, so take the
# concrete label names from the loaded training set instead.
categories = data_train.target_names


def size_mb(docs):
    """Return the combined UTF-8 encoded size of `docs` in megabytes.

    `docs` is an iterable of text strings; one megabyte is counted as
    1e6 bytes.
    """
    total_bytes = 0
    for doc in docs:
        total_bytes += len(doc.encode('utf-8'))
    return total_bytes / 1e6

# Measure the raw text of each split (in MB) with the helper defined above.
data_train_size_mb, data_test_size_mb = map(
    size_mb, (data_train.data, data_test.data))

# Report document count and size per split, then the number of categories.
print("%d documents - %0.3fMB (training set)"
      % (len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)"
      % (len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
print()









    



11314 documents - 13.782MB (training set)
7532 documents - 8.262MB (test set)
20 categories
()



In [13]:

    
# Integer class labels for the training and test splits.
# Plan: the training split will eventually be divided into L and U
# (presumably labeled / unlabeled -- confirm), with the test split serving
# as HO (presumably the hold-out set).
y_train = data_train.target
y_test = data_test.target



In [31]:

    
print("Extracting features from the training dataset using a sparse vectorizer")
# Fit the vectorizer on the training documents only.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(data_train.data)
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test dataset using the same vectorizer")
# NOTE(review): t0 is captured here but never read in the visible cells.
t0 = time()
# Transform only: the test matrix reuses the already-fitted vectorizer, so it
# has the same feature columns as X_train.
X_test = vectorizer.transform(data_test.data)
print("n_samples: %d, n_features: %d" % X_test.shape)
print()









    



Extracting features from the training dataset using a sparse vectorizer
n_samples: 11314, n_features: 101323
()
Extracting features from the test dataset using the same vectorizer
n_samples: 7532, n_features: 101323
()



In [35]:

    
# Pull the vectorizer's feature names and keep them as a NumPy array.
vocabulary_terms = vectorizer.get_feature_names()
feature_names = np.asarray(vocabulary_terms)
feature_names









    Out[35]:





array([u'00', u'000', u'0000', ..., u'zzzzzzt', u'\xb3ation', u'\xfd\xe9'], 
      dtype='<U81')



In [39]:

    
# Display the training labels (the `target` array of the training split).
y_train









    Out[39]:





array([7, 4, 4, ..., 3, 1, 8])



In [46]:

    
# Number of training documents per category.
# Note: sample sizes are actually quite small for each category --
# probably not enough for a true multiclass (MC) test.
#
# One bincount pass replaces a separate scan of y_train per class, and the
# class count comes from the data set instead of a hard-coded 20.
# `minlength` keeps a row for any category with no training documents.
class_counts = np.bincount(y_train, minlength=len(data_train.target_names))
for i, count in enumerate(class_counts):
    # A %-formatted print() call gives the same "index count" line on
    # Python 2 and Python 3 (the old `print i, n` statement is a SyntaxError
    # on Python 3).
    print("%d %d" % (i, count))



In [ ]:

    
# train MC classifier as baseline

# build 1-vs-all data sets for all 20 categories
#  start with category 1 vs. categories 2-20
#
# train a 1-vs-all baseline and report accuracies
# repeat with L+U and report accuracies for different sizes of L
#   (report both U and HO accuracies)
# 
# this is the baseline for further work
#

# generate svmlin inputs
# run and test