In [1]:
# Code to generate sample multiclass 1-vs-all data sets.
# NOTE: this data set is very small and needs to be replaced with rcv
# (presumably the Reuters RCV1 corpus -- confirm).
#
# This example uses the 20 newsgroups data set.
#
# see: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html#example-document-classification-20newsgroups-py
# 
import sys
from time import time
from pprint import pprint

import numpy as np
import scipy
import scipy.sparse as sp
import joblib

import io
import os.path

import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation


from sklearn.externals.six import u, b

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2

import warnings
# Install a blanket 'ignore' filter: from here on no Python warning is shown
# (this also hides deprecation warnings raised by the libraries imported above).
warnings.filterwarnings('ignore')

# IPython magic: pulls numpy and matplotlib names into the interactive
# namespace (see the "Populating the interactive namespace" output) and
# renders figures inline.
# NOTE(review): the star-style import may shadow names imported explicitly
# above -- confirm nothing in this notebook relies on a shadowed name.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Newsgroups to load. None is passed straight through to fetch_20newsgroups;
# with this setting the loader returned all 20 groups (see "20 categories" in
# the size report further down).
categories = None
# Passed as the `remove` argument of fetch_20newsgroups: the parts of each
# post to strip before use (per the scikit-learn docs this removes newsgroup
# headers, signature blocks and quoted replies).
remove = ('headers', 'footers', 'quotes')

In [10]:
# Fetch the predefined 'train' and 'test' subsets of the 20 newsgroups corpus.
# Both calls share exactly the same options, so they are collected once here;
# the fixed random_state keeps the shuffled order reproducible between runs.
print("Loading 20 newsgroups dataset for categories:")

_fetch_options = dict(categories=categories, shuffle=True,
                      random_state=42, remove=remove)
data_train = fetch_20newsgroups(subset='train', **_fetch_options)
data_test = fetch_20newsgroups(subset='test', **_fetch_options)

print('data loaded')


Loading 20 newsgroups dataset for categories:
data loaded

In [11]:
# `categories` may still be None at this point (the initial setting), so take
# the group names from the loaded data set; len(categories) is needed below.
categories = data_train.target_names    # for case categories == None


def size_mb(docs):
    """Return the combined size of ``docs`` in megabytes.

    Each document is encoded as UTF-8 and its byte length is added to a
    running total; the total is then divided by 1e6 (so 1 MB = 10**6 bytes).

    Parameters
    ----------
    docs : iterable of text strings
        Every item must provide ``.encode('utf-8')``.

    Returns
    -------
    float
        Total encoded size in MB; ``0.0`` for an empty iterable.
    """
    total_bytes = 0
    for doc in docs:
        total_bytes += len(doc.encode('utf-8'))
    return total_bytes / 1e6

# Report how many documents each subset holds and how large the raw text is
# (UTF-8 bytes, in MB), plus the number of newsgroup categories.
data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)

print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
# print("") instead of print(): without `from __future__ import print_function`
# a bare print() under Python 2 prints the empty tuple "()" rather than a
# blank line. This form emits a blank line on both Python 2 and Python 3.
print("")


11314 documents - 13.782MB (training set)
7532 documents - 8.262MB (test set)
20 categories
()

In [13]:
# Label vectors for the training and test subsets. The train/test split itself
# comes from the loader (subset='train' / subset='test'); nothing is re-split here.
# Plan: the training subset will eventually be split into L and U (presumably
# labeled / unlabeled -- confirm), and the test subset will be used as the HO
# (presumably hold-out) set.
y_train, y_test = data_train.target, data_test.target

In [31]:
# Turn the raw posts into sparse TF-IDF feature matrices.
#
# The vectorizer is fitted on the training posts only (fit_transform) and then
# re-used unchanged on the test posts (transform), so both matrices share the
# same vocabulary / number of columns.
#
# Vectorizer options (see the scikit-learn TfidfVectorizer docs):
#   sublinear_tf=True    -- sublinear term-frequency scaling
#   max_df=0.5           -- document-frequency cut-off for very common terms
#   stop_words='english' -- use the built-in English stop-word list
print("Extracting features from the training dataset using a sparse vectorizer")
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
print("n_samples: %d, n_features: %d" % X_train.shape)
# print("") rather than print(): a bare print() under Python 2 (no
# print_function import) prints "()" instead of a blank line.
print("")

print("Extracting features from the test dataset using the same vectorizer")
# (The former `t0 = time()` here was never read -- no duration was computed or
# printed -- so the dead timer has been dropped.)
X_test = vectorizer.transform(data_test.data)
print("n_samples: %d, n_features: %d" % X_test.shape)
print("")


Extracting features from the training dataset using a sparse vectorizer
n_samples: 11314, n_features: 101323
()
Extracting features from the test dataset using the same vectorizer
n_samples: 7532, n_features: 101323
()

In [35]:
# Vocabulary learned by the fitted vectorizer, converted to a NumPy array of
# (unicode) terms so it can be indexed with integer arrays later on.
# presumably entry j names column j of X_train / X_test -- confirm against the
# installed scikit-learn version.
feature_names = np.asarray(vectorizer.get_feature_names())
# Bare expression: lets the notebook display the (abbreviated) array.
feature_names


Out[35]:
array([u'00', u'000', u'0000', ..., u'zzzzzzt', u'\xb3ation', u'\xfd\xe9'], 
      dtype='<U81')

In [39]:
# Training labels: one integer class id per training document.
# Bare expression so the notebook displays the (abbreviated) array.
y_train


Out[39]:
array([7, 4, 4, ..., 3, 1, 8])

In [46]:
# note: sample sizes are actually quite small for each category,
# probably not enough for a true MC test
#
# Print "<class id> <number of training documents>" for every category.
# The number of classes is taken from the data set instead of a hard-coded 20,
# and all counts are computed in a single pass with np.bincount
# (minlength keeps a row for any class that has no training documents).
class_counts = np.bincount(y_train, minlength=len(data_train.target_names))
for label, count in enumerate(class_counts):
    # %-formatting inside print(...) yields the same "i n" line on Python 2
    # and Python 3 (the old `print i, n` statement is a SyntaxError on 3).
    print("%d %d" % (label, count))


0 480
1 584
2 591
3 590
4 578
5 593
6 585
7 594
8 598
9 597
10 600
11 595
12 591
13 594
14 593
15 599
16 546
17 564
18 465
19 377

In [ ]:
# train MC classifier as baseline

# build  1-vs-all data sets for all 20 categories
#  start with 1 vs [2-20]
#  
# train 1-vs-all baseline and report accuracies
# repeat with L+U and report accuracies for different size L's
#   U and HO accuracies
# 
# this is the baseline for further work
#

# generate svmlin inputs
# run and test