In [1]:
# code to generate sample multiclass 1-vs-all data sets
# this data set is fairly small and will eventually be replaced with RCV1
#
# this example uses the 20 newsgroups set
#
# see: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html#example-document-classification-20newsgroups-py
#
import sys
from time import time
from pprint import pprint
import numpy as np
import scipy
import scipy.sparse as sp
import joblib
import io
import os.path
import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
In [2]:
categories = None                          # None loads all 20 categories
remove = ('headers', 'footers', 'quotes')  # strip metadata that makes the task too easy
In [10]:
print("Loading 20 newsgroups dataset for categories:")
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
print('data loaded')
In [11]:
categories = data_train.target_names  # for the case categories == None

def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)
print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
print()
In [13]:
# the train/test split comes with the dataset itself
# we will eventually split the training set into L (labeled) and U (unlabeled),
# and use the test set as the holdout (HO)
y_train, y_test = data_train.target, data_test.target
In [31]:
print("Extracting features from the training dataset using a sparse vectorizer")
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
print("n_samples: %d, n_features: %d" % X_train.shape)
print()
print("Extracting features from the test dataset using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()
In [35]:
feature_names = vectorizer.get_feature_names_out()
feature_names
Out[35]:
In [39]:
# training labels
y_train
Out[39]:
In [46]:
# note: sample sizes are actually quite small for each category;
# probably not enough for a true multiclass (MC) test
for i in range(20):
    print(i, np.where(y_train == i)[0].size)
In [ ]:
# train a multiclass (MC) classifier as the baseline
# build 1-vs-all data sets for all 20 categories,
# starting with category 1 vs categories [2-20]
#
# train the 1-vs-all baseline and report accuracies
# repeat with L+U and report accuracies for different sizes of L,
# including both U and HO accuracies
#
# this is the baseline for further work
#
# generate svmlin inputs
# run and test