In [2]:
# Code to generate sample multiclass one-vs-all data sets.
# The data set is very small and needs to be replaced with RCV.
#
# This example was written to use the 20 newsgroups set
# (NOTE: the cells below load RCV1 svmlight files instead).
#
# see: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html#example-document-classification-20newsgroups-py
#
import sys
from time import time
from pprint import pprint
import numpy as np
import scipy
import scipy.sparse as sp
import joblib
import io
import os.path
import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation
from sklearn.externals.six import u, b
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2
import warnings
warnings.filterwarnings('ignore')
%pylab inline
In [3]:
# Load the RCV training split with the svmlight reader.
# (The test split is read in a later cell.)
#
# usable categories in test
# 1031 20
# 1134 15
# 1929 4
# 3649 2
#
# Plan: select N_l random examples from all M other categories,
# parameterized by l=20,15,4,2 and M=3, 5, 10, all.
#
train_data = sklearn.datasets.load_svmlight_file('rcv1_train.multiclass')
X_train, y_train = train_data
In [4]:
# Display the dimensions of the training features and labels.
(X_train.shape, y_train.shape)
Out[4]:
In [5]:
# Read the RCV test data.
#
# create sample data
# 39197 20
# 36409 15
# 64048 4
# 126777 2
# select balanced (50/50) set from l=20,15,4,2 and M=3, 5, 10, all
# same code as above
#
# n_features is pinned to the training matrix width: load_svmlight_file
# otherwise infers the column count from each file separately, and a model
# fitted on X_train rows can only score X_test rows of the same width.
X_test, y_test = sklearn.datasets.load_svmlight_file(
    'rcv1_test.multiclass', n_features=X_train.shape[1])
In [6]:
# Display the dimensions of the test features and labels.
(X_test.shape, y_test.shape)
Out[6]:
In [7]:
# Build the index sets for a one-vs-rest problem on class `l`:
#   L  - labelled examples, drawn from the training data
#   U  - unlabelled examples, drawn from the test data
#   HO - hold-out examples, drawn from the test data
l = 20      # label treated as the positive ("one") class
Lnum = 10   # number of positive and of negative ids requested for L

# Seeded generator so that Restart & Run All reproduces the same draws.
SEED = 0
rng = np.random.RandomState(SEED)


def _sample_unique(pool, size):
    """Draw up to `size` distinct entries from the 1-D index array `pool`.

    Sampling is without replacement, so the result holds no duplicates for
    np.union1d to collapse later. The request is capped at the pool size
    rather than raising when the pool is smaller than `size`.
    """
    return rng.choice(pool, size=min(size, pool.shape[0]), replace=False)


Lone = np.where(y_train == l)[0]
Lother = np.where(y_train != l)[0]
Lpos = _sample_unique(Lone, Lnum)
Lneg = _sample_unique(Lother, Lnum)

Tpos = np.where(y_test == l)[0]
Tother = np.where(y_test != l)[0]
# Floor division keeps Unum an integer (it is used as a slice bound below).
Unum = Tpos.shape[0] // 2
#Unum = 5000
Upos = Tpos[0:Unum]
Uneg = _sample_unique(Tother, Unum)
print("%s %s" % (Tpos.shape, Tother.shape))

# The hold-out positives are the last Unum positives; since Unum is at most
# half of Tpos, they cannot overlap the first Unum used for U.
HOstart = Tpos.shape[0] - Unum
HOpos = Tpos[HOstart:]
# Hold-out negatives come from the negatives not already used in U.
HOneg = _sample_unique(np.setdiff1d(Tother, Uneg), Unum)

# Earlier note: "not sure why HO > U". The draws used to be made with
# replacement, so repeated ids were dropped by the later union1d step; that
# may explain the size mismatch -- TODO confirm on a fresh run.
print("%s %s %s" % (Lpos.shape, Upos.shape, HOpos.shape))
print("%s %s %s" % (Lneg.shape, Uneg.shape, HOneg.shape))

# check that U and HO don't overlap (both intersections should be empty)
print(np.intersect1d(Upos, HOpos))
print(np.intersect1d(Uneg, HOneg))
In [8]:
Lids = np.union1d(Lpos,Lneg)
Uids = np.union1d(Upos,Uneg)
HOids = np.union1d(HOpos,HOneg)
print Lids.shape, Uids.shape, HOids.shape
In [9]:
# why does this fail?  (open question from the author; cause not recorded here)
#
# Pull out the rows selected for each split: features and labels together.
L, l_labels = X_train[Lids], y_train[Lids]
U, u_labels = X_test[Uids], y_test[Uids]
HO, ho_labels = X_test[HOids], y_test[HOids]
In [10]:
def binary_labels(labels, one=20):
    """Map multiclass labels to one-vs-rest targets.

    Parameters
    ----------
    labels : array-like
        Numeric class labels. Lists and tuples are accepted and converted
        with ``np.asarray``.
    one : scalar, default 20
        The label treated as the positive class.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``labels`` holding +1 where
        ``labels == one`` and -1 everywhere else. Float and signed-integer
        input dtypes are preserved; any other input dtype yields numpy's
        default integer dtype, so -1 never wraps around.
    """
    # Without asarray, a plain list compared to a scalar is a single False
    # and every label would silently come out as -1.
    labels = np.asarray(labels)
    signs = np.where(labels == one, 1, -1)
    if labels.dtype.kind in 'if':
        signs = signs.astype(labels.dtype)
    return signs
In [11]:
# Convert each split's multiclass labels to one-vs-rest targets for class `l`.
# A generator expression is used so no loop variable leaks into the namespace.
y_l, y_u, y_ho = (
    binary_labels(split_labels, one=l)
    for split_labels in (l_labels, u_labels, ho_labels)
)
In [12]:
#fix labels...they are wrong...need -1, +1, 0
svm_small = sklearn.svm.LinearSVC(C=10,fit_intercept=False)
svm_small.fit(L, y_l)
y_p = svm_small.predict(L)
score = svm_small.score(L,y_l)
print "baseline accuracy for ",Unum," l=",L.shape[0]
print " L accuracy ",(score)*100.0
y_p = svm_small.predict(U)
score = svm_small.score(U,y_u)
print " U accuracy ",(score)*100.0
y_p = svm_small.predict(HO)
score = svm_small.score(HO,y_ho)
print " HO accuracy ",(score)*100.0
# accuracy is already very very high!!!
# can we reduce ? is this real?
In [13]:
# of course, I need the same set to get the soft labels and compute the cross entropy
# can i look at AUC ROC on L ?
# I want cross entropy of soft labels as weights
# weighted distance will also do
# and other measures
# essentially I need to guess R, and I do this using the weights we find
In [ ]:
#TODO: redo this...I accidentally deleted the older notebook
# start checking things in every day
# redo this carefully, read in cluster metric
# this time...run the eval and read back in the results
# stabilize this!
# universvm and cccp tests also now
# run qn-s3vm
# also try to fix svmlin, accuracy and old method
# clearly setting R is the issue
# set up matlab...do any of these methods work?
# read papers...what is the classification func?
# github now
# make svmlin lib to read