notebook.community

Edit and run



In [2]:

    
# code to generate sample multiclass 1-vs-all data sets
# data set is very small and needs to be replaced with rcv
#
#  this example will use the 20 newsgroup set
#
# see: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html#example-document-classification-20newsgroups-py
# 
import sys
from time import time
from pprint import pprint

import numpy as np
import scipy
import scipy.sparse as sp
import joblib

import io
import os.path

import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation


from sklearn.externals.six import u, b

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2

import warnings
warnings.filterwarnings('ignore')

%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [3]:

    
# Read RCV train and test Datasets using SvmLight Reader
#
# usable categories in test
# 1031 20
# 1134 15
# 1929 4
# 3649 2
#
#  select N_l random examples from all M other categories 
#    parameterize by l=20,15,4,2 and M=3, 5, 10, all
#
X_train, y_train = sklearn.datasets.load_svmlight_file('rcv1_train.multiclass')



In [4]:

    
X_train.shape, y_train.shape









    Out[4]:





((15564, 47236), (15564,))



In [5]:

    
# Read RCV tests data
#  
# create sample data
# 39197 20
# 36409 15
# 64048 4
# 126777 2

# select balanced (50/50) set from l=l=20,15,4,2 and M=3, 5, 10, all
#  same code as above
X_test, y_test = sklearn.datasets.load_svmlight_file('rcv1_test.multiclass')



In [6]:

    
X_test.shape, y_test.shape









    Out[6]:





((518571, 47236), (518571,))



In [7]:

    
l = 20
Lnum = 10

Lone = np.where(y_train==l)[0]
Lother = np.where(y_train!=l)[0]

Lpos = numpy.random.choice(Lone, size=Lnum)
Lneg = numpy.random.choice(Lother, size=Lnum)

Tpos = np.where(y_test==l)[0]
Tother = np.where(y_test!=l)[0]

Unum = Tpos.shape[0]/2
#Unum = 5000
Upos = Tpos[0:Unum]
Uneg = numpy.random.choice(Tother, size=Unum)

print Tpos.shape, Tother.shape

HOstart = Tpos.shape[0]-Unum
HOpos = Tpos[HOstart:]
HOneg = numpy.random.choice(numpy.setxor1d(Tother,Uneg), size=Unum)

# not sure why HO > U, but ok for now
print Lpos.shape, Upos.shape, HOpos.shape
print Lneg.shape, Uneg.shape, HOneg.shape

# check that T and HO dont overlap
print np.intersect1d(Upos,HOpos)
print np.intersect1d(Uneg,HOneg)









    



(39197,) (479374,)
(10,) (19598,) (19598,)
(10,) (19598,) (19598,)
[]
[]



In [8]:

    
Lids = np.union1d(Lpos,Lneg)
Uids = np.union1d(Upos,Uneg)
HOids =  np.union1d(HOpos,HOneg)
print Lids.shape, Uids.shape, HOids.shape









    



(20,) (38786,) (38777,)



In [9]:

    
# why does this fail?

L = X_train[Lids]
l_labels = y_train[Lids]

U = X_test[Uids]
u_labels = y_test[Uids]

HO = X_test[HOids]
ho_labels = y_test[HOids]



In [10]:

    
#TODO:  simplify
def binary_labels(labels,one=20):
    b_l = np.zeros_like(labels)
    one_ids = np.where(labels==one) 
    for i in one_ids:
        b_l[i] = 2

    return  (b_l -1)



In [11]:

    
y_l = binary_labels(l_labels,one=l)
y_u = binary_labels(u_labels,one=l)
y_ho = binary_labels(ho_labels,one=l)



In [12]:

    
#fix labels...they are wrong...need -1, +1, 0

svm_small = sklearn.svm.LinearSVC(C=10,fit_intercept=False)
svm_small.fit(L, y_l)

y_p = svm_small.predict(L)
score = svm_small.score(L,y_l)

print "baseline accuracy for ",Unum," l=",L.shape[0] 


print " L  accuracy ",(score)*100.0

y_p = svm_small.predict(U)
score = svm_small.score(U,y_u)

print " U  accuracy ",(score)*100.0

y_p = svm_small.predict(HO)
score = svm_small.score(HO,y_ho)

print " HO accuracy ",(score)*100.0
# accuracy is already very very high!!!
# can we reduce  ?  is this real?









    



baseline accuracy for  19598  l= 20
 L  accuracy  100.0
 U  accuracy  77.6929820038
 HO accuracy  77.476339067



In [13]:

    
# of course, I need the same set to get the soft labels and compute the cross entropy
# can i look at AUC ROC on L ?
# I want cross entropy of soft labels as weights 
#  weighted distance will also do
#  and other measures

# essentially I need to guess R, and I do this using the weights we find



In [ ]:

    
#TODO:  redo this...i accidently deleted the older notebook
# start checking things in every day
# redo this carefully , read in cluster metric
# this time...run the eval and read back in the results

# stabilize this!
# universvm and cccp tests also now
# run qn-s3vm

# also try to fix svmlin,accuracy and old method
# clearly setting R is the issue

# set up matlab...do any of these methods work
# read papers...what is the classificaiton func

# github now

# make svmlin lib to read