In [15]:
# code to regenerate input files for results for paper
#
# http://www.fabiangieseke.de/pdfs/neucom2013_draft.pdf
# see Table 2 results real-sim data set
#
# see also:
# http://www.fabiangieseke.de/pdfs/icpram2012.pdf
#
import sys
from time import time
from pprint import pprint
import numpy as np
import scipy
import scipy.sparse as sp
import joblib
import io
import os.path
import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation
from sklearn.externals.six import u, b
import warnings
warnings.filterwarnings('ignore')
%pylab inline
In [16]:
# Read the svmlight-format file 'real-sim' from the working directory into
# the feature matrix X and the label vector y.
X, y = sklearn.datasets.load_svmlight_file('real-sim')
In [17]:
# Display the dimensions of the loaded feature matrix.
X.shape
Out[17]:
In [18]:
# Tally how many instances carry each distinct label value.
from collections import Counter
Counter(y)
Out[18]:
In [19]:
# Stratified 50/50 split of the whole data set into a training half and a
# hold-out half.  The split is seeded so that re-running the notebook
# regenerates the same partition (and therefore the same input files).
splits = sklearn.cross_validation.StratifiedShuffleSplit(
    y, n_iter=1, test_size=0.50, random_state=0)
# next(iter(...)) fetches the single split and works on Python 2 and 3 alike.
train_indices, test_indices = next(iter(splits))
In [20]:
# Row ids of the full data set, so each subset can be traced back to it.
instance_ids = np.arange(y.size)

# Carve features, labels and ids into the training half and the hold-out half.
X_train, X_test = X[train_indices], X[test_indices]
train_labels, test_labels = y[train_indices], y[test_indices]
train_ids, test_ids = instance_ids[train_indices], instance_ids[test_indices]
In [21]:
# Reference model: L2-penalised linear SVM (primal solver) fitted on the
# whole training half with all of its labels.
svm = sklearn.svm.LinearSVC(
    C=1,
    penalty='l2',
    dual=False,
)
svm.fit(X_train, train_labels)
Out[21]:
Ground Truth
Best possible hold-out (HO) accuracy: the linear SVM trained with every training label, scored on the held-out half
In [22]:
# Accuracy (in percent) of the fully supervised SVM on the hold-out half.
accuracy = sklearn.metrics.accuracy_score(
    y_true=test_labels,
    y_pred=svm.predict(X_test),
)
print(accuracy * 100.0)
# Disabled 10-fold cross-validation variants, kept for reference:
#ncv = 10
#print sklearn.cross_validation.cross_val_score(svm, X_train, train_labels, cv=10).sum()/ncv
#print sklearn.cross_validation.cross_val_score(svm, X_test, test_labels, cv=10).sum()/ncv
In [23]:
# Sizes of the training half and the hold-out half.
# A parenthesised single argument prints identically under Python 2 and 3.
print(X_train.shape)
print(X_test.shape)
In [24]:
def print_svmlight_infiles(L, y_l, U, y_u, HO, y_ho):
    """Dump the four svmlight-format input files for one labeled-set size.

    The training file stacks the labeled rows ``L`` on top of the unlabeled
    rows ``U``; the unlabeled rows get label ``0`` there.  The three test
    files hold ``L``, ``U`` and ``HO`` with their true labels.  Every file
    name ends in the number of labeled rows, and feature indices are written
    1-based.
    """
    stacked = scipy.sparse.vstack((L, U))
    stacked_labels = np.hstack((y_l, y_u * 0))
    # Single formatted argument: same text as the two-item print statement.
    print("%s %s" % (stacked.shape, stacked_labels.shape))

    n_labeled = L.shape[0]
    outputs = [
        ('svmlight.train.%d', stacked, stacked_labels),
        ('svmlight.testL.%d', L, y_l),
        ('svmlight.testU.%d', U, y_u),
        ('svmlight.testHO.%d', HO, y_ho),
    ]
    for name_template, examples, labels in outputs:
        sklearn.datasets.dump_svmlight_file(
            examples, labels, name_template % n_labeled, zero_based=False)
In [29]:
def dump_svmlin(X, y, fX, fy):
    """Write examples and labels as two parallel, line-aligned files.

    For every row of ``X`` one line is written to ``fX`` holding the row's
    non-zero entries as space-separated ``index:value`` pairs (column index
    is 1-based, value formatted with ``%.16g``).  For every element of ``y``
    one line with the label formatted as ``%d`` is written to ``fy``.

    Parameters
    ----------
    X : 2-D numpy array or scipy sparse matrix
        Sparse input of any format is accepted; it is converted to CSR.
    y : iterable of numbers
        Labels, written in iteration order.
    fX, fy : file objects opened for binary writing
        Both receive ASCII-encoded bytes.
    """
    value_pattern = "%d:%.16g"
    is_sparse = hasattr(X, "tocsr")
    if is_sparse:
        # The row slicing below reads indptr/indices/data, which describe
        # rows only in CSR layout (COO has no indptr at all), so normalise
        # the format first.
        X = X.tocsr()

    # Encode labels too, so both files get bytes like the examples file does.
    fy.write("".join("%d\n" % label for label in y).encode('ascii'))

    for i in range(X.shape[0]):
        if is_sparse:
            span = slice(X.indptr[i], X.indptr[i + 1])
            row = zip(X.indices[span], X.data[span])
        else:
            nz = X[i] != 0
            row = zip(np.where(nz)[0], X[i, nz])
        # j + 1: the output uses 1-based feature indices.
        line = " ".join(value_pattern % (j + 1, x) for j, x in row)
        fX.write(("%s\n" % line).encode('ascii'))
In [26]:
def print_svmlin_infiles(L, y_l, U, y_u, HO, y_ho):
    """Write the svmlin example/label file pairs for one labeled-set size.

    Four pairs are produced, each named
    ``svmlin.<set>.examples.<n>`` / ``svmlin.<set>.labels.<n>`` where ``<n>``
    is the number of labeled rows:

    * ``train``  -- ``L`` stacked on ``U``; the unlabeled rows get label 0
    * ``testL``  -- ``L`` with ``y_l``
    * ``testU``  -- ``U`` with ``y_u``
    * ``testHO`` -- ``HO`` with ``y_ho``

    Every file is closed (and therefore flushed) before the function returns.
    """
    print("print_svmlin_infiles...")
    # Request CSR explicitly: dump_svmlin slices rows through X.indptr.
    A = scipy.sparse.vstack((L, U), format='csr')
    unk_l = y_u * 0
    a_l = np.hstack((y_l, unk_l))
    # Single formatted argument: same text under Python 2 and 3.
    print("%s %s" % (A.shape, a_l.shape))

    numL = L.shape[0]
    datasets = (
        ('train', A, a_l),
        ('testL', L, y_l),
        ('testU', U, y_u),
        ('testHO', HO, y_ho),
    )
    for tag, examples, labels in datasets:
        examples_path = 'svmlin.%s.examples.%d' % (tag, numL)
        labels_path = 'svmlin.%s.labels.%d' % (tag, numL)
        # Context managers guarantee the handles are closed even if the
        # dump raises part-way through.
        with open(examples_path, "wb") as examples_file:
            with open(labels_path, "wb") as labels_file:
                dump_svmlin(examples, labels, examples_file, labels_file)
In [27]:
def print_universvm_infiles(L, y_l, U, y_u, HO, y_ho):
    """Dump the four universvm input files for one labeled-set size.

    Files are written in svmlight format with 1-based feature indices.  The
    training file stacks ``L`` on ``U`` and marks every unlabeled row with
    label ``-3``; the three test files carry ``L``, ``U`` and ``HO`` with
    their true labels.  File names end in the number of labeled rows.
    """
    stacked = scipy.sparse.vstack((L, U))
    unlabeled_marker = y_u * 0 - 3
    stacked_labels = np.hstack((y_l, unlabeled_marker))
    # Single formatted argument: same text as the two-item print statement.
    print("%s %s" % (stacked.shape, stacked_labels.shape))

    n_labeled = L.shape[0]
    outputs = [
        ('universvm.train.%d', stacked, stacked_labels),
        ('universvm.testL.%d', L, y_l),
        ('universvm.testU.%d', U, y_u),
        ('universvm.testHO.%d', HO, y_ho),
    ]
    for name_template, examples, labels in outputs:
        sklearn.datasets.dump_svmlight_file(
            examples, labels, name_template % n_labeled, zero_based=False)
In [30]:
# For each labeled fraction: split the training half into a labeled part L
# and an unlabeled part U, write the solver input files, and report the
# hold-out accuracy of a linear SVM trained on L alone as a baseline.
#split_sizes = [0.0025, 0.005, 0.01, 0.04, 0.08]
split_sizes = [0.08]
for U_size in split_sizes:
    # Despite its name, U_size is the fraction that stays *labeled*: the
    # split's "test" side (1 - U_size) becomes the unlabeled set.
    # Seeded so that re-running regenerates the same files.
    splits = sklearn.cross_validation.StratifiedShuffleSplit(
        train_labels, n_iter=1, test_size=1-U_size, random_state=0)
    labeled_indices, unlabeled_indices = next(iter(splits))

    L = X_train[labeled_indices]
    U = X_train[unlabeled_indices]
    # The split indices are positions within X_train, so original instance
    # ids must be looked up through train_ids, not instance_ids.
    L_ids = train_ids[labeled_indices]
    U_ids = train_ids[unlabeled_indices]
    y_l = train_labels[labeled_indices]
    y_u = train_labels[unlabeled_indices]
    HO = X_test
    y_ho = test_labels

    print("making...")
    #print_universvm_infiles(L, y_l, U, y_u, X_test, test_labels)
    print_svmlin_infiles(L, y_l, U, y_u, HO, y_ho)
    print_svmlight_infiles(L, y_l, U, y_u, HO, y_ho)

    # Supervised baseline trained on the labeled subset only.
    svm_small = sklearn.svm.LinearSVC(C=10,fit_intercept=False)
    svm_small.fit(L, y_l)
    y_p = svm_small.predict(HO)
    score = svm_small.score(HO,y_ho)
    # print np.mean((y_ho+1)/2),np.mean((y_p+1)/2)
    # Share of hold-out rows where prediction and label have the same sign;
    # kept as a notebook variable although its print below is disabled.
    yy = y_p*y_ho
    acc = float(np.where(yy>0)[0].shape[0])/y_ho.shape[0]
    # print acc
    # Formatted strings reproduce the spacing of the original multi-item
    # print statements and print identically under Python 2 and 3.
    print("%s %s %s %s" % (L.shape, y_l.shape, X_test.shape, test_labels.shape))
    print("baseline HO accuracy for  %s  l= %s   %s" % (U_size, L.shape[0], (score)*100.0))
    print("\n")
In [ ]: