In [3]:
import numpy as np
import sklearn
from sklearn.svm import SVC
from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg
from sklearn.datasets import load_svmlight_file
from sklearn.utils.extmath import safe_sparse_dot
In [5]:
# classifier with/ instance weights
# Linear LibSVM
def classify(X, y, weights, alpha=1.0):
    """Fit a linear LibSVM classifier with per-instance weights.

    X       -- feature matrix (dense or sparse)
    y       -- target labels
    weights -- per-sample weight vector, passed to fit() as sample_weight
    alpha   -- SVM regularization constant C

    Returns the fitted SVC model.
    """
    model = SVC(C=alpha, kernel='linear')
    model.fit(X, y, sample_weight=weights)
    return model
In [3]:
# TODO: check that the algo actually switches the labels properly
# need some kind of python unit tests
#
# multi-switch algo
# switch [R (+)]/[1-R (-)] labels
# switch S of them?
#
# does this make sense?
def switch_labels(Apos_ids, Apos_scores, Aneg_ids, Aneg_scores, R, S):
    """Pick active-set ids whose guessed labels should be switched.

    Intended behavior (per the TODO above): choose up to S ids to flip,
    split R (positive side) / 1-R (negative side) between the sorted
    positive and negative active sets.

    TODO: not implemented yet -- currently switches nothing.

    Returns (switched_pos_ids, switched_neg_ids) as integer id arrays.
    """
    # Fixed: np.empty([]) produced an *uninitialized 0-d scalar*, not an
    # empty id list.  "No ids switched" is an empty 1-d integer array.
    switched_pos_ids = np.empty(0, dtype=int)
    switched_neg_ids = np.empty(0, dtype=int)
    return switched_pos_ids, switched_neg_ids
In [ ]:
#
# function which decreases if we guess a better labeling
#
def objective_function(guess_labels):
    """Objective that should decrease as the guessed labeling improves.

    Stub: always returns 0 for now.  Open question from the original
    author: can this be evaluated without retraining the classifier?
    (They believed so, but it was never checked.)
    """
    return 0
In [ ]:
def converged():
    """Convergence test for the self-training loop.

    Stub: always returns 0 (i.e. "not converged").
    """
    return 0
In [2]:
# run the incremental self training algo
#
def transduce(X, known_labels, L_ids, U_ids, R=0.5, U_reg=1.0, W_reg=0.001, alpha = 1.0, num_steps=1000, num_switches = 1000):
L = X[L_ids]
U = X[U_ids]
numL = L.shape[0]
numU = U.shape[0]
numAll = numL+numU
# I think this is fixed..we never expand this
# y_L = known_labels
# regularization constants
U_final = U_reg
W = W_reg
# run initial classifier
# train classifier on initial labeled (L) instances
# predict labels for unlabeled (U)
weights0 = np.ones(numL)
classifier0 = classify(L, known_labels, weights0, alpha):
# guess the initial weights on U
# what happens to L ?
guess_labels = classifier0.predict(U)
# set initial labels and weights
# same as y_all
all_labels = np.zeros(numAll)
all_labels[L_ids]=known_labels
all_weights = weights0 = np.ones(numAll)
all_weights[L_ids] = W / float(numL)
# for istep in xrange(num_steps):
U_incr = U_final/float(num_steps)
U_norm = 1.0 / float(numU)
for istep, U_val in enumerate(np.arange(U_incr,U_final,U_incr)):
# train a new classifier with
# combined known_labels and guess labels
# scaled weights
all_labels[U_ids]=guess_labels[U_ids]
all_weights[U_ids] = U_step*U_norm
classifier = classify(X, all_labels, all_weights, alpha)
# find the Active set of mis-classified exammples
# or strong violations
# y*(wx+b) < 1
# before we classified, what did we think
Upos_ids = np.where(guess_labels > 0)
Uneg_ids = np.where(guess_labels < 0)
# score = prediction = wx+b seems I hope
U_predictions = classifier.predict(U)
#TODO: check to see if this actually makes sense
# is this actually the active set
# Apos_ids_sorted are ids into Apos_scores
# somehow this seems wrong ... ? but im not sure why
# the predictions are only valid if the |score| > 1
Upos_scores = U_predictions[Upos_ids]
Apos_ids = np.where(Upos_scores < 1.0)
Apos_scores = Upos_scores[Apos_ids]
Apos_ids_sorted = np.argsort(Apos_scores)
Uneg_scores = U_predictions[Uneg_ids]
Aneg_ids = np.where(Uneg_scores > -1.0)
Aneg_scores = Upos_scores[Aneg_ids]
Aneg_ids_sorted = np.argsort(Aneg_scores)
# switch the most violating constraints
# sort ids by score / predictions .. maybe not so easy?
# switch top R/(1-R) pairs
# select R Apos_ids_sorted and (1-R) Aneg_ids_sorted
#
switched_pos_ids, switched_neg_ids = switch_labels(Apos_ids_sorted, Apos_scores, Aneg_ids_sorted, Aneg_scores, R, S)
# form a new set of label guesses based on current predictions
# guess_labels = U_predictions
# converged ?
# repeat
In [7]:
# check that the svm works
L, y_L = sklearn.datasets.load_svmlight_file('qn-s3vm-2014-paper/svmlight.testL.90')
wieghts_0 = np.ones(L.shape[0])
model= classify(L,y_L,wieghts_0)
np.mean(y_L==model.predict(L))
Out[7]:
In [9]:
# step 1: load the unlabeled (U) instances from svmlight format
# (%time reports how long the load takes; y_U is kept only for evaluation)
%time U, y_U = sklearn.datasets.load_svmlight_file('qn-s3vm-2014-paper/svmlight.testU.90')
In [10]:
type(L), type(U)
Out[10]:
In [11]:
L.shape, U.shape
Out[11]:
In [14]:
# pad L with zero columns so L and U share the same feature dimension
# (svmlight files only record up to the highest feature index seen,
# so the two matrices can come back with different widths)
Lpad = np.zeros([L.shape[0],U.shape[1]-L.shape[1]])
print L.shape, Lpad.shape
Ll= sparse.hstack([L,Lpad])
print Ll.shape
In [69]:
# stack padded labeled rows on top of unlabeled rows: X holds all instances
X = sparse.vstack([Ll,U])
X.shape
Out[69]:
In [16]:
# row-index ranges into X: labeled rows come first, then unlabeled rows
L_ids = np.arange(Ll.shape[0])
# fixed: removed stray leading '+' (a no-op unary plus, likely a leftover)
U_ids = np.arange(Ll.shape[0], Ll.shape[0] + U.shape[0])
In [73]:
# labels for the labeled subset (rows L_ids of X)
known_labels = y_L
In [ ]:
# run the incremental self-training transduction over the combined matrix
transduce(X, known_labels, L_ids, U_ids)