In [3]:
import numpy as np
import sklearn
from sklearn.svm import SVC


from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg
from sklearn.datasets import load_svmlight_file

from sklearn.utils.extmath import safe_sparse_dot

In [5]:
# classifier with/ instance weights
# Linear LibSVM 
def classify(X, y, weights, alpha=1.0): 
    classifier = SVC(C=alpha,kernel='linear')
    classifier.fit(X, y, weights)
    return classifier

In [3]:
# TODO; check that the algo actually switches the labels properly
# need some kind of python unit tests
#
# multi-switch algo 
#  switch [R (+)]/[1-R (-)] labels 
#  switch S of them?
#
#  does this make sense?
def switch_labels(Apos_ids, Apos_scores, Aneg_ids, Aneg_scores, R, S) :
    switched_pos_ids = np.empty([])
    switched_neg_ids = np.empty([])

    
    return switched_pos_ids, switched_neg_ids

In [ ]:
#
# function which decreases if we guess a better labeling
#
def objective_function(guess_labels):
    # can we just evaluate the function without retraining
    #  I think so
    #  do we need to check?
    #
    return 0

In [ ]:
def converged():
    return 0

In [2]:
# run the incremental self training algo
#  
def transduce(X, known_labels, L_ids, U_ids, R=0.5, U_reg=1.0, W_reg=0.001, alpha = 1.0, num_steps=1000, num_switches = 1000):
  
    L = X[L_ids]  
    U = X[U_ids]
    
    numL = L.shape[0]
    numU = U.shape[0]
    numAll = numL+numU
    
    # I think this is fixed..we never expand this
    # y_L = known_labels
   
    # regularization constants
    U_final = U_reg
    W  = W_reg
        
    # run initial classifier
    #  train classifier on initial labeled (L) instances
    #  predict labels for unlabeled (U) 
    weights0 = np.ones(numL)
    classifier0 = classify(L, known_labels, weights0, alpha): 

    # guess the initial weights on U
    #  what happens to L ?
    guess_labels =  classifier0.predict(U)  

    # set initial labels and weights
    # same as y_all 
    all_labels = np.zeros(numAll)
    all_labels[L_ids]=known_labels
    
    all_weights =  weights0 = np.ones(numAll)
    all_weights[L_ids] = W / float(numL)
    
    # for istep in xrange(num_steps):

    U_incr = U_final/float(num_steps)
    U_norm = 1.0 / float(numU)  
    for istep, U_val in enumerate(np.arange(U_incr,U_final,U_incr)):
        
        # train a new classifier with
        #   combined known_labels and guess labels
        #   scaled weights
        all_labels[U_ids]=guess_labels[U_ids]
        all_weights[U_ids] = U_step*U_norm 
        classifier = classify(X, all_labels, all_weights, alpha)
        
    
        # find the Active set of mis-classified exammples      
        #  or strong violations
        #  y*(wx+b) < 1

        # before we classified, what did we think
        Upos_ids = np.where(guess_labels > 0)  
        Uneg_ids = np.where(guess_labels < 0)  

        
        # score = prediction = wx+b  seems I hope
        U_predictions = classifier.predict(U)
    
        #TODO:  check to see if this actually makes sense
        #  is this actually the active set
        
        # Apos_ids_sorted are ids into Apos_scores
        #  somehow this seems wrong ... ? but im not sure why
        #  the predictions are only valid if the |score| > 1 
        Upos_scores  = U_predictions[Upos_ids]        
        Apos_ids = np.where(Upos_scores < 1.0)
        Apos_scores = Upos_scores[Apos_ids]
        Apos_ids_sorted = np.argsort(Apos_scores)
        
        Uneg_scores  = U_predictions[Uneg_ids]
        Aneg_ids = np.where(Uneg_scores > -1.0)
        Aneg_scores = Upos_scores[Aneg_ids]
        Aneg_ids_sorted = np.argsort(Aneg_scores)

        

        # switch the most violating constraints
        # sort ids by score / predictions .. maybe not so easy?
        #  switch top R/(1-R) pairs 
    
        # select R Apos_ids_sorted and (1-R) Aneg_ids_sorted
        #  
        switched_pos_ids, switched_neg_ids = switch_labels(Apos_ids_sorted, Apos_scores, Aneg_ids_sorted, Aneg_scores, R, S)
        
        # form a new set of label guesses based on current predictions            
        # guess_labels = U_predictions 
        
        
        # converged ?
        
        
        # repeat


  File "<ipython-input-2-1585ef432c77>", line 15
    labeled_only_clf = classify(X_labeled, known_labels, sample_weights, alpha=1.0):
                                                                                   ^
SyntaxError: invalid syntax

In [7]:
# check that the svm works 
L, y_L = sklearn.datasets.load_svmlight_file('qn-s3vm-2014-paper/svmlight.testL.90')
wieghts_0 = np.ones(L.shape[0])
model= classify(L,y_L,wieghts_0)
np.mean(y_L==model.predict(L))


Out[7]:
1.0

In [9]:
# step 1
# we label the unlabelled data
%time U, y_U = sklearn.datasets.load_svmlight_file('qn-s3vm-2014-paper/svmlight.testU.90')


CPU times: user 2.63 s, sys: 47.5 ms, total: 2.68 s
Wall time: 2.69 s

In [10]:
type(L), type(U)


Out[10]:
(scipy.sparse.csr.csr_matrix, scipy.sparse.csr.csr_matrix)

In [11]:
L.shape, U.shape


Out[11]:
((90, 20926), (36064, 20958))

In [14]:
Lpad = np.zeros([L.shape[0],U.shape[1]-L.shape[1]])
print L.shape, Lpad.shape
Ll= sparse.hstack([L,Lpad])
print Ll.shape


(90, 20926) (90, 32)
(90, 20958)

In [69]:
X = sparse.vstack([Ll,U])
X.shape


Out[69]:
(36154, 20958)

In [16]:
L_ids = np.arange(Ll.shape[0])
U_ids =  + np.arange(Ll.shape[0],Ll.shape[0]+U.shape[0])

In [73]:
known_labels = y_L

In [ ]:
transduce(X, known_labels, L_ids, U_ids)