``````

In [ ]:

import numpy as np
import sklearn
from sklearn.linear_model import Ridge

from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg

from sklearn.utils.extmath import safe_sparse_dot

``````
``````

In [2]:

import numpy as np
import sklearn
from sklearn.linear_model import Ridge

from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg

from sklearn.utils.extmath import safe_sparse_dot

``````
``````

In [ ]:

# for some reason, the python ridge regression library does not
# allow rescaled data for the sparse_cg solver... no idea why
#  TODO: probably should test this
def _rescale_data(X, y, sample_weight):
    """Scale the rows of ``X`` and ``y`` by the square root of their weights.

    Parameters
    ----------
    X : matrix with one row per sample (only ``X.shape[0]`` is read here).
    y : targets, multiplied by the same diagonal scaling as ``X``.
    sample_weight : scalar or per-sample weights; a scalar is broadcast to
        one weight per sample.

    Returns
    -------
    (X, y) : the rescaled data and targets.
    """
    row_count = X.shape[0]
    # Broadcast to one weight per row, then take the square root.
    root_weights = np.sqrt(sample_weight * np.ones(row_count))
    # Diagonal matrix with the root weights on the main diagonal.
    scaler = sparse.dia_matrix((root_weights, 0), shape=(row_count, row_count))
    return safe_sparse_dot(scaler, X), safe_sparse_dot(scaler, y)

``````
``````

In [3]:

# classifier w/ instance weights
# current:  Ridge Regression w/ rescaled data and real labels
def classify(X, y, sample_weight, alpha):
    """Fit a Ridge model on sample-weight-rescaled data and return it.

    The data is first passed through ``_rescale_data`` with ``sample_weight``;
    the Ridge model is then fit without an intercept using the ``sparse_cg``
    solver and regularisation strength ``alpha``.
    """
    X_scaled, y_scaled = _rescale_data(X, y, sample_weight)
    model = Ridge(solver='sparse_cg', fit_intercept=False, alpha=alpha)
    model.fit(X_scaled, y_scaled)
    return model

``````
``````

In [ ]:

# select the highest confidence documents using our model
# X = X[unlabelled_ids]
#  classify all docs
#  select based on score  R(+) , (1-R) (-)
def select_high_confidence_results(X, R, classifier):
    """Return the ids of the highest-confidence documents under ``classifier``.

    NOTE: this is still a placeholder. No scoring or selection is performed
    and an empty list is always returned. Per the notebook notes, the plan is
    to score all documents and keep the top ``R`` positive and bottom
    ``1 - R`` negative scores.

    Parameters
    ----------
    X : data to score (currently unused).
    R : fraction of positive selections (currently unused).
    classifier : fitted model used for scoring (currently unused).

    Returns
    -------
    list : selected document ids (currently always empty).
    """
    high_c_ids = []

    # TODO: apply to all data

    # TODO: select the top R positive, bottom (1-R) negative scores

    return high_c_ids

``````
``````

In [ ]:

# self training step
#  apply classifier to documents w/labels + guessed_labels
#  add R fraction of [+] documents (and 1-R [-]) to the guessed_labels set
#  retrain, with guessed_labels weighted down
def self_train_step(X, y, W, U, R, alpha, labeled_ids, guessed_ids, unlabeled_ids):
    """Run one self-training step: fit on labeled plus currently guessed docs.

    Labeled rows share a total weight of ``W`` and guessed rows share a total
    weight of ``U`` (normalised by the number of guessed rows, not by the
    number of unlabeled rows). The model is fit through ``classify``.

    Parameters
    ----------
    X, y : full data and label arrays, indexed by the id arrays below.
    W : total sample weight spread over the labeled rows.
    U : total sample weight spread over the guessed rows.
    R : target fraction of positive guesses (not used yet).
    alpha : ridge regularisation strength passed to ``classify``.
    labeled_ids : integer row ids with real labels; must be non-empty.
    guessed_ids : integer row ids with guessed labels; may be empty and may
        be a plain list.
    unlabeled_ids : remaining row ids (not used yet).

    Returns
    -------
    int : always 0 for now; the selection/switching stages are not implemented.

    Raises
    ------
    ValueError : if ``labeled_ids`` is empty.
    """
    # Accept lists as well as arrays; an empty list becomes an empty int array.
    labeled_ids = np.asarray(labeled_ids, dtype=np.intp)
    guessed_ids = np.asarray(guessed_ids, dtype=np.intp)
    if labeled_ids.size == 0:
        raise ValueError("self_train_step needs at least one labeled id")

    # apply classifier with sample weights W and U
    # to labeled docs and current guess labels
    if guessed_ids.size > 0:
        current_labels = np.union1d(labeled_ids, guessed_ids)
    else:
        current_labels = labeled_ids

    X_current = X[current_labels]
    y_current = y[current_labels]

    # Create weights for all rows, then select the current ones: wasteful
    # but simple. Use an explicit float buffer so the weights do not inherit
    # an integer label dtype and unused slots are well defined.
    instance_weights = np.zeros(X.shape[0], dtype=float)
    instance_weights[labeled_ids] = W / float(labeled_ids.size)
    # Guessed rows get weight U; skip when empty to avoid dividing by zero.
    # notice: we normalize by num_guessed, not num_unlabelled
    if guessed_ids.size > 0:
        instance_weights[guessed_ids] = U / float(guessed_ids.size)

    current_weights = instance_weights[current_labels]

    # Fitted model; not consumed yet because the stages below are TODO.
    current_model = classify(X_current, y_current, current_weights, alpha)

    # TODO: add the R/(1-R) high confidence (+)/(-) documents to the guessed set

    # TODO: switch based on the current set of guesses total
    # so can switch out labels that were not present earlier

    return 0

``````
``````

In [3]:

# multi-switch algo
#  switch [R (+)]/[1-R (-)] labels
#  if they make the current fit better
#
def switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R):
    """Placeholder for the multi-switch step; returns the number of labels switched.

    No switching is performed yet, so the reported count is always zero and
    none of the arguments are read.
    """
    return 0

``````
``````

In [ ]:

# metric to decide if we switch the labels or not
# can we use the margin even for Regularized Least Squares?
# the regularizer is the same
#
#  if Xw=y, then w=(X^-1)y
#  #=> we need the current version of the classifier, with weights set
# some function of the classifier
def switch_metric(classifier):
    """Metric used to decide whether a label switch should be kept.

    NOTE: placeholder. The metric is not implemented; ``classifier`` is
    ignored and 0 is always returned.
    """
    return 0

``````
``````

In [ ]:

# run the incremental self training algo
#
def self_train(X, y, labeled_ids, unlabeled_ids, R=0.5, U=1, W=0.001, alpha=1.0):
    """Run the incremental self-training algorithm (currently a single step).

    The guessed-sample weight is ramped as ``U_step = U_step_size * istep * U``.
    Only the first step (``istep = 1``) is executed for now; the loop over
    steps and any stopping criterion are still TODO.

    Parameters
    ----------
    X, y : full data and label arrays.
    labeled_ids : row ids with real labels.
    unlabeled_ids : row ids without labels.
    R : target fraction of positive guesses.
    U : final total weight for guessed samples.
    W : total weight for labeled samples.
    alpha : ridge regularisation strength.

    Returns
    -------
    None
    """
    U_step_size = 0.001
    istep = 1
    # or, equivalently, num_steps = 1000
    #   U_step_size = 1/num_steps

    # TODO: run initial classifier on X[labeled_ids], y[labeled_ids]

    # Start with no guessed labels. An empty integer array (rather than a
    # list) so it supports .shape and can be used to index X and y.
    guessed_ids = np.empty(0, dtype=np.intp)

    # TODO: loop over istep = start to finish
    #  U_step_size*U to U in increments U_step_size
    #  or: break at maximum U steps
    #  or: break at some convergence criteria?

    # set guessed_sample_weights
    U_step = (U_step_size * istep) * U

    # apply current classifier to remaining unlabeled data
    #  note:  U = U_step
    self_train_step(X, y, W, U_step, R, alpha, labeled_ids, guessed_ids, unlabeled_ids)

    switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R)

    # TODO: stop or keep switching?
    #  just run all the way to the end?

``````
``````

In [ ]:

``````