This example demonstrates how an active learning algorithm can be implemented within the reactive framework. In this scenario, we use a margin-based active learning algorithm with an SVM classifier. After setting up the problem, the practitioner can view the examples that need to be labeled, assign their labels, and track the accuracy of the classifier.


In [ ]:
import cPickle, gzip

# Load MNIST and keep only the 0 and 1 digits for a binary classification task
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()
ytr = train_set[1]
filter_tr = (ytr==0)+(ytr==1)
X_train = train_set[0][filter_tr,:]
y_train = train_set[1][filter_tr,None]
# map labels {0,1} to {-1,+1} for the SVM
y_train = y_train*2-1

import dreaml as dm
from dreaml.server import start
import numpy as np

# create the reactive DataFrame and launch the web frontend
df = dm.DataFrame()
start(df)
# raw inputs and an (initially all-zero) label vector; 0 means "unlabeled"
df["data/", "input/raw/"].set_matrix(X_train)
df["data/", "input/label/"].set_matrix(np.zeros(y_train.shape,dtype=int))
# number of samples per batch
m = 50
# margin cutoff: examples with |margin| <= b are left for the practitioner to label
b = 1

# label the first batch with its true labels
T = df["data/", "input/label/"].get_matrix()
T[:m,:] = y_train[:m,:]
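
Before wiring anything into the reactive framework, the margin-based query rule itself can be sketched in plain scikit-learn. This is a minimal, standalone illustration: fit an SVM on the first labeled batch, compute the margins of the next batch, and count how many examples would be labeled automatically versus sent to the practitioner. The `sketch_svm` variable and the direct use of the true labels `y_train` are only for illustration and are not part of the pipeline that follows.


In [ ]:
# standalone sketch of the margin-based query rule (not used by the pipeline below)
from sklearn.svm import LinearSVC
import numpy as np

sketch_svm = LinearSVC()
sketch_svm.fit(X_train[:m,:], y_train[:m,:].squeeze())
# signed distances to the decision boundary for the next batch of m examples
margins = sketch_svm.decision_function(X_train[m:2*m,:])
auto = np.abs(margins) > b
print "%d auto-labeled, %d sent to the practitioner" % (auto.sum(), (~auto).sum())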

First, we set up the internal algorithm that learns a weight vector from the labeled samples. We use LinearSVC from scikit-learn, and write a simple class method (`next_batch`) that re-applies the algorithm as more examples become labeled.


In [ ]:
from dreaml.dataframe.transform import BatchTransform,ContinuousTransform
from sklearn.svm import LinearSVC
class SVC(BatchTransform):
    # number of examples seen so far
    mk = m
    def func(self,target_df,X_df,y_df):
        X = X_df.get_matrix()[:self.mk,:]
        y = y_df.get_matrix()[:self.mk,:].squeeze()

        # fit the SVM on all labeled examples (label 0 means "unlabeled")
        I = y!=0
        X0 = X[I,:]
        y0 = y[I]
        self.svm = LinearSVC()
        self.svm.fit(X0,y0)
        self.target_df = target_df
        target_df.set_matrix(self.svm.coef_)
    def next_batch(self):
        # advance to the next batch of m examples and re-fit
        self.mk += m
        self.apply(self.target_df)
        
df["weights/","input/raw/"] = SVC(df["data/", "input/raw/"],
                                  df["data/", "input/label/"])

Here, we write the margin-based active learning algorithm. It computes each example's margin under the current weight vector: examples outside the margin cutoff b are labeled automatically with the predicted sign, while examples inside it are left unlabeled (set to 0) for the practitioner to label.


In [ ]:
class MarginBasedAL(ContinuousTransform):
    def init_func(self,target_df,X_df,y_df,w_df):
        target_df.set_matrix(np.zeros(y_df.shape))
        self.next_batch = True
    def continuous_func(self,target_df,X_df,y_df,w_df):
        if self.next_batch:
            # pull the current classifier from the SVC transform
            mk = w_df.T.mk
            coef = w_df.T.svm.coef_
            intercept = w_df.T.svm.intercept_

            X = X_df.get_matrix()
            y = y_df.get_matrix()
            margins = target_df.get_matrix()
            # compute margins for the next batch of m examples
            margins[mk:mk+m] = X[mk:mk+m,:].dot(coef.T)+intercept
            for j in range(mk,mk+m):
                if np.abs(margins[j]) > b:
                    # confident prediction: label automatically
                    y[j] = np.sign(margins[j])
                else:
                    # inside the margin: leave unlabeled for the practitioner
                    y[j] = 0
            self.mk = mk
            self.next_batch = False
        # once the SVC transform has advanced, process the next batch
        if self.mk < w_df.T.mk:
            self.next_batch = True
        
df["data/","margins/"] = MarginBasedAL(df["data/", "input/raw/"],
                                       df["data/", "input/label/"],
                                       df["weights/","input/raw/"])

Next, we draw each example that the algorithm needs the practitioner to label. As soon as the image is labeled, the transformation moves on to the next unlabeled example. The index of the unlabeled example currently being displayed is saved in the target.


In [ ]:
from bokeh.plotting import figure
from dreaml.dataframe.transform import FigureTransform

class DrawUnlabeled(FigureTransform):
    def create_figure(self,target_df,X_df,y_df,w_df):
        # 28x28 image canvas for the current MNIST digit
        p = figure(x_range=[0,28],y_range=[0,28])
        M = np.zeros((28,28))
        p.image(image=[M], x=[0], y=[0], dw=[28], dh=[28],name="active")
        self.target_df = target_df
        self.X_df = X_df
        self.y_df = y_df
        self.w_df = w_df
        return p
    def update(self,p):
        mk = self.w_df.T.mk
        y = self.y_df.get_matrix()
        # find the first unlabeled example in the current batch
        for j in range(mk,mk+m):
            if y[j]==0:
                break
        if y[j]==0:
            # display it and record its index in the target
            x = self.X_df.get_matrix()[j,:].reshape(28,28)
            rs = p.select(dict(name="active"))
            rs[0].data_source.data.update(image=[x])
            self.target_df.set_matrix(np.array([[j]]))
        else:
            # the whole batch is labeled; re-fit the SVM and move on
            print "next batch"
            self.w_df.T.next_batch()

df["image/","number/"] = DrawUnlabeled(df["data/", "input/raw/"],
                                       df["data/", "input/label/"],
                                       df["weights/","input/raw/"])

Finally, we plot the accuracy of the classifier on a validation set to track our progress.


In [ ]:
# build the validation set (0 and 1 digits only) and map labels to {-1,+1}
yval = valid_set[1]
filter_val = (yval==0)+(yval==1)
X_valid = valid_set[0][filter_val,:]
y_valid = valid_set[1][filter_val]
y_valid = y_valid*2-1
def accuracy():
    # validation accuracy of the current SVM, paired with the number of examples seen so far
    y_predict = df["weights/","input/raw/"].T.svm.predict(X_valid)
    return ([np.mean(y_valid==y_predict)],[df["weights/","input/raw/"].T.mk])
df["plot/","err/"] = dm.Plotter(accuracy,
                                "accuracy",
                                legend=["accuracy"])

y = df["data/", "input/label/"].get_matrix()
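
The accuracy function can also be called by hand for a quick spot check, independently of the live plot:


In [ ]:
# manual spot check: returns ([validation accuracy], [number of training examples seen so far])
accuracy()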

Here is where the active part of the algorithm comes in. The frontend shows the first example that needs to be labeled. To label it, we directly set the corresponding entry in the label matrix: -1 for a zero digit, 1 for a one digit. After evaluating one of the following lines, the frontend image automatically updates with the next example.


In [ ]:
y[df["image/","number/"].get_matrix()[0,0]] = -1   # the displayed digit is a zero

In [ ]:
y[df["image/","number/"].get_matrix()[0,0]] = 1    # the displayed digit is a one
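
If many examples need labels, a small helper keeps these one-liners tidy. This is a minimal sketch: `label_current` is not part of dreaml, it simply wraps the two statements above.


In [ ]:
# hypothetical convenience wrapper around the two labeling statements above
def label_current(value):
    # value is -1 for a zero digit, +1 for a one digit
    j = df["image/","number/"].get_matrix()[0,0]
    y[j] = value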

In [ ]: