notebook.community

Edit and run



In [1]:

    
# Label propagation is a semi-supervised technique that makes use
# of the labeled and unlabeled data to learn about the unlabeled
# data. Quite often, data that will benefit from a classification
# algorithm is difficult to label. For example: labeling data
# might be very expensive, so only a subset is cost-effective to
# manually label.



In [2]:

    
# A problem area is censored data. Imagine a case where the
# frontier of time will affect your ability to gather labeled
# data.



In [10]:

    
from sklearn import datasets
import numpy as np
d = datasets.load_iris()



In [11]:

    
X = d.data.copy()
y = d.target.copy()
names = d.target_names.copy()



In [12]:

    
names = np.append(names, ['unlabeled'])
names









    Out[12]:





array(['setosa', 'versicolor', 'virginica', 'unlabeled'], 
      dtype='|S10')



In [13]:

    
# update y with -1 for the marker of the unlabeled case



In [14]:

    
y[np.random.choice([True, False], len(y))] = -1



In [15]:

    
y[:10]









    Out[15]:





array([-1, -1, -1,  0,  0, -1,  0, -1,  0, -1])



In [16]:

    
names[y[:10]]









    Out[16]:





array(['unlabeled', 'unlabeled', 'unlabeled', 'setosa', 'setosa',
       'unlabeled', 'setosa', 'unlabeled', 'setosa', 'unlabeled'], 
      dtype='|S10')



In [20]:

    
from sklearn import semi_supervised
lp = semi_supervised.LabelPropagation()



In [21]:

    
lp.fit(X, y)









    Out[21]:





LabelPropagation(alpha=1, gamma=20, kernel='rbf', max_iter=30, n_neighbors=7,
         tol=0.001)



In [23]:

    
preds = lp.predict(X)
np.mean(preds == d.target)









    Out[23]:





0.97333333333333338



In [24]:

    
# LabelSpreading is related to LabelPropagation



In [25]:

    
ls = semi_supervised.LabelSpreading()



In [26]:

    
ls.fit(X, y)









    Out[26]:





LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_neighbors=7,
        tol=0.001)



In [27]:

    
np.mean(ls.predict(X) == d.target)









    Out[27]:





0.98666666666666669



In [29]:

    
# Label Propagation works by creating a graph of the data points,
# with weights placed at the edge equal to the following:
#   w[i,j](Theta) = d[i,j] / (Theta^2)
# The algorithm then works by labeled data points propagating
# their labels to the unlabeled data. This propagation is in part
# determined by edge weight.
# The edge weights can be placed in a matrix of transition
# probabilities. We can iteratively determine a good estimate of
# the actual labels.



In [ ]: