In [1]:
# Label propagation is a semi-supervised technique that makes use
# of the labeled and unlabeled data to learn about the unlabeled
# data. Quite often, data that will benefit from a classification
# algorithm is difficult to label. For example: labeling data
# might be very expensive, so only a subset is cost-effective to
# manually label.

In [2]:
# A problem area is censored data. Imagine a case where the
# frontier of time will affect your ability to gather labeled
# data.

In [10]:
from sklearn import datasets
import numpy as np
d = datasets.load_iris()

In [11]:
X = d.data.copy()
y = d.target.copy()
names = d.target_names.copy()

In [12]:
names = np.append(names, ['unlabeled'])
names


Out[12]:
array(['setosa', 'versicolor', 'virginica', 'unlabeled'], 
      dtype='|S10')

In [13]:
# update y with -1 for the marker of the unlabeled case

In [14]:
y[np.random.choice([True, False], len(y))] = -1

In [15]:
y[:10]


Out[15]:
array([-1, -1, -1,  0,  0, -1,  0, -1,  0, -1])

In [16]:
names[y[:10]]


Out[16]:
array(['unlabeled', 'unlabeled', 'unlabeled', 'setosa', 'setosa',
       'unlabeled', 'setosa', 'unlabeled', 'setosa', 'unlabeled'], 
      dtype='|S10')

In [20]:
from sklearn import semi_supervised
lp = semi_supervised.LabelPropagation()

In [21]:
lp.fit(X, y)


Out[21]:
LabelPropagation(alpha=1, gamma=20, kernel='rbf', max_iter=30, n_neighbors=7,
         tol=0.001)

In [23]:
preds = lp.predict(X)
np.mean(preds == d.target)


Out[23]:
0.97333333333333338

In [24]:
# LabelSpreading is related to LabelPropagation

In [25]:
ls = semi_supervised.LabelSpreading()

In [26]:
ls.fit(X, y)


Out[26]:
LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_neighbors=7,
        tol=0.001)

In [27]:
np.mean(ls.predict(X) == d.target)


Out[27]:
0.98666666666666669

In [29]:
# Label Propagation works by creating a graph of the data points,
# with weights placed at the edge equal to the following:
#   w[i,j](Theta) = d[i,j] / (Theta^2)
# The algorithm then works by labeled data points propagating
# their labels to the unlabeled data. This propagation is in part
# determined by edge weight.
# The edge weights can be placed in a matrix of transition
# probabilities. We can iteratively determine a good estimate of
# the actual labels.

In [ ]: