In [1]:
# Label propagation is a semi-supervised technique that makes use
# of the labeled and unlabeled data to learn about the unlabeled
# data. Quite often, data that will benefit from a classification
# algorithm is difficult to label. For example: labeling data
# might be very expensive, so only a subset is cost-effective to
# manually label.
In [2]:
# One problem area is censored data. Imagine a case where outcomes
# are only observed after some delay, so labels are not yet available
# for the most recent observations even though their features are.
In [10]:
from sklearn import datasets
import numpy as np
# Load the iris dataset; `d` is a Bunch exposing .data (features),
# .target (integer labels), and .target_names (label strings).
d = datasets.load_iris()
In [11]:
# Work on copies so the arrays inside the original Bunch stay
# untouched when we overwrite labels below.
X = d.data.copy()
y = d.target.copy()
names = d.target_names.copy()
In [12]:
# Extend the class names with a sentinel for unlabeled points.
# Because numpy indexing wraps around at -1, names[-1] resolves to
# 'unlabeled', which pairs with the -1 marker that sklearn's
# semi-supervised estimators use for "no label".
names = np.concatenate((names, ['unlabeled']))
names
Out[12]:
In [13]:
# update y with -1 for the marker of the unlabeled case
In [14]:
y[np.random.choice([True, False], len(y))] = -1
In [15]:
y[:10]
Out[15]:
In [16]:
names[y[:10]]
Out[16]:
In [20]:
from sklearn import semi_supervised
# LabelPropagation spreads the known labels (y != -1) to the
# unlabeled points over a similarity graph built from the features.
lp = semi_supervised.LabelPropagation()
In [21]:
lp.fit(X, y)
Out[21]:
In [23]:
preds = lp.predict(X)
# Fraction of predictions matching the true iris labels (including
# the points whose labels were masked). NOTE: this is evaluated on
# the training data itself, so it is an optimistic estimate.
np.mean(preds == d.target)
Out[23]:
In [24]:
# LabelSpreading is related to LabelPropagation
In [25]:
ls = semi_supervised.LabelSpreading()
In [26]:
ls.fit(X, y)
Out[26]:
In [27]:
np.mean(ls.predict(X) == d.target)
Out[27]:
In [29]:
# Label Propagation works by creating a graph of the data points,
# with edge weights derived from pairwise distances, e.g. an
# RBF-style kernel:
# w[i,j](Theta) = exp(-d[i,j]^2 / Theta^2)
# The algorithm then works by labeled data points propagating
# their labels to the unlabeled data. This propagation is in part
# determined by edge weight.
# The edge weights can be placed in a matrix of transition
# probabilities. We can iteratively determine a good estimate of
# the actual labels.
In [ ]: