In [80]:
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
import scipy
import scipy.sparse
import joblib
import sklearn
import sklearn.datasets
import sklearn.cross_validation
import sys
from time import time
import numpy as np
%pylab inline
In [81]:
# Load the feature matrix X and target vector y from the svmlight-format file.
# NOTE(review): path is relative to the notebook's working directory.
X, y = sklearn.datasets.load_svmlight_file('data/news20.binary')
In [82]:
# One integer id per instance: 0 .. y.size - 1, in the row order of y.
instance_ids = np.arange(y.size)
In [ ]:
def test_label_prop(X, y, labeled_indices, unlabeled_indices):
L = X[labeled_indices]
L_ids = instance_ids[labeled_indices]
U = X[unlabeled_indices]
U_ids = instance_ids[unlabeled_indices]
y_l = y[labeled_indices]
y_u = y[unlabeled_indices]
yp = (y+1)/2
y_lp = (y_l+1)/2
y_up = (y_u+1)/2
l_ssl = np.copy(yp)
l_ssl[unlabeled_indices]=-1
for g in [100,500,1000]:
for a in [0.8, 0.9, 1.0]:
label_prop_model = LabelPropagation(kernel='rbf', gamma=g, alpha = a ,max_iter=100)
fit = label_prop_model.fit(X,l_ssl)
sl = fit.score(L,y_lp)
su = fit.score(U,y_up)
print "LP rbf", g, a, sl, su
for k in [1,2,3,4,5]:
for a in [0.8, 0.9, 1.0]:
label_prop_model = LabelPropagation(kernel='knn', n_neighbors=k, alpha = a ,max_iter=100)
fit = label_prop_model.fit(X,l_ssl)
sl = fit.score(L,y_lp)
su = fit.score(U,y_up)
print "LP knn", k, a, sl, su
label_spread_model = LabelSpreading(kernel='knn', n_neighbors=k, alpha = a ,max_iter=100)
fit = label_spread_model.fit(X,l_ssl)
sl = fit.score(L,y_lp)
su = fit.score(U,y_up)
print "LS knn", k, a, sl, su
In [102]:
# Sweep the fraction of instances placed on the "test" side of the split;
# those instances are the ones passed to test_label_prop as unlabeled.
for test_size in [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]:
    # random_state pins the shuffle so re-running the cell reproduces the
    # same split (and therefore the same printed scores).
    splits = sklearn.cross_validation.StratifiedShuffleSplit(
        y, n_iter=1, test_size=test_size, random_state=0)
    # n_iter=1, so only the first (train, test) index pair is needed;
    # next(iter(...)) replaces the Python-2-only .__iter__().next() call.
    labeled_indices, unlabeled_indices = next(iter(splits))
    print("testing split:  %s" % test_size)
    test_label_prop(X, y, labeled_indices, unlabeled_indices)
    print(".....................")
In [ ]:
In [ ]: