In [1]:
import io
import os.path
import sys
import warnings
from collections import Counter
from pprint import pprint
from time import time

import joblib
import numpy as np
import scipy
import scipy.sparse
import scipy.sparse as sp
import sklearn
import sklearn.cross_validation
import sklearn.datasets
import sklearn.metrics
import sklearn.svm
from sklearn.datasets import fetch_20newsgroups
from sklearn.externals.six import u, b
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
warnings.filterwarnings('ignore')
%pylab inline
In [2]:
# Load the 20 newsgroups corpus through scikit-learn, using the current
# working directory as its data_home.
dataset = fetch_20newsgroups(
    data_home=".",
)
In [3]:
# Collapse the newsgroup targets into a binary +1/-1 problem: groups are taken
# in index order until their cumulative document count first exceeds 5600;
# every group after that cut-off index is labelled +1, the rest -1.
group_names = dataset.target_names

# Number of documents per newsgroup, in target-index order.
group_counts = []
for idx in range(len(group_names)):
    group_counts.append((dataset.target == idx).sum())
print(group_counts)

# Index of the first group at which the running total passes 5600 documents.
cumulative_counts = np.cumsum(group_counts)
half_docs_group_index = np.flatnonzero(cumulative_counts > 5600)[0]
print(half_docs_group_index)

# Map the boolean "after the cut-off" mask {False, True} onto {-1, +1}.
after_cutoff = dataset.target > half_docs_group_index
y = 2 * after_cutoff.astype(int) - 1
print(y.sum())

# One integer id per document, matching the row order of the labels.
instance_ids = np.arange(y.size)
print(instance_ids)
In [4]:
# TF-IDF features for every document. Terms appearing in more than half of the
# documents (max_df=0.5) or in fewer than four documents (min_df=4) are dropped.
vectorizer = TfidfVectorizer(
    min_df=4,
    max_df=0.5,
)
X = vectorizer.fit_transform(dataset.data)
In [5]:
# Dimensions of the TF-IDF matrix produced by the vectorizer.
print(X.shape)
In [6]:
# Class balance of the binary labels (Counter is imported in the top import cell).
Counter(y)
Out[6]:
In [7]:
# Fully supervised baseline: an L2-penalised linear SVM (C=10), solved in the
# primal (dual=False) and fitted on all documents with their labels.
svm = sklearn.svm.LinearSVC(
    C=10,
    penalty='l2',
    dual=False,
)
svm.fit(X, y)
Out[7]:
In [8]:
# Accuracy of the fitted SVM on the very data it was trained on.
reconstruction_accuracy = sklearn.metrics.accuracy_score(y, svm.predict(X))
print(reconstruction_accuracy)

# Mean cross-validated score. The fold count is defined once in `ncv` and
# passed to cross_val_score, so the average cannot drift out of sync with the
# number of folds actually run.
ncv = 10
cv_scores = sklearn.cross_validation.cross_val_score(svm, X, y, cv=ncv)
print(cv_scores.mean())
In [9]:
# Export the complete feature matrix and labels to 'liblinear.all.in' in
# svmlight format, numbering features from 1 (zero_based=False).
sklearn.datasets.dump_svmlight_file(
    X, y, 'liblinear.all.in',
    zero_based=False,
)
In [10]:
def print_svmlight_infiles(L, y_l, U, y_u, test_size):
    """Write the svmlight-format training and test files for one split.

    Three files are produced, each suffixed with ``test_size`` formatted to two
    decimals:

    * ``svmlight.train.*`` -- rows of ``L`` followed by rows of ``U``; the rows
      coming from ``U`` are given label 0 (``y_u * 0``).
    * ``svmlight.testL.*`` -- ``L`` with its labels ``y_l``.
    * ``svmlight.testU.*`` -- ``U`` with its labels ``y_u``.

    Parameters
    ----------
    L, U : feature matrices of the labeled / unlabeled examples.
    y_l, y_u : label arrays for ``L`` and ``U``.
    test_size : number used only to build the file-name suffix.
    """
    stacked = scipy.sparse.vstack((L, U))
    stacked_labels = np.hstack((y_l, y_u * 0))
    print("%s %s" % (stacked.shape, stacked_labels.shape))

    outputs = (
        ('svmlight.train.%.2f', stacked, stacked_labels),
        ('svmlight.testL.%.2f', L, y_l),
        ('svmlight.testU.%.2f', U, y_u),
    )
    for name_pattern, matrix, labels in outputs:
        sklearn.datasets.dump_svmlight_file(
            matrix, labels, name_pattern % test_size, zero_based=False)
In [11]:
def dump_svmlin(X, y, fX, fy):
    """Write a feature matrix and its labels to two separate binary streams.

    For every row ``i`` of ``X`` one line is written to ``fX`` holding the
    row's non-zero entries as space-separated ``index:value`` pairs (indices
    are 1-based, values use ``%.16g``), and one line is written to ``fy``
    holding ``y[i]`` formatted as an integer.

    Parameters
    ----------
    X : scipy sparse matrix (any format) or 2-D numpy array.
    y : sequence of labels, indexable by row number.
    fX, fy : file-like objects opened for binary writing.
    """
    value_pattern = u"%d:%.16g"

    is_sparse = hasattr(X, "tocsr")
    if is_sparse:
        # The row slicing below relies on CSR's indptr/indices/data arrays.
        # Other sparse formats (e.g. the COO matrix scipy.sparse.vstack can
        # return) do not have them, so convert first.
        X = X.tocsr()

    for i in range(X.shape[0]):
        if is_sparse:
            span = slice(X.indptr[i], X.indptr[i + 1])
            row = zip(X.indices[span], X.data[span])
        else:
            nz = X[i] != 0
            row = zip(np.where(nz)[0], X[i, nz])

        features = " ".join(value_pattern % (j + 1, x) for j, x in row)
        # Each example line keeps the leading space the original format used.
        fX.write((u" %s\n" % features).encode('ascii'))
        # One label per line; without the newline all labels run together
        # (e.g. "1-11-1") and cannot be read back.
        fy.write((u"%d\n" % y[i]).encode('ascii'))
In [14]:
def print_svmlin_infiles(L, y_l, U, y_u, test_size):
    """Write the SVMlin example/label file pairs for one split.

    Three pairs of files are produced, each suffixed with ``test_size``
    formatted to two decimals:

    * ``svmlin.train.{examples,labels}.*`` -- rows of ``L`` followed by rows of
      ``U``; the rows coming from ``U`` are given label 0 (``y_u * 0``).
    * ``svmlin.testL.{examples,labels}.*`` -- ``L`` with its labels ``y_l``.
    * ``svmlin.testU.{examples,labels}.*`` -- ``U`` with its labels ``y_u``.

    Every file is opened in a ``with`` block so it is flushed and closed as
    soon as it has been written, rather than being left open.

    Parameters
    ----------
    L, U : feature matrices of the labeled / unlabeled examples.
    y_l, y_u : label arrays for ``L`` and ``U``.
    test_size : number used only to build the file-name suffix.
    """
    print("print_svmlin_infiles...")
    # vstack may return a non-CSR matrix; convert so the row-wise writer can
    # rely on the CSR index arrays.
    A = scipy.sparse.vstack((L, U)).tocsr()
    unk_l = y_u * 0
    a_l = np.hstack((y_l, unk_l))
    print("%s %s" % (A.shape, a_l.shape))

    outputs = (
        ('svmlin.train.examples.%.2f', 'svmlin.train.labels.%.2f', A, a_l),
        ('svmlin.testL.examples.%.2f', 'svmlin.testL.labels.%.2f', L, y_l),
        ('svmlin.testU.examples.%.2f', 'svmlin.testU.labels.%.2f', U, y_u),
    )
    for examples_pattern, labels_pattern, matrix, labels in outputs:
        with open(examples_pattern % test_size, "wb") as examples_file, \
                open(labels_pattern % test_size, "wb") as labels_file:
            dump_svmlin(matrix, labels, examples_file, labels_file)
In [15]:
def print_universvm_infiles(L, y_l, U, y_u, test_size):
    """Write the svmlight-format files used as UniverSVM input for one split.

    Three files are produced, each suffixed with ``test_size`` formatted to two
    decimals:

    * ``universvm.train.*`` -- rows of ``L`` followed by rows of ``U``; the rows
      coming from ``U`` are given label -3 (``y_u * 0 - 3``).
      NOTE(review): -3 is presumably the marker UniverSVM expects for
      unlabeled points -- confirm against the tool's documentation.
    * ``universvm.testL.*`` -- ``L`` with its labels ``y_l``.
    * ``universvm.testU.*`` -- ``U`` with its labels ``y_u``.

    Parameters
    ----------
    L, U : feature matrices of the labeled / unlabeled examples.
    y_l, y_u : label arrays for ``L`` and ``U``.
    test_size : number used only to build the file-name suffix.
    """
    stacked = scipy.sparse.vstack((L, U))
    stacked_labels = np.hstack((y_l, y_u * 0 - 3))
    print("%s %s" % (stacked.shape, stacked_labels.shape))

    outputs = (
        ('universvm.train.%.2f', stacked, stacked_labels),
        ('universvm.testL.%.2f', L, y_l),
        ('universvm.testU.%.2f', U, y_u),
    )
    for name_pattern, matrix, labels in outputs:
        sklearn.datasets.dump_svmlight_file(
            matrix, labels, name_pattern % test_size, zero_based=False)
In [16]:
# Split the documents into a "labeled" part (the train side of the shuffle
# split) and an "unlabeled" part (the test side, a fraction `test_size` of the
# data), then write the input files for each semi-supervised tool.
test_size = 0.05

# A fixed random_state makes the split -- and therefore every exported file --
# identical across kernel restarts.
splits = sklearn.cross_validation.StratifiedShuffleSplit(
    y, n_iter=1, test_size=test_size, random_state=0)
labeled_indices, unlabeled_indices = next(iter(splits))

L = X[labeled_indices]
L_ids = instance_ids[labeled_indices]
y_l = y[labeled_indices]

U = X[unlabeled_indices]
U_ids = instance_ids[unlabeled_indices]
y_u = y[unlabeled_indices]

print("%s %s %s" % (X.shape, L.shape, U.shape))

print_svmlight_infiles(L, y_l, U, y_u, test_size)
print_universvm_infiles(L, y_l, U, y_u, test_size)
print_svmlin_infiles(L, y_l, U, y_u, test_size)
In [1]:
In [1]:
In [1]:
In [ ]: