In [1]:
import sys
from time import time
from pprint import pprint

import io
import os.path

import numpy as np
import scipy.sparse
import joblib

import sklearn
import sklearn.svm
import sklearn.metrics
import sklearn.datasets
import sklearn.cross_validation

from sklearn.datasets import fetch_20newsgroups
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.externals.six import u

import warnings
warnings.filterwarnings('ignore')

%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
dataset = fetch_20newsgroups(data_home=".")

In [3]:
group_names = dataset.target_names
group_counts = [(dataset.target == idx).sum() for idx in range(len(group_names))]
print group_counts
# Find the group index at which the cumulative document count first exceeds
# 5600, i.e. roughly half of the 11314 documents.
half_docs_group_index = np.where(np.cumsum(group_counts) > 5600)[0][0]
print half_docs_group_index
# Binary target in {-1, +1}: groups 0..half_docs_group_index -> -1, the rest -> +1.
y = (dataset.target > half_docs_group_index).astype(int)*2 - 1
print y.sum()
instance_ids = np.arange(y.size)
print instance_ids


[480, 584, 591, 590, 578, 593, 585, 594, 598, 597, 600, 595, 591, 594, 593, 599, 546, 564, 465, 377]
9
-266
[    0     1     2 ..., 11311 11312 11313]
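
This bisects the 20 newsgroups into two halves of roughly equal size: groups 0 through 9 map to class -1 and groups 10 through 19 to class +1. The printed -266 is the resulting class imbalance; as a quick sanity check (a sketch, not a cell from the original run):

    print (y == 1).sum() - (y == -1).sum()   # 5524 - 5790 = -266

which matches the Counter output in In [6] below.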

In [4]:
# Drop terms appearing in more than half of the documents or in fewer than 4.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=4)
X = vectorizer.fit_transform(dataset.data)

In [5]:
print X.shape


(11314, 30792)

In [6]:
from collections import Counter
Counter(y)


Out[6]:
Counter({-1: 5790, 1: 5524})

In [7]:
svm = sklearn.svm.LinearSVC(penalty='l2', C=10, dual=False)
svm.fit(X, y)


Out[7]:
LinearSVC(C=10, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [8]:
reconstruction_accuracy = sklearn.metrics.accuracy_score(y, svm.predict(X))
print reconstruction_accuracy
ncv = 10
print sklearn.cross_validation.cross_val_score(svm, X, y, cv=ncv).mean()


0.99991161393
0.959962102395
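
The SVM nearly memorizes the full corpus (99.99% reconstruction accuracy), while 10-fold cross-validation puts fully supervised accuracy near 96%. That 96% is the reference point for the semi-supervised input files prepared below, which withhold part of the labels from the learner.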

In [9]:
sklearn.datasets.dump_svmlight_file(X, y, 'liblinear.all.in', zero_based=False)
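
dump_svmlight_file writes the standard SVMlight/LIBSVM text format: one example per line as "label index:value ...", with 1-based feature indices because of zero_based=False. To eyeball the result (a sketch, not part of the original notebook; the exact values depend on the TF-IDF fit):

    with open('liblinear.all.in') as f:
        print f.readline()[:80]   # e.g. "-1 12:0.0466 31:0.0821 ..."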

In [10]:
def print_svmlight_infiles(L, y_l, U, y_u, test_size):
    # Stack the labeled examples on top of the unlabeled ones; the unlabeled
    # block gets label 0, SVMlight's marker for transduction examples.
    A = scipy.sparse.vstack((L, U))
    unk_l = y_u*0
    a_l = np.hstack((y_l, unk_l))
    print A.shape, a_l.shape
    training_file = 'svmlight.train.%.2f' % test_size
    sklearn.datasets.dump_svmlight_file(A, a_l, training_file, zero_based=False)

    # The "test" files keep the true labels so that accuracy on the labeled
    # (testL) and unlabeled (testU) portions can be scored separately.
    testL_file = 'svmlight.testL.%.2f' % test_size
    sklearn.datasets.dump_svmlight_file(L, y_l, testL_file, zero_based=False)

    testU_file = 'svmlight.testU.%.2f' % test_size
    sklearn.datasets.dump_svmlight_file(U, y_u, testU_file, zero_based=False)
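
In the training file, labeled and unlabeled rows therefore look like (feature values illustrative):

    -1 14:0.0731 92:0.1184 ...     # labeled example
    0 27:0.0519 348:0.0907 ...     # unlabeled example, classified by transduction

SVMlight's svm_learn switches to transductive mode automatically when it encounters 0-labeled rows.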

In [11]:
def dump_svmlin(X, y, fX, fy):
    # SVMlin reads two parallel files: sparse feature rows (1-based
    # "index:value" pairs) in one, the matching labels, one per line, in the other.
    X_value_pattern = u("%d:%.16g")
    X_line_pattern = u("%s\n")
    y_line_pattern = u("%d\n")

    is_sp = int(hasattr(X, "tocsr"))

    for i in range(X.shape[0]):
        if is_sp:
            # CSR row i spans indptr[i]:indptr[i+1] in the indices/data arrays.
            span = slice(X.indptr[i], X.indptr[i + 1])
            row = zip(X.indices[span], X.data[span])
        else:
            nz = X[i] != 0
            row = zip(np.where(nz)[0], X[i, nz])

        s = " ".join(X_value_pattern % (j + 1, x) for j, x in row)
        fX.write((X_line_pattern % s).encode('ascii'))
        fy.write((y_line_pattern % y[i]).encode('ascii'))
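
A pair of corresponding rows from the two files would look like (values hypothetical):

    examples file:  12:0.0931 47:0.1240 388:0.0718
    labels file:    0

with 0 again marking an unlabeled example, the convention SVMlin shares with SVMlight.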

In [14]:
def print_svmlin_infiles(L, y_l, U, y_u, test_size):
    print "print_svmlin_infiles..."
    # Same layout as the SVMlight files: labeled block first, then the
    # unlabeled block with label 0.
    A = scipy.sparse.vstack((L, U))
    unk_l = y_u*0
    a_l = np.hstack((y_l, unk_l))
    print A.shape, a_l.shape

    with open('svmlin.train.examples.%.2f' % test_size, "wb") as examples_file, \
         open('svmlin.train.labels.%.2f' % test_size, "wb") as labels_file:
        dump_svmlin(A, a_l, examples_file, labels_file)

    with open('svmlin.testL.examples.%.2f' % test_size, "wb") as examples_file, \
         open('svmlin.testL.labels.%.2f' % test_size, "wb") as labels_file:
        dump_svmlin(L, y_l, examples_file, labels_file)

    with open('svmlin.testU.examples.%.2f' % test_size, "wb") as examples_file, \
         open('svmlin.testU.labels.%.2f' % test_size, "wb") as labels_file:
        dump_svmlin(U, y_u, examples_file, labels_file)

In [15]:
def print_universvm_infiles(L, y_l, U, y_u, test_size):
    # Same layout again, but UniverSVM marks unlabeled training examples
    # with label -3 instead of 0.
    A = scipy.sparse.vstack((L, U))
    unk_l = y_u*0 - 3
    a_l = np.hstack((y_l, unk_l))
    print A.shape, a_l.shape
    training_file = 'universvm.train.%.2f' % test_size
    sklearn.datasets.dump_svmlight_file(A, a_l, training_file, zero_based=False)

    testL_file = 'universvm.testL.%.2f' % test_size
    sklearn.datasets.dump_svmlight_file(L, y_l, testL_file, zero_based=False)

    testU_file = 'universvm.testU.%.2f' % test_size
    sklearn.datasets.dump_svmlight_file(U, y_u, testU_file, zero_based=False)
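
The -3 tag follows UniverSVM's input convention, in which -3 marks unlabeled transduction examples (its documentation reserves -2 for universum points); y_u*0 - 3 simply fills the unlabeled block with that marker.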

In [16]:
test_size = 0.05
# Stratified split: the held-out "test" fraction of the corpus (5% here)
# plays the role of the unlabeled set U; the remaining 95% stays labeled.
splits = sklearn.cross_validation.StratifiedShuffleSplit(y, n_iter=1, test_size=test_size)
labeled_indices, unlabeled_indices = iter(splits).next()
L = X[labeled_indices]
L_ids = instance_ids[labeled_indices]

U = X[unlabeled_indices]
U_ids = instance_ids[unlabeled_indices]

y_l = y[labeled_indices]
y_u = y[unlabeled_indices]

print X.shape, L.shape, U.shape
print_svmlight_infiles(L, y_l, U, y_u, test_size)
print_universvm_infiles(L, y_l, U, y_u, test_size)
print_svmlin_infiles(L, y_l, U, y_u, test_size)


(11314, 30792) (10748, 30792) (566, 30792)
(11314, 30792) (11314,)
(11314, 30792) (11314,)
print_svmlin_infiles...
(11314, 30792) (11314,)
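
The printed shapes confirm the split: 10748 labeled and 566 unlabeled documents, with all 11314 present in each training file. Since the split is stratified, both sides should preserve the corpus's class ratio; a quick check (a sketch, not from the original run; counts approximate):

    print Counter(y_l)   # roughly {-1: 5500, 1: 5248}
    print Counter(y_u)   # roughly {-1: 290, 1: 276}

From here the files are handed to the external solvers: SVMlight's svm_learn runs transductively because of the 0-labeled rows, while SVMlin and UniverSVM consume the files written in their respective formats.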
