In [2]:
import sys
from time import time
from pprint import pprint

import numpy as np
import scipy
import scipy.sparse
import joblib

import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.cross_validation

import sklearn.datasets
import sklearn.cross_validation

from sklearn.datasets import fetch_20newsgroups
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [3]:
# Load the 20 newsgroups corpus; data_home="." makes scikit-learn use the
# current working directory as its download/cache folder.
dataset = fetch_20newsgroups(data_home=".")

In [4]:
# Newsgroup names and the number of documents that belong to each of them.
group_names = dataset.target_names
group_counts = [np.equal(dataset.target, group_id).sum()
                for group_id, _ in enumerate(group_names)]
print(group_counts)

# Index of the first group at which the running document total exceeds 5600.
# NOTE(review): 5600 is presumably meant as "about half of the documents".
running_total = np.cumsum(group_counts)
half_docs_group_index = np.flatnonzero(running_total > 5600)[0]
print(half_docs_group_index)

# Collapse the 20 classes into a binary problem: groups after the split index
# become +1, every other group becomes -1.
y = np.where(dataset.target > half_docs_group_index, 1, -1)
print(y.sum())

# Position of every document in the original ordering, so that subsets can be
# traced back to the full dataset later on.
instance_ids = np.arange(y.size)
print(instance_ids)


[480, 584, 591, 590, 578, 593, 585, 594, 598, 597, 600, 595, 591, 594, 593, 599, 546, 564, 465, 377]
9
-266
[    0     1     2 ..., 11311 11312 11313]

In [5]:
# Turn the raw documents into a sparse TF-IDF matrix. Terms occurring in more
# than half of the documents (max_df=0.5) or in fewer than 4 documents
# (min_df=4) are left out of the vocabulary.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=4)
X = vectorizer.fit_transform(dataset.data)

In [6]:
# Sanity check: (number of documents, vocabulary size) of the TF-IDF matrix.
print X.shape


(11314, 30792)

In [7]:
# Class balance of the binary labels: how many documents are -1 and how many +1.
from collections import Counter
Counter(y)


Out[7]:
Counter({-1: 5790, 1: 5524})

In [8]:
# Fully supervised reference model: an L2-penalised linear SVM solved in the
# primal (dual=False), fitted on every document with its true binary label.
svm = sklearn.svm.LinearSVC(penalty='l2', C=10, dual=False)
svm.fit(X, y)


Out[8]:
LinearSVC(C=10, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [9]:
# Accuracy of the fitted SVM on the same data it was trained on. This only
# shows how well the model reproduces its training labels; it is not an
# estimate of generalisation performance.
reconstruction_accuracy = sklearn.metrics.accuracy_score(y, svm.predict(X))
print(reconstruction_accuracy)

# Mean accuracy over `ncv` cross-validation folds. The fold count is passed to
# cross_val_score from the same variable, so the two can never get out of sync
# (a mismatch would silently produce a wrong average).
ncv = 10
print(sklearn.cross_validation.cross_val_score(svm, X, y, cv=ncv).mean())


0.99991161393
0.959962102395

In [10]:
# Export the complete labeled dataset in svmlight/libsvm text format;
# zero_based=False writes feature indices starting at 1.
sklearn.datasets.dump_svmlight_file(X,y,'liblinear.all.in',zero_based=False)

In [33]:
def print_svmlight_infiles(L, y_l, U, y_u, test_size):
    """Write the svmlight-format input files for one labeled/unlabeled split.

    Three files are created in the current working directory. Each file name
    ends with ``test_size`` formatted to two decimals:

    * ``svmlight.train.<test_size>`` -- the rows of ``L`` stacked on top of
      the rows of ``U``. Labeled rows keep their label from ``y_l``; every
      unlabeled row is written with label 0.
    * ``svmlight.testL.<test_size>`` -- ``L`` together with ``y_l``.
    * ``svmlight.testU.<test_size>`` -- ``U`` together with ``y_u``.

    All files use 1-based feature indices (``zero_based=False``). The shapes
    of the stacked matrix and of its label vector are printed as a check.

    Parameters
    ----------
    L : sparse matrix
        Feature rows of the labeled instances.
    y_l : array
        Labels of the rows in ``L``.
    U : sparse matrix
        Feature rows of the unlabeled instances.
    y_u : array
        True labels of the rows in ``U``; hidden (replaced by 0) in the
        training file and kept in the ``testU`` file.
    test_size : float
        Fraction used for the split; only used to build the file names.

    Returns
    -------
    None
    """
    # Training set: labeled rows first, then the unlabeled rows with label 0.
    stacked = scipy.sparse.vstack((L, U))
    stacked_labels = np.hstack((y_l, y_u * 0))
    print("%s %s" % (stacked.shape, stacked_labels.shape))

    outputs = (
        ('svmlight.train.%.2f', stacked, stacked_labels),
        ('svmlight.testL.%.2f', L, y_l),
        ('svmlight.testU.%.2f', U, y_u),
    )
    for name_template, features, labels in outputs:
        sklearn.datasets.dump_svmlight_file(
            features, labels, name_template % test_size, zero_based=False)

In [36]:
# Fraction of the documents that is treated as unlabeled; the remaining
# documents form the small labeled set.
test_size = 0.99

# One stratified random split. A fixed random_state makes the split, and
# therefore the files written below, identical on every run of the notebook.
splits = sklearn.cross_validation.StratifiedShuffleSplit(
    y, n_iter=1, test_size=test_size, random_state=0)
labeled_indices, unlabeled_indices = next(iter(splits))

# Labeled part: feature rows, original document ids and labels.
L = X[labeled_indices]
L_ids = instance_ids[labeled_indices]
y_l = y[labeled_indices]

# Unlabeled part: same three views; y_u holds the true labels that are
# hidden in the training file and kept for evaluation.
U = X[unlabeled_indices]
U_ids = instance_ids[unlabeled_indices]
y_u = y[unlabeled_indices]

print("%s %s %s" % (X.shape, L.shape, U.shape))
print_svmlight_infiles(L, y_l, U, y_u, test_size)


(11314, 30792) (113, 30792) (11201, 30792)
(11314, 30792) (11314,)

In [ ]:


In [1]:


In [1]:


In [1]:


In [ ]: