In [2]:
import sys
from time import time
from pprint import pprint
import numpy as np
import scipy
import scipy.sparse
import joblib
import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.cross_validation
import sklearn.datasets
import sklearn.cross_validation
from sklearn.datasets import fetch_20newsgroups
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')
%pylab inline
In [3]:
# Load the 20 Newsgroups corpus; data_home="." points scikit-learn's dataset
# cache at the current working directory.
dataset = fetch_20newsgroups(
    data_home=".",
)
In [4]:
# Collapse the newsgroup labels into a binary +1/-1 target by cutting the
# (index-ordered) list of groups at the point where the running document
# count first exceeds 5600.
group_names = dataset.target_names
n_groups = len(group_names)
group_counts = [(dataset.target == group_idx).sum() for group_idx in range(n_groups)]
print(group_counts)

# First group index whose cumulative document count is above the threshold.
docs_seen = np.cumsum(group_counts)
half_docs_group_index = np.flatnonzero(docs_seen > 5600)[0]
print(half_docs_group_index)

# Groups strictly after the cut-off index become +1, all others -1.
y = np.where(dataset.target > half_docs_group_index, 1, -1)
# The sum shows how far the two classes are from being balanced (0 == balanced).
print(y.sum())

# Stable per-document identifiers (row positions in the full corpus).
instance_ids = np.arange(y.size)
print(instance_ids)
In [5]:
# TF-IDF features over the raw documents. Terms occurring in more than half
# of the documents (max_df) or in fewer than four documents (min_df) are
# dropped from the vocabulary.
vectorizer = TfidfVectorizer(
    min_df=4,
    max_df=0.5,
)
X = vectorizer.fit_transform(dataset.data)
In [6]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
print(X.shape)
In [7]:
from collections import Counter

# Number of documents in each of the two classes (-1 / +1).
label_counts = Counter(y)
label_counts
Out[7]:
In [8]:
# L2-penalised linear SVM with C=10, solved in the primal (dual=False),
# fitted on the complete labelled corpus. fit() returns the estimator itself,
# so the chained form binds the fitted model.
svm = sklearn.svm.LinearSVC(C=10, penalty='l2', dual=False).fit(X, y)
# Last expression: display the fitted estimator as the cell output.
svm
Out[8]:
In [9]:
# Accuracy of the SVM on the same data it was fitted on. This is a
# "reconstruction" score (how well the model reproduces its own training
# labels), not an estimate of generalisation.
reconstruction_accuracy = sklearn.metrics.accuracy_score(y, svm.predict(X))
print(reconstruction_accuracy)

# Mean cross-validated score. The fold count is defined once in `ncv` and
# passed to cross_val_score, so the number of folds and the averaging can
# never disagree (previously cv=10 was hard-coded next to a separate ncv).
ncv = 10
cv_scores = sklearn.cross_validation.cross_val_score(svm, X, y, cv=ncv)
print(cv_scores.mean())
In [10]:
# Export the full labelled corpus in svmlight/libsvm text format.
# zero_based=False writes 1-based feature indices.
all_data_file = 'liblinear.all.in'
sklearn.datasets.dump_svmlight_file(X, y, all_data_file, zero_based=False)
In [33]:
def print_svmlight_infiles(L, y_l, U, y_u, test_size):
    """Write a labelled/unlabelled split to three svmlight-format files.

    Files are written to the current working directory, with ``test_size``
    formatted to two decimals as the filename suffix:

    * ``svmlight.train.<test_size>`` -- rows of ``L`` followed by rows of
      ``U``; the ``U`` rows carry label 0 instead of their true label
      (presumably so a transductive learner treats them as unlabelled --
      confirm against the consuming tool).
    * ``svmlight.testL.<test_size>`` -- ``L`` with its true labels ``y_l``.
    * ``svmlight.testU.<test_size>`` -- ``U`` with its true labels ``y_u``.

    All files use 1-based feature indices. The shapes of the stacked training
    matrix and its label vector are printed before anything is written.

    Parameters
    ----------
    L, U : sparse matrices accepted by ``scipy.sparse.vstack``
        Feature rows of the labelled and unlabelled instances.
    y_l, y_u : array-like
        True labels for the rows of ``L`` and ``U`` respectively.
    test_size : float
        Split fraction; only used to build the output filenames.
    """
    # Labelled rows first, then the unlabelled rows with their labels zeroed.
    stacked = scipy.sparse.vstack((L, U))
    hidden_labels = y_u * 0
    stacked_labels = np.hstack((y_l, hidden_labels))
    print("%s %s" % (stacked.shape, stacked_labels.shape))

    # (filename template, feature matrix, labels) in the order they are written.
    outputs = (
        ('svmlight.train.%.2f', stacked, stacked_labels),
        ('svmlight.testL.%.2f', L, y_l),
        ('svmlight.testU.%.2f', U, y_u),
    )
    for template, features, labels in outputs:
        target_path = template % test_size
        sklearn.datasets.dump_svmlight_file(features, labels, target_path, zero_based=False)
In [36]:
# Hold out 99% of the documents as the "unlabelled" pool U and keep the
# remaining 1% as the labelled set L, stratified on the binary target y.
test_size = 0.99

# Fixed seed: without it every run draws a different labelled sample, so the
# exported svmlight files (and anything trained on them) are not reproducible.
split_seed = 0
splits = sklearn.cross_validation.StratifiedShuffleSplit(
    y, n_iter=1, test_size=test_size, random_state=split_seed)

# Only one split is generated (n_iter=1); take it with the builtin next(),
# which works on both Python 2 and 3 (unlike the .next() method).
labeled_indices, unlabeled_indices = next(iter(splits))

# Slice features, document ids and labels consistently for both subsets.
L = X[labeled_indices]
L_ids = instance_ids[labeled_indices]
U = X[unlabeled_indices]
U_ids = instance_ids[unlabeled_indices]
y_l = y[labeled_indices]
y_u = y[unlabeled_indices]
print("%s %s %s" % (X.shape, L.shape, U.shape))

# Write the train / testL / testU svmlight files for this split.
print_svmlight_infiles(L, y_l, U, y_u, test_size)
In [ ]:
In [1]:
In [1]:
In [1]:
In [ ]: