In [1]:
import sys
from time import time
from pprint import pprint

import numpy as np
import scipy
import scipy.sparse as sp
import joblib

import io
import os.path

import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation


from sklearn.externals.six import u, b

import warnings
warnings.filterwarnings('ignore')

%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
X, y = sklearn.datasets.load_svmlight_file('real-sim')

In [3]:
X.shape


Out[3]:
(72309, 20958)

In [4]:
from collections import Counter
Counter(y)


Out[4]:
Counter({-1.0: 50071, 1.0: 22238})

In [5]:
# single stratified 50/50 split of the full data set
splits = sklearn.cross_validation.StratifiedShuffleSplit(y, n_iter=1, test_size=0.50)
train_indices, test_indices = next(iter(splits))

In [6]:
# keep the original row ids so later splits can be traced back to the full matrix
instance_ids = np.arange(y.size)
X_train = X[train_indices]
train_ids = instance_ids[train_indices]

X_test = X[test_indices]
test_ids = instance_ids[test_indices]

train_labels = y[train_indices]
test_labels = y[test_indices]

In [7]:
# fully supervised baseline: linear SVM on the whole training half
svm = sklearn.svm.LinearSVC(penalty='l2', C=10, dual=False)
svm.fit(X_train, train_labels)


Out[7]:
LinearSVC(C=10, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [10]:
accuracy = sklearn.metrics.accuracy_score(test_labels, svm.predict(X_test))
# test error rate, in percent
print 100.0*(1.0-accuracy)
#ncv = 10
#print sklearn.cross_validation.cross_val_score(svm, X_train, train_labels, cv=10).sum()/ncv
#print sklearn.cross_validation.cross_val_score(svm, X_test, test_labels, cv=10).sum()/ncv


3.22500345734

In [19]:
print X_train.shape
print X_test.shape


(36154, 20958)
(36155, 20958)

In [24]:
def print_universvm_infiles(L, y_l, U, y_u, HO, y_HO, U_size):
    # training file: labeled rows keep their labels; unlabeled rows are
    # marked -3 so UniverSVM treats them as unlabeled points
    A = scipy.sparse.vstack((L, U))
    unk_l = y_u*0 - 3
    a_l = np.hstack((y_l, unk_l))

    name = U_size
    print A.shape, a_l.shape, name

    training_file = 'universvm.train.%.4f' % name
    sklearn.datasets.dump_svmlight_file(A, a_l, training_file, zero_based=False)

    # separate test files so the labeled, unlabeled, and held-out sets
    # can each be scored against their true labels
    testL_file = 'universvm.testL.%.4f' % name
    sklearn.datasets.dump_svmlight_file(L, y_l, testL_file, zero_based=False)

    testU_file = 'universvm.testU.%.4f' % name
    sklearn.datasets.dump_svmlight_file(U, y_u, testU_file, zero_based=False)

    testHO_file = 'universvm.testHO.%.4f' % name
    sklearn.datasets.dump_svmlight_file(HO, y_HO, testHO_file, zero_based=False)

In [25]:
U_size = 0.0025
# keep only a 0.25% sliver of the training half as labeled data;
# everything else becomes the unlabeled set
splits = sklearn.cross_validation.StratifiedShuffleSplit(train_labels, n_iter=1, test_size=1-U_size)
labeled_indices, unlabeled_indices = next(iter(splits))

L = X_train[labeled_indices]
L_ids = train_ids[labeled_indices]      # indices are into X_train, so map through train_ids

U = X_train[unlabeled_indices]
U_ids = train_ids[unlabeled_indices]

y_l = train_labels[labeled_indices]
y_u = train_labels[unlabeled_indices]

print X_train.shape, L.shape, U.shape, X_test.shape
print_universvm_infiles(L, y_l, U, y_u, X_test, test_labels, U_size)

# supervised baseline trained on the tiny labeled set alone
svm_small = sklearn.svm.LinearSVC(penalty='l2', C=10, dual=False)
svm_small.fit(L, y_l)
accuracy = sklearn.metrics.accuracy_score(test_labels, svm_small.predict(X_test))
# test error rate, in percent
print (1.0-accuracy)*100.0


(36154, 20958) (90, 20958) (36064, 20958) (36155, 20958)
(36154, 20958) (36154,) 0.0025
25.5123772646
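
A quick sanity check (a sketch, not part of the original run): reload the dumped training file and confirm that the shapes and the -3 markers survived the round trip. n_features is pinned to the original width because the file was written with 1-based indices.

In [ ]:
# reload the training file written above (assumes U_size = 0.0025 as in the previous cell)
A2, a2 = sklearn.datasets.load_svmlight_file('universvm.train.0.0025',
                                             n_features=X.shape[1],
                                             zero_based=False)
print A2.shape     # should be (36154, 20958): L rows stacked on U rows
print Counter(a2)  # expect -1.0, 1.0, and the -3.0 unlabeled marker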

In [11]:
#TODO
# repeat with svmlight, svmlin, and qn_s3vm
# try again with NMF features / clusters (see the sketch below)
#   can we cluster the docs effectively
#   and use NMF / autoencoder features?
# can we repeat with other data sets?
#
# can we find an MMMF / Max Margin Clustering method?
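
A minimal sketch of the NMF idea from the TODO, assuming sklearn's NMF estimator and an arbitrary choice of 50 components: factor the (nonnegative) document-term matrix into topic-like features and retrain the linear SVM on them.

In [ ]:
# sketch only: dense NMF features in place of the raw sparse features
import sklearn.decomposition

nmf = sklearn.decomposition.NMF(n_components=50)   # component count is a guess
W_train = nmf.fit_transform(X_train)               # document-topic weights
W_test = nmf.transform(X_test)

svm_nmf = sklearn.svm.LinearSVC(penalty='l2', C=10, dual=False)
svm_nmf.fit(W_train, train_labels)
accuracy = sklearn.metrics.accuracy_score(test_labels, svm_nmf.predict(W_test))
print (1.0-accuracy)*100.0   # test error rate (%)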

In [ ]: