import sys
from time import time
from pprint import pprint

import numpy as np
import scipy
import scipy.sparse
import scipy.sparse as sp
import joblib

import io
import os.path

import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.cross_validation

import sklearn.datasets
import sklearn.cross_validation

from sklearn.datasets import fetch_20newsgroups
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.externals.six import u, b

import warnings

%pylab inline

# Fetch the 20 Newsgroups corpus, caching the download under the current
# directory.
dataset = fetch_20newsgroups(data_home=".")

group_names = dataset.target_names
# Number of documents per newsgroup, indexed by target id.
group_counts = [(dataset.target == idx).sum()
                for idx in range(len(group_names))]
print(group_counts)
# Index of the first group at which the running document count exceeds 5600;
# it is used below as the boundary between the two binary classes.
half_docs_group_index = np.where(np.cumsum(group_counts) > 5600)[0][0]
print(half_docs_group_index)
# Binary labels: +1 for documents of groups after the boundary group,
# -1 for the boundary group and everything before it.
# NOTE(review): the `dataset.target` operands on this line and in
# `group_counts` were missing from the source and have been reconstructed;
# confirm against the original notebook.
y = (dataset.target > half_docs_group_index).astype(int) * 2 - 1
print(y.sum())
# Stable per-document ids so that subsets can be traced back to the corpus.
instance_ids = np.arange(y.size)
print(instance_ids)

# Output:
# [480, 584, 591, 590, 578, 593, 585, 594, 598, 597, 600, 595, 591, 594, 593, 599, 546, 564, 465, 377]
# [    0     1     2 ..., 11311 11312 11313]

# TF-IDF features: drop terms that occur in more than half of the documents
# (max_df=0.5) or in fewer than four documents (min_df=4).
vectorizer = TfidfVectorizer(max_df=0.5, min_df=4)
# NOTE(review): the argument of fit_transform was missing from the source;
# the raw documents of `dataset` are the reconstructed input.
X = vectorizer.fit_transform(dataset.data)

print(X.shape)

# Output:
# (11314, 30792)

from collections import Counter

# Output:
# Counter({-1: 5790, 1: 5524})

# Fully supervised baseline: L2-penalised linear SVM solved in the primal
# (dual=False), trained on every document.
# NOTE(review): the `.fit(X` part of this statement was missing from the
# source; reconstructed as an explicit fit call.
svm = sklearn.svm.LinearSVC(penalty='l2', C=10, dual=False)
svm.fit(X, y)

# Output:
# LinearSVC(C=10, class_weight=None, dual=False, fit_intercept=True,
#      intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
#      random_state=None, tol=0.0001, verbose=0)

# accuracy_score lives in sklearn.metrics, which the file never imports
# explicitly; import it here so this cell does not depend on it being pulled
# in as a side effect of another sklearn import.
import sklearn.metrics

# Accuracy on the data the model was trained on (not a generalisation
# estimate).
reconstruction_accuracy = sklearn.metrics.accuracy_score(y, svm.predict(X))
print(reconstruction_accuracy)

# Mean accuracy over `ncv` cross-validation folds. The fold count is passed
# through `ncv` so that the divisor always matches the number of scores.
ncv = 10
cv_scores = sklearn.cross_validation.cross_val_score(svm, X, y, cv=ncv)
print(cv_scores.sum() / ncv)


def print_svmlight_infiles(L, y_l, U, y_u, test_size):
    """Write SVMlight-format input files for a labeled/unlabeled split.

    Three files are created in the current directory; ``<size>`` is
    ``test_size`` formatted with ``%.2f``:

    * ``svmlight.train.<size>`` -- rows of ``L`` followed by rows of ``U``;
      the rows coming from ``U`` are given label 0 instead of ``y_u``
      (presumably the marker for unlabeled examples -- confirm against the
      SVMlight documentation).
    * ``svmlight.testL.<size>`` -- ``L`` with its labels ``y_l``.
    * ``svmlight.testU.<size>`` -- ``U`` with its true labels ``y_u``.

    Feature indices are written 1-based.

    NOTE(review): the source of this function only computed the file names
    and wrote nothing; the dump calls below are a reconstruction.

    Parameters
    ----------
    L, U : scipy sparse matrices
        Feature rows of the labeled and unlabeled examples.
    y_l, y_u : numpy arrays
        Labels for the rows of ``L`` and ``U``.
    test_size : float
        Split fraction; only used to name the output files.
    """
    A = scipy.sparse.vstack((L, U))
    # Hide the labels of U in the training file by replacing them with 0.
    unk_l = y_u * 0
    a_l = np.hstack((y_l, unk_l))
    print("%s %s" % (A.shape, a_l.shape))

    training_file = 'svmlight.train.%.2f' % test_size
    testL_file = 'svmlight.testL.%.2f' % test_size
    testU_file = 'svmlight.testU.%.2f' % test_size

    for features, labels, path in ((A, a_l, training_file),
                                   (L, y_l, testL_file),
                                   (U, y_u, testU_file)):
        sklearn.datasets.dump_svmlight_file(features, labels, path,
                                            zero_based=False)

def dump_svmlin(X, y, fX, fy):
    """Write examples and labels to two parallel, line-oriented files.

    For every row ``i`` of ``X`` one line is written to ``fX`` holding the
    non-zero entries as space-separated ``index:value`` pairs (indices are
    1-based, the line starts with a single space), and one line is written
    to ``fy`` holding ``y[i]`` formatted as an integer.

    Parameters
    ----------
    X : scipy sparse matrix or 2-D numpy array
        Feature matrix. Sparse input of any format is converted to CSR.
    y : sequence
        One label per row of ``X``; formatted with ``%d``.
    fX, fy : binary file-like objects
        Destinations for the examples and the labels. Both receive
        ASCII-encoded bytes and are left open for the caller to close.
    """
    X_value_pattern = u"%d:%.16g"
    X_line_pattern = u" %s\n"
    # One label per line; without the newline consecutive labels run
    # together (1, -1, 0 would be written as "1-10").
    y_line_pattern = u"%d\n"

    is_sp = hasattr(X, "tocsr")
    if is_sp:
        # Row access through indptr/indices/data is only valid for CSR;
        # other sparse formats (e.g. COO) do not have these attributes.
        X = X.tocsr()

    for i in range(X.shape[0]):
        if is_sp:
            span = slice(X.indptr[i], X.indptr[i + 1])
            row = zip(X.indices[span], X.data[span])
        else:
            nz = X[i] != 0
            row = zip(np.where(nz)[0], X[i, nz])

        s = " ".join(X_value_pattern % (j + 1, x) for j, x in row)
        fX.write((X_line_pattern % s).encode('ascii'))
        fy.write((y_line_pattern % y[i]).encode('ascii'))

def print_svmlin_infiles(L, y_l, U, y_u, test_size):
    """Write SVMlin-style example/label file pairs for a split.

    Three pairs of files are created in the current directory; ``<size>`` is
    ``test_size`` formatted with ``%.2f``:

    * ``svmlin.train.examples.<size>`` / ``svmlin.train.labels.<size>`` --
      rows of ``L`` followed by rows of ``U``; the rows coming from ``U``
      get label 0 instead of ``y_u``.
    * ``svmlin.testL.examples.<size>`` / ``svmlin.testL.labels.<size>`` --
      ``L`` with ``y_l``.
    * ``svmlin.testU.examples.<size>`` / ``svmlin.testU.labels.<size>`` --
      ``U`` with its true labels ``y_u``.

    Parameters
    ----------
    L, U : scipy sparse matrices
        Feature rows of the labeled and unlabeled examples.
    y_l, y_u : numpy arrays
        Labels for the rows of ``L`` and ``U``.
    test_size : float
        Split fraction; only used to name the output files.
    """
    print("print_svmlin_infiles...")
    # Request CSR explicitly: dump_svmlin reads rows through indptr, which
    # the default output format of vstack is not guaranteed to provide.
    A = scipy.sparse.vstack((L, U), format="csr")
    # Hide the labels of U in the training file by replacing them with 0.
    unk_l = y_u * 0
    a_l = np.hstack((y_l, unk_l))
    print("%s %s" % (A.shape, a_l.shape))

    for tag, examples, labels in (("train", A, a_l),
                                  ("testL", L, y_l),
                                  ("testU", U, y_u)):
        examples_path = 'svmlin.%s.examples.%.2f' % (tag, test_size)
        labels_path = 'svmlin.%s.labels.%.2f' % (tag, test_size)
        # Context managers guarantee both files are flushed and closed even
        # if dumping fails part-way through.
        with open(examples_path, "wb") as examples_file, \
                open(labels_path, "wb") as labels_file:
            dump_svmlin(examples, labels, examples_file, labels_file)

def print_universvm_infiles(L, y_l, U, y_u, test_size):
    """Write UniverSVM input files (SVMlight text format) for a split.

    Three files are created in the current directory; ``<size>`` is
    ``test_size`` formatted with ``%.2f``:

    * ``universvm.train.<size>`` -- rows of ``L`` followed by rows of ``U``;
      the rows coming from ``U`` are given label -3 instead of ``y_u``
      (presumably UniverSVM's marker for unlabeled examples -- confirm
      against the UniverSVM documentation).
    * ``universvm.testL.<size>`` -- ``L`` with its labels ``y_l``.
    * ``universvm.testU.<size>`` -- ``U`` with its true labels ``y_u``.

    Feature indices are written 1-based.

    NOTE(review): the source of this function only computed the file names
    and wrote nothing; the dump calls below are a reconstruction.

    Parameters
    ----------
    L, U : scipy sparse matrices
        Feature rows of the labeled and unlabeled examples.
    y_l, y_u : numpy arrays
        Labels for the rows of ``L`` and ``U``.
    test_size : float
        Split fraction; only used to name the output files.
    """
    A = scipy.sparse.vstack((L, U))
    # Replace the labels of U with the constant -3 in the training file.
    unk_l = y_u * 0 - 3
    a_l = np.hstack((y_l, unk_l))
    print("%s %s" % (A.shape, a_l.shape))

    training_file = 'universvm.train.%.2f' % test_size
    testL_file = 'universvm.testL.%.2f' % test_size
    testU_file = 'universvm.testU.%.2f' % test_size

    for features, labels, path in ((A, a_l, training_file),
                                   (L, y_l, testL_file),
                                   (U, y_u, testU_file)):
        sklearn.datasets.dump_svmlight_file(features, labels, path,
                                            zero_based=False)

# Fraction of documents placed in the second ("unlabeled") part of the split.
test_size = 0.05
splits = sklearn.cross_validation.StratifiedShuffleSplit(
    y, n_iter=1, test_size=test_size)
# Only one split is generated (n_iter=1); take it with the next() builtin
# rather than calling the iterator's Python-2-only .next() method.
labeled_indices, unlabeled_indices = next(iter(splits))

# Labeled part: features, original document ids and labels.
L = X[labeled_indices]
L_ids = instance_ids[labeled_indices]
y_l = y[labeled_indices]

# Unlabeled part: features, original document ids and the held-back labels.
U = X[unlabeled_indices]
U_ids = instance_ids[unlabeled_indices]
y_u = y[unlabeled_indices]

print("%s %s %s" % (X.shape, L.shape, U.shape))
# Export the same split in the input format of each external solver.
print_svmlight_infiles(L, y_l, U, y_u, test_size)
print_universvm_infiles(L, y_l, U, y_u, test_size)
print_svmlin_infiles(L, y_l, U, y_u, test_size)

# Output:
# (11314, 30792) (10748, 30792) (566, 30792)
# (11314, 30792) (11314,)
# (11314, 30792) (11314,)
# (11314, 30792) (11314,)

