RCV1-v2 Dataset source here


In [1]:
import logging

from sklearn.datasets import fetch_rcv1
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn import svm
from keras.layers import Input, Dense
from keras.models import Model

logging.basicConfig()
# download the RCV1-v2 dataset (or load it from the scikit-learn cache)
rcv1 = fetch_rcv1()


Using Theano backend.
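fetch_rcv1 returns the documents as a sparse TF-IDF feature matrix and the topics as a sparse binary indicator matrix. A quick shape check (a small sketch, not part of the original run):

# rcv1.data: 804,414 documents x 47,236 TF-IDF features (scipy CSR matrix)
# rcv1.target: 804,414 documents x 103 topic labels (binary indicator matrix)
rcv1.data.shape, rcv1.target.shape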

In [13]:
# size of the standard LYRL2004 training split: the first 23,149 documents
training_samples = 23149

X_train = rcv1.data[:training_samples].todense()
# X_test = rcv1.data[training_samples:].todense()

# X_train_rshp = X_train.reshape((len(X_train), np.prod(X_train.shape[1:])))
# X_test_rshp = X_test.reshape((len(X_test), np.prod(X_test.shape[1:])))
# y_train = rcv1.target[:training_samples]
# y_test = rcv1.target[training_samples:]

X_train.shape

# X_train.shape, X_train_rshp.shape,  X_test.shape, X_test_rshp.shape, y_train.shape, y_test.shape


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-13-c875bed75df7> in <module>()
      1 training_samples = 23149
      2 
----> 3 X_train = rcv1.data[:training_samples].todense()
      4 # X_test = rcv1.data[training_samples:].todense()
      5 

/home/felipe/venv2-global/local/lib/python2.7/site-packages/scipy/sparse/base.pyc in todense(self, order, out)
    629             `numpy.matrix` object that shares the same memory.
    630         """
--> 631         return np.asmatrix(self.toarray(order=order, out=out))
    632 
    633     def toarray(self, order=None, out=None):

/home/felipe/venv2-global/local/lib/python2.7/site-packages/scipy/sparse/compressed.pyc in toarray(self, order, out)
    938     def toarray(self, order=None, out=None):
    939         """See the docstring for `spmatrix.toarray`."""
--> 940         return self.tocoo(copy=False).toarray(order=order, out=out)
    941 
    942     ##############################################################

/home/felipe/venv2-global/local/lib/python2.7/site-packages/scipy/sparse/coo.pyc in toarray(self, order, out)
    248     def toarray(self, order=None, out=None):
    249         """See the docstring for `spmatrix.toarray`."""
--> 250         B = self._process_toarray_args(order, out)
    251         fortran = int(B.flags.f_contiguous)
    252         if not fortran and not B.flags.c_contiguous:

/home/felipe/venv2-global/local/lib/python2.7/site-packages/scipy/sparse/base.pyc in _process_toarray_args(self, order, out)
    815             return out
    816         else:
--> 817             return np.zeros(self.shape, dtype=self.dtype, order=order)
    818 
    819     def __numpy_ufunc__(self, func, method, pos, inputs, **kwargs):

MemoryError: 
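Densifying the 23,149 x 47,236 training slice needs roughly 8.7 GB as float64, which is what triggers the MemoryError above. A minimal workaround sketch (assuming an ordinary desktop amount of RAM; not part of the original run) is to keep the CSR matrices sparse, since the scikit-learn estimators used below accept sparse input directly:

training_samples = 23149

# slicing a CSR matrix keeps it sparse
X_train = rcv1.data[:training_samples]
X_test = rcv1.data[training_samples:]
y_train = rcv1.target[:training_samples]
y_test = rcv1.target[training_samples:]

# densify only a small batch at a time when a dense array is unavoidable
dense_batch = X_train[:256].toarray()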

In [7]:
# size of the compressed representation
encoding_dim = 32

input_doc = Input(shape=(23149,))

# 32-dimensional bottleneck, then a sigmoid reconstruction back to the input size
encoded = Dense(encoding_dim, activation='relu')(input_doc)
decoded = Dense(23149, activation='sigmoid')(encoded)

# full autoencoder: input -> bottleneck -> reconstruction
autoencoder = Model(input=input_doc, output=decoded)

# encoder half: input -> bottleneck
encoder = Model(input=input_doc, output=encoded)

# decoder half: bottleneck -> reconstruction, reusing the autoencoder's last layer
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(input=encoded_input, output=decoder_layer(encoded_input))

autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

autoencoder.fit(X_train, X_train,
                verbose=2,
                nb_epoch=50,
                batch_size=256,
                shuffle=True,
                validation_data=(X_test, X_test))


---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-7-83af873d7e21> in <module>()
     20                 batch_size=256,
     21                 shuffle=True,
---> 22                 validation_data=(X_test, X_test))

/home/felipe/venv2-global/local/lib/python2.7/site-packages/keras/engine/training.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight)
   1033                                                            class_weight=class_weight,
   1034                                                            check_batch_dim=False,
-> 1035                                                            batch_size=batch_size)
   1036         # prepare validation data
   1037         if validation_data:

/home/felipe/venv2-global/local/lib/python2.7/site-packages/keras/engine/training.pyc in _standardize_user_data(self, x, y, sample_weight, class_weight, check_batch_dim, batch_size)
    960                                    self.internal_input_shapes,
    961                                    check_batch_dim=False,
--> 962                                    exception_prefix='model input')
    963         y = standardize_input_data(y, self.output_names,
    964                                    output_shapes,

/home/felipe/venv2-global/local/lib/python2.7/site-packages/keras/engine/training.pyc in standardize_input_data(data, names, shapes, check_batch_dim, exception_prefix)
    106                                         ' to have shape ' + str(shapes[i]) +
    107                                         ' but got array with shape ' +
--> 108                                         str(array.shape))
    109     return arrays
    110 

Exception: Error when checking model input: expected input_2 to have shape (None, 23149) but got array with shape (23149, 47236)
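The mismatch comes from using the number of training documents (23149) as the input dimension; the Input layer has to match the number of TF-IDF features per document, 47236. A hedged sketch of the corrected setup using the same Keras 1.x API (the 2,000-document subset is an assumption made only so the dense arrays fit in memory):

n_features = rcv1.data.shape[1]  # 47236 TF-IDF features per document

input_doc = Input(shape=(n_features,))
encoded = Dense(encoding_dim, activation='relu')(input_doc)
decoded = Dense(n_features, activation='sigmoid')(encoded)
autoencoder = Model(input=input_doc, output=decoded)
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

# Keras 1.x fit() expects dense NumPy arrays, so densify only a subset that fits in RAM
X_small = rcv1.data[:2000].toarray()
X_small_val = rcv1.data[2000:2500].toarray()

autoencoder.fit(X_small, X_small,
                verbose=2,
                nb_epoch=50,
                batch_size=256,
                shuffle=True,
                validation_data=(X_small_val, X_small_val))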

In [3]:
# one-vs-rest: fit one binary LinearSVC per RCV1 topic (one per column of the target matrix)
clf = OneVsRestClassifier(svm.LinearSVC(penalty='l1', tol=0.01, multi_class='crammer_singer', dual=False))

In [4]:
clf.fit(X_train, y_train)


/home/felipe/venv2/local/lib/python2.7/site-packages/sklearn/multiclass.py:70: UserWarning: Label not 49 is present in all training examples.
  str(classes[c]))
/home/felipe/venv2/local/lib/python2.7/site-packages/sklearn/multiclass.py:70: UserWarning: Label not 80 is present in all training examples.
  str(classes[c]))
Out[4]:
OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='crammer_singer', penalty='l1', random_state=None,
     tol=0.01, verbose=0),
          n_jobs=1)
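The two UserWarnings mean that the topics at column indices 49 and 80 have no positive documents among the 23,149 training examples, so their one-vs-rest classifiers degenerate into constant predictors. A quick check (a sketch, not from the original run):

# column sums of the sparse indicator matrix = number of positive training documents per topic
positives_per_topic = y_train.sum(axis=0)
positives_per_topic[0, 49], positives_per_topic[0, 80]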

In [5]:
y_pred = clf.predict(X_test)

# micro-averaged F1 over all (document, label) decisions
current_score = f1_score(y_test, y_pred, average='micro')

In [6]:
current_score


Out[6]:
0.80843419139591599
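A micro-averaged F1 of about 0.808 counts true and false positives over all (document, label) pairs across the 103 topics before computing precision, recall, and their harmonic mean. The precision_score and recall_score imports at the top suggest the companion metrics were also of interest; a small follow-up sketch (not part of the original run):

precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
precision, recall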