When predicting a large number of classes, it often becomes inefficient to train a generalized linear model for each distinct class (i.e. the parameters for each softmax node at the end of a neural network). One proposed solution has been to use offline clustering of output label representations to create a binary code for each label and then use only about log2(K) distinct sigmoid outputs at the end of a neural network, where K is the number of classes (see "A Scalable Hierarchical Distributed Language Model" by Mnih and Hinton).
Why not instead learn an optimal encoding of the output labels interleaved with training the predictive model? Here we're experimenting with assigning initial random codes, training a multi-output linear regression, and then finding better output codes by averaging the predicted output codes of the samples associated with each class (sketched at the end of the notebook).
In [1]:
import numpy as np
import sklearn.datasets
import sklearn.linear_model
In [2]:
# sparse token-count features and integer newsgroup labels for the 20 Newsgroups posts
data = sklearn.datasets.fetch_20newsgroups_vectorized()
In [3]:
data
Out[3]:
In [4]:
X = data['data']; Y = data['target']
In [10]:
model = sklearn.linear_model.Ridge()
In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
# with use_idf=False and the default norm='l2', this simply L2-normalizes each count vector
tfidf = TfidfTransformer(use_idf=False).fit(X)
X_tfidf = tfidf.transform(X)
In [19]:
n_samples = len(Y)
n_labels = len(set(Y))                    # 20 newsgroups -> 20 labels
n_bits = int(np.ceil(np.log2(n_labels)))  # ceil(log2(20)) = 5 bits -> 2**5 = 32 possible codes
In [28]:
# generate a random, distinct binary code for each output label
output_codes = {}
for output_label in set(Y):
    candidate_code = tuple((np.random.randn(n_bits) > 0).astype(float))
    while candidate_code in output_codes.values():
        candidate_code = tuple((np.random.randn(n_bits) > 0).astype(float))
    output_codes[output_label] = candidate_code
In [29]:
output_codes
Out[29]:
In [30]:
encoded_Y = np.array([output_codes[yi] for yi in Y])
In [31]:
encoded_Y
Out[31]:
In [32]:
model.fit(X_tfidf, encoded_Y)
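The notebook stops after the fit, so here is one possible next cell for evaluating the random codes: decode each prediction back to the nearest output code and check training accuracy. The nearest-code (Euclidean) decoding rule and the variable names below are assumptions, not something from the original notebook.
In [ ]:
# decode each prediction to the label whose code is nearest (assumed Euclidean rule)
labels_sorted = sorted(output_codes)
code_matrix = np.array([output_codes[label] for label in labels_sorted])  # (n_labels, n_bits)

predicted = model.predict(X_tfidf)  # (n_samples, n_bits), real-valued code predictions

dists = ((predicted[:, None, :] - code_matrix[None, :, :]) ** 2).sum(axis=2)
decoded = np.array(labels_sorted)[dists.argmin(axis=1)]
(decoded == Y).mean()  # training accuracy under the random codes
Below is a sketch of the re-estimation step described in the introduction: better codes are found by averaging the predicted codes of the samples in each class and thresholding the result. The 0.5 threshold and the note on collisions are assumptions.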
In [ ]:
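# re-estimate each label's code as the thresholded mean of its samples' predictions
new_codes = {}
for label in labels_sorted:
    mean_code = predicted[Y == label].mean(axis=0)
    new_codes[label] = tuple((mean_code > 0.5).astype(float))
# thresholding can produce duplicate codes, so collisions would need to be
# resolved before re-encoding Y with new_codes and refitting the model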