What's the best way to count non-zero elements in a sparse array?



In [ ]:

    
# joeln: convert to CSC and difference the column pointer array `indptr`;
# consecutive pointers bracket one column's stored entries, so each
# difference is that column's stored-entry count.
# NOTE(review): this counts *stored* entries -- explicitly stored zeros, if
# `array` has any, would be counted too; confirm against the caller's data.
freq_joeln = np.diff(array.tocsc().indptr)
freq_joeln



In [ ]:

    
# eumiro: build a boolean mask of the non-zero entries and sum it per column
freq_eumiro_matrix = (array != 0).sum(axis=0)
# copy the column sums into an ndarray and flatten them to one dimension
freq_eumiro = np.array(freq_eumiro_matrix).reshape(-1)
freq_eumiro



In [ ]:

    
# eumiro, but by row: build a boolean mask of the non-zero entries and sum
# it along axis 1 to get one count per row
row_count_matrix = (array != 0).sum(axis=1)
# copy the row sums into an ndarray and flatten them to one dimension
row_count = np.array(row_count_matrix).ravel()
row_count



In [ ]:

    
# comprehension (so slow!): slice out one column at a time and read its nnz
nrows = array.shape[0]
ncols = array.shape[1]
df = np.array([array[:, col].nnz for col in xrange(ncols)])



In [1]:

    
# nz = np.nonzero(array)



In [ ]:

Sparse covariance and correlation coefficients



In [4]:

    
def sparse_corrcoef(A, B=None):
    '''
    Correlation coefficients between the rows of a sparse matrix.

    Sparse counterpart of ``np.corrcoef``: every row is treated as a
    variable and every column as an observation.  The input itself is never
    converted to a dense array; only the (rows x rows) result is dense.

    Adapted from http://stackoverflow.com/q/19231268/424651

    Parameters
    ----------
    A : scipy.sparse matrix
        Variables in rows, observations in columns.
    B : scipy.sparse matrix, optional
        Extra variables, stacked underneath ``A`` before the computation.

    Returns
    -------
    coeffs : dense 2-D matrix of shape (rows, rows)
        ``coeffs[i, j]`` is the correlation between row ``i`` and row ``j``
        of the (stacked) input.  Rows with zero variance produce ``nan``
        entries, because their diagonal covariance term is zero.
    '''
    if B is not None:
        A = sparse.vstack((A, B), format='csr')

    A = A.astype(np.float64)
    n_obs = A.shape[1]

    # Covariance without subtracting the mean from A (which would fill in
    # every zero and densify the whole matrix).  With s = row sums:
    #   sum_k (a_ik - m_i) * conj(a_jk - m_j) = (A A^H)_ij - s_i * conj(s_j) / n
    # (see http://stackoverflow.com/questions/16062804/)
    # Trade-off: this one-pass form is less numerically stable than explicit
    # centering when the row means are large relative to the row spreads.
    rowsum = np.asarray(A.sum(axis=1)).ravel()
    centering = np.outer(rowsum, rowsum.conjugate()) / n_obs
    gram = A.dot(A.T.conjugate())  # sparse (rows x rows) product
    C = (gram - centering) / (n_obs - 1.)

    # the correlation coefficients are given by
    # C_{i,j} / sqrt(C_{ii} * C_{jj})
    d = np.diag(C)
    coeffs = C / np.sqrt(np.outer(d, d))

    return coeffs



In [ ]:

    
# test sparse_corrcoef against np.corrcoef
a = sparse.rand(100, 100000, density=0.1, format='csr')
b = sparse.rand(100, 100000, density=0.1, format='csr')

# sparse_corrcoef is the function defined in this notebook, not a numpy attribute
coeffs1 = sparse_corrcoef(a, b)
# dense reference: np.corrcoef treats its second argument as additional rows
coeffs2 = np.corrcoef(a.todense(), b.todense())

# parenthesised so the cell runs under both Python 2 and Python 3
print(np.allclose(coeffs1, coeffs2))



In [2]:

    
# correlation coefficients between the rows of `array`
# (sparse_corrcoef returns these, not the covariance it computes internally)
array_corrcoefs = sparse_corrcoef(array)
array_corrcoefs.shape









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-63b4c253d973> in <module>()
      1 # covariance
----> 2 array_corrcoefs = sparse_corrcoef(array)
      3 array_corrcoefs.shape

NameError: name 'sparse_corrcoef' is not defined



In [ ]:

    
# correlation coefficients among the rows of `a` alone, drawn as an image
coeffs3 = sparse_corrcoef(a)
plt.imshow(coeffs3)