What's the best way to count non-zero elements in a sparse array?
In [ ]:
# joeln: the CSC index pointer marks where each column's stored entries
# begin and end, so consecutive differences are per-column entry counts.
csc_indptr = array.tocsc().indptr
freq_joeln = np.diff(csc_indptr)
freq_joeln
In [ ]:
# eumiro: build a boolean "is non-zero" mask and sum it down each column
freq_eumiro_matrix = (array != 0).sum(axis=0)
# the sum comes back 2-D; flatten it into a plain 1-D count vector
freq_eumiro = np.array(freq_eumiro_matrix).reshape(-1)
freq_eumiro
In [ ]:
# eumiro, but by row: sum the boolean "is non-zero" mask along axis 1
row_count_matrix = (array != 0).sum(1)
# flatten the 2-D sum result (row_count_matrix, not the not-yet-defined
# row_count) into a 1-D vector of per-row counts
row_count = np.array(row_count_matrix).ravel()
In [ ]:
# comprehension (so slow!): slices out one column at a time and reads the
# number of stored entries in that slice
nrows, ncols = array.shape
# `range` rather than the Python 2-only `xrange`, so this also runs on Python 3
df = np.array([array[:, col].nnz for col in range(ncols)])
In [1]:
# nz = np.nonzero(array)
In [ ]:
In [4]:
def sparse_corrcoef(A, B=None):
    '''
    Correlation coefficients between the rows of a sparse matrix.

    Each row is treated as a variable and each column as an observation.
    If B is given it is stacked underneath A, so the result covers the rows
    of both matrices.

    The data matrix itself is never made dense: the covariance is obtained
    from the sparse product A * A^H minus a rank-one correction built from
    the row sums. Only the (n_rows x n_rows) result is dense.

    Rows with zero variance give a zero denominator in the final division;
    this is not handled specially.

    From http://stackoverflow.com/q/19231268/424651
    '''
    if B is not None:
        A = sparse.vstack((A, B), format='csr')
    A = A.astype(np.float64)
    n_obs = A.shape[1]

    # compute the covariance matrix
    # (see http://stackoverflow.com/questions/16062804/)
    # With s = row sums and m = s / n, expanding (A - m)(A - m)^H gives
    # A A^H - s s^H / n, which avoids subtracting the mean from every entry
    # (that subtraction would turn A into a dense matrix).
    rowsum = A.sum(1)
    centering = rowsum.dot(rowsum.T.conjugate()) / n_obs
    C = (A.dot(A.T.conjugate()) - centering) / (n_obs - 1.)

    # the correlation coefficients are given by
    # C_{i,j} / sqrt(C_{ii} * C_{jj})
    d = np.diag(C)
    coeffs = C / np.sqrt(np.outer(d, d))
    return coeffs
In [ ]:
# test sparse_corrcoef against np.corrcoef
a = sparse.rand(100, 100000, density=0.1, format='csr')
b = sparse.rand(100, 100000, density=0.1, format='csr')
# sparse_corrcoef is the function defined in this notebook; it is not part
# of the numpy namespace, so it must not be looked up on `np`
coeffs1 = sparse_corrcoef(a, b)
coeffs2 = np.corrcoef(a.todense(), b.todense())
# parenthesised single-argument print works on both Python 2 and Python 3
print(np.allclose(coeffs1, coeffs2))
In [2]:
# correlation coefficients between the rows of `array`
# (sparse_corrcoef returns the normalised coefficients, not the raw covariance)
array_corrcoefs = sparse_corrcoef(array)
array_corrcoefs.shape
In [ ]:
# correlation matrix of the rows of `a` on its own (no second matrix), shown as an image
# NOTE(review): `plt` is presumably matplotlib.pyplot; its import is not visible here
coeffs3 = sparse_corrcoef(a)
plt.imshow(coeffs3)