In [55]:
%matplotlib inline
import numpy as np
In [56]:
# Display arrays rounded to two decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=2)
In [57]:
def cosine_dist(u, v, axis=-1):
    """Return the cosine distance, 1 - cos(theta), between u and v.

    The dot product and both norms are written as sums of element-wise
    products reduced over `axis`, so u and v may be single vectors or
    broadcastable arrays holding many vectors.

    Parameters
    ----------
    u, v : array_like supporting `*`, `**` and `.sum(axis)`
        Inputs whose shapes broadcast against each other.
    axis : int, optional
        Axis along which the vector components lie (default: last axis).

    Returns
    -------
    Scalar or array of distances: 0 for vectors pointing the same way,
    1 for orthogonal vectors, 2 for opposite vectors.

    Notes
    -----
    A zero-length vector makes the denominator 0, so the result is nan/inf.
    """
    dot = (u * v).sum(axis)
    norm_u = np.sqrt((u ** 2).sum(axis))
    norm_v = np.sqrt((v ** 2).sum(axis))
    # Distance, not similarity: subtract the cosine from 1.
    return 1 - dot / (norm_u * norm_v)
In [58]:
# Two small example vectors used by the following cells.
u, v = np.array([1, 2, 3]), np.array([4, 5, 6])
Note 1: We write the dot product as the sum of element-wise products. This lets the same expression generalize from single vectors to matrices (collections of vectors): summing over a chosen axis gives one dot product per pair of vectors. The norms in the denominator are calculated in the same way.
In [59]:
# Dot product through matrix multiplication: 1*4 + 2*5 + 3*6 = 32.
np.matmul(u, v)
Out[59]:
In [60]:
# The same dot product written as a sum of element-wise products (evaluates to 32).
np.sum(u * v)
Out[60]:
Note 2: Broadcasting
In [61]:
# 2 x 3 float matrix used to illustrate broadcasting.
M = np.array([[1, 2, 3], [4, 5, 6]], dtype=float)
M.shape
Out[61]:
Note 2A: Broadcasting for M as collection of row vectors. How we broadcast and which axis to broadcast over are determined by the need to end up with a 2x2 matrix.
In [62]:
# Insert a length-1 axis in front of, or between, the existing axes:
# for the 2 x 3 matrix M this gives shapes (1, 2, 3) and (2, 1, 3).
M[np.newaxis, :, :].shape, M[:, np.newaxis, :].shape
Out[62]:
In [63]:
# Broadcasting (1, 2, 3) against (2, 1, 3) pairs every row with every row: shape (2, 2, 3).
np.add(M[np.newaxis, :, :], M[:, np.newaxis, :]).shape
Out[63]:
In [64]:
# Reduce over the last axis (the vector components) to get the 2 x 2 matrix
# of cosine distances between every pair of rows of M.
cosine_dist(M[np.newaxis, :, :], M[:, np.newaxis, :], axis=2)
Out[64]:
Note 2B: Broadcasting for M as a collection of column vectors. How we broadcast and which axis to broadcast over are determined by the need to end up with a 3x3 matrix.
In [65]:
# Keep the row axis first and insert a length-1 axis in the middle or at the end:
# for the 2 x 3 matrix M this gives shapes (2, 1, 3) and (2, 3, 1).
M[:, np.newaxis, :].shape, M[:, :, np.newaxis].shape
Out[65]:
In [66]:
# Broadcasting (2, 1, 3) against (2, 3, 1) pairs every column with every column: shape (2, 3, 3).
np.add(M[:, np.newaxis, :], M[:, :, np.newaxis]).shape
Out[66]:
In [67]:
# Reduce over the first axis (the rows, i.e. the components of each column vector)
# to get the 3 x 3 matrix of cosine distances between every pair of columns of M.
cosine_dist(M[:, np.newaxis, :], M[:, :, np.newaxis], axis=0)
Out[67]:
Note 1: Using collections.Counter and pandas.DataFrame reduces the amount of code to write.
In [68]:
# 12 x 9 matrix of small non-negative integer counts. This rebinds `M`, which
# earlier held the 2 x 3 broadcasting example.
# NOTE(review): presumably a term-document count matrix (rows = terms,
# columns = documents) — confirm against the data source.
M = np.array([[1, 0, 0, 1, 0, 0, 0, 0, 0],
[1, 0, 1, 0, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 1, 0, 1, 0, 0, 0, 0],
[0, 1, 1, 2, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0, 0, 0, 0],
[0, 1, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 1, 1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 1, 1, 1, 0],
[0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 1, 1]])
In [69]:
# (12, 9): 12 rows and 9 columns.
np.shape(M)
Out[69]:
In [70]:
# Thin SVD of M (full_matrices=False). np.linalg.svd returns the right singular
# vectors already transposed, so `V` here holds V^T and M is recovered as
# U @ np.diag(s) @ V with no further transpose. `s` is sorted in descending order.
U, s, V = np.linalg.svd(M, full_matrices=False)
In [71]:
# Shapes of the three thin-SVD factors; for the 12 x 9 matrix M these are
# (12, 9), (9,) and (9, 9).
tuple(factor.shape for factor in (U, s, V))
Out[71]:
In [72]:
# Rank-2 approximation of M: keep the two largest singular values and zero the rest.
# Truncate a copy so `s` still holds the full set of singular values afterwards
# (the cell stays idempotent and `s` stays meaningful for later inspection).
s_trunc = s.copy()
s_trunc[2:] = 0
# `V` already holds V^T (as returned by np.linalg.svd), so no transpose is needed.
M2 = U @ np.diag(s_trunc) @ V
In [73]:
from scipy.stats import spearmanr
In [74]:
# Spearman rank correlation between every pair of columns of M2.
# Only the correlation matrix is kept; the p-values are discarded.
r2 = spearmanr(M2).correlation
In [75]:
# Display the matrix of pairwise Spearman correlations between the columns of M2.
r2
Out[75]:
In [77]:
# Correlations among the first five columns: take the 5 x 5 top-left block and
# read its strictly-lower triangle (each pair once, diagonal excluded).
r2[:5, :5][np.tril_indices_from(r2[:5, :5], -1)]
Out[77]:
In [78]:
# Correlations among the columns from index 5 onward: take the bottom-right block
# and read its strictly-lower triangle (each pair once, diagonal excluded).
# tril_indices_from returns indices relative to the sub-block it is given, so they
# must index that same sub-block; applied to r2 directly they would select
# entries from r2's top-left corner instead.
r2[5:, 5:][np.tril_indices_from(r2[5:, 5:], -1)]
Out[78]:
Follow explanation here
# Rank the existing documents against a new query document ('mystery.txt').
# NOTE(review): `sparsesvd`, `csc_matrix`, `df`, `tf_idf` and `x` are not defined
# or imported anywhere in the visible notebook — confirm where they come from.
k = 10
# NOTE(review): `k` is set above but the literal 100 is passed here — confirm
# which rank is intended.
T, s, D = sparsesvd(csc_matrix(df), k=100)
# Read the query text inside a context manager so the file handle is closed.
with open('mystery.txt') as fh:
    doc = {'mystery': fh.read()}
terms = tf_idf(doc)
# Align the query's term weights with the rows of df; terms absent from the query become 0.
query_terms = df.join(terms).fillna(0)['mystery']
# Multiply the query vector by T.T and by diag(1/s) — presumably projecting it
# into the latent space spanned by the decomposition; confirm against the reference.
q = query_terms.T.dot(T.T.dot(np.diag(1.0/s)))
# NOTE(review): no axis is passed to cosine_dist here. Also, cosine_dist returns a
# distance (1 - cosine), so reversing the argsort lists the LARGEST distances
# first; if most-similar-first is intended, drop the [::-1] — confirm.
ranked_docs = df.columns[np.argsort(cosine_dist(q, x))][::-1]
In [ ]: