In [55]:
%matplotlib inline
import numpy as np

In [56]:
np.set_printoptions(precision=2, suppress=True)

Exercise 1


In [57]:
def cosine_dist(u, v, axis):
    """Returns cosine of angle betwwen two vectors."""
    return 1 - (u*v).sum(axis)/(np.sqrt((u**2).sum(axis))*np.sqrt((v**2).sum(axis)))

In [58]:
u = np.array([1,2,3])
v = np.array([4,5,6])

Note 1: We write the dot product as the sum of element-wise products. This allows us to generalize when u, v are matrices rather than vectors. The norms in the denominator are calculated in the same way.


In [59]:
u @ v


Out[59]:
32

In [60]:
(u * v).sum()


Out[60]:
32

Note 2: Broadcasting


In [61]:
M = np.array([[1.,2,3],[4,5,6]])
M.shape


Out[61]:
(2, 3)

Note 2A: Broadcasting for M as collection of row vectors. How we broadcast and which axis to broadcast over are determined by the need to end up with a 2x2 matrix.


In [62]:
M[None,:,:].shape, M[:,None,:].shape


Out[62]:
((1, 2, 3), (2, 1, 3))

In [63]:
(M[None,:,:] + M[:,None,:]).shape


Out[63]:
(2, 2, 3)

In [64]:
cosine_dist(M[None,:,:], M[:,None,:], 2)


Out[64]:
array([[ 0.  ,  0.03],
       [ 0.03,  0.  ]])

Note 2B: Broadcasting for M as a collection of column vectors. How we broadcast and which axis to broadcast over are determined by the need to end up with a 3x3 matrix.


In [65]:
M[:,None,:].shape, M[:,:,None].shape


Out[65]:
((2, 1, 3), (2, 3, 1))

In [66]:
(M[:,None,:] + M[:,:,None]).shape


Out[66]:
(2, 3, 3)

In [67]:
cosine_dist(M[:,None,:], M[:,:,None], 0)


Out[67]:
array([[ 0.  ,  0.01,  0.02],
       [ 0.01, -0.  ,  0.  ],
       [ 0.02,  0.  ,  0.  ]])

Exeercise 2

Note 1: Using collections.Counter and pandas.DataFrame reduces the amount of code to write.

Exercise 3


In [68]:
M = np.array([[1, 0, 0, 1, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 0, 0, 0, 0, 0],
    [1, 1, 0, 0, 0, 0, 0, 0, 0],
    [0, 1, 1, 0, 1, 0, 0, 0, 0],
    [0, 1, 1, 2, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 1, 0, 0, 0, 0],
    [0, 1, 0, 0, 1, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 1, 1, 1, 0],
    [0, 0, 0, 0, 0, 0, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 0, 1, 1]])

In [69]:
M.shape


Out[69]:
(12, 9)

In [70]:
U, s, V = np.linalg.svd(M, full_matrices=False)

In [71]:
U.shape, s.shape, V.shape


Out[71]:
((12, 9), (9,), (9, 9))

In [72]:
s[2:] = 0
M2 = U @ np.diag(s) @ V

In [73]:
from scipy.stats import spearmanr

In [74]:
r2 = spearmanr(M2)[0]

In [75]:
r2


Out[75]:
array([[ 1.  ,  0.85,  1.  ,  1.  ,  0.72, -0.84, -0.84, -0.84, -0.8 ],
       [ 0.85,  1.  ,  0.85,  0.85,  0.97, -0.56, -0.56, -0.56, -0.48],
       [ 1.  ,  0.85,  1.  ,  1.  ,  0.72, -0.84, -0.84, -0.84, -0.8 ],
       [ 1.  ,  0.85,  1.  ,  1.  ,  0.72, -0.84, -0.84, -0.84, -0.8 ],
       [ 0.72,  0.97,  0.72,  0.72,  1.  , -0.39, -0.39, -0.39, -0.3 ],
       [-0.84, -0.56, -0.84, -0.84, -0.39,  1.  ,  1.  ,  1.  ,  0.98],
       [-0.84, -0.56, -0.84, -0.84, -0.39,  1.  ,  1.  ,  1.  ,  0.98],
       [-0.84, -0.56, -0.84, -0.84, -0.39,  1.  ,  1.  ,  1.  ,  0.98],
       [-0.8 , -0.48, -0.8 , -0.8 , -0.3 ,  0.98,  0.98,  0.98,  1.  ]])

In [77]:
r2[np.tril_indices_from(r2[:5, :5], -1)]


Out[77]:
array([ 0.85,  1.  ,  0.85,  1.  ,  0.85,  1.  ,  0.72,  0.97,  0.72,  0.72])

In [78]:
r2[np.tril_indices_from(r2[5:, 5:], -1)]


Out[78]:
array([ 0.85,  1.  ,  0.85,  1.  ,  0.85,  1.  ])

Exercise 4

  • Part 2 is similar to previous questions
  • Part 3 is Googling
  • Part 4: defining the query vector

Follow explanation here

k = 10
T, s, D = sparsesvd(csc_matrix(df), k=100)

doc = {'mystery': open('mystery.txt').read()}
terms = tf_idf(doc)
query_terms = df.join(terms).fillna(0)['mystery']
q = query_terms.T.dot(T.T.dot(np.diag(1.0/s)))

ranked_docs = df.columns[np.argsort(cosine_dist(q, x))][::-1]

In [ ]: