In [55]:

    
%matplotlib inline
import numpy as np



In [56]:

    
np.set_printoptions(precision=2, suppress=True)

Exercise 1



In [57]:

    
def cosine_dist(u, v, axis):
    """Returns cosine of angle betwwen two vectors."""
    return 1 - (u*v).sum(axis)/(np.sqrt((u**2).sum(axis))*np.sqrt((v**2).sum(axis)))



In [58]:

    
u = np.array([1,2,3])
v = np.array([4,5,6])

Note 1: We write the dot product as the sum of element-wise products. This allows us to generalize when u, v are matrices rather than vectors. The norms in the denominator are calculated in the same way.



In [59]:

    
u @ v









    Out[59]:





32



In [60]:

    
(u * v).sum()









    Out[60]:





32

Note 2: Broadcasting



In [61]:

    
M = np.array([[1.,2,3],[4,5,6]])
M.shape









    Out[61]:





(2, 3)

Note 2A: Broadcasting for M as collection of row vectors. How we broadcast and which axis to broadcast over are determined by the need to end up with a 2x2 matrix.



In [62]:

    
M[None,:,:].shape, M[:,None,:].shape









    Out[62]:





((1, 2, 3), (2, 1, 3))



In [63]:

    
(M[None,:,:] + M[:,None,:]).shape









    Out[63]:





(2, 2, 3)



In [64]:

    
cosine_dist(M[None,:,:], M[:,None,:], 2)









    Out[64]:





array([[ 0.  ,  0.03],
       [ 0.03,  0.  ]])

Note 2B: Broadcasting for M as a collection of column vectors. How we broadcast and which axis to broadcast over are determined by the need to end up with a 3x3 matrix.



In [65]:

    
M[:,None,:].shape, M[:,:,None].shape









    Out[65]:





((2, 1, 3), (2, 3, 1))



In [66]:

    
(M[:,None,:] + M[:,:,None]).shape









    Out[66]:





(2, 3, 3)



In [67]:

    
cosine_dist(M[:,None,:], M[:,:,None], 0)









    Out[67]:





array([[ 0.  ,  0.01,  0.02],
       [ 0.01, -0.  ,  0.  ],
       [ 0.02,  0.  ,  0.  ]])

Exeercise 2

Note 1: Using collections.Counter and pandas.DataFrame reduces the amount of code to write.

Exercise 3



In [68]:

    
M = np.array([[1, 0, 0, 1, 0, 0, 0, 0, 0],
    [1, 0, 1, 0, 0, 0, 0, 0, 0],
    [1, 1, 0, 0, 0, 0, 0, 0, 0],
    [0, 1, 1, 0, 1, 0, 0, 0, 0],
    [0, 1, 1, 2, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 1, 0, 0, 0, 0],
    [0, 1, 0, 0, 1, 0, 0, 0, 0],
    [0, 0, 1, 1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 0, 1, 1, 1, 0],
    [0, 0, 0, 0, 0, 0, 1, 1, 1],
    [0, 0, 0, 0, 0, 0, 0, 1, 1]])



In [69]:

    
M.shape









    Out[69]:





(12, 9)



In [70]:

    
U, s, V = np.linalg.svd(M, full_matrices=False)



In [71]:

    
U.shape, s.shape, V.shape









    Out[71]:





((12, 9), (9,), (9, 9))



In [72]:

    
s[2:] = 0
M2 = U @ np.diag(s) @ V



In [73]:

    
from scipy.stats import spearmanr



In [74]:

    
r2 = spearmanr(M2)[0]



In [75]:

    
r2









    Out[75]:





array([[ 1.  ,  0.85,  1.  ,  1.  ,  0.72, -0.84, -0.84, -0.84, -0.8 ],
       [ 0.85,  1.  ,  0.85,  0.85,  0.97, -0.56, -0.56, -0.56, -0.48],
       [ 1.  ,  0.85,  1.  ,  1.  ,  0.72, -0.84, -0.84, -0.84, -0.8 ],
       [ 1.  ,  0.85,  1.  ,  1.  ,  0.72, -0.84, -0.84, -0.84, -0.8 ],
       [ 0.72,  0.97,  0.72,  0.72,  1.  , -0.39, -0.39, -0.39, -0.3 ],
       [-0.84, -0.56, -0.84, -0.84, -0.39,  1.  ,  1.  ,  1.  ,  0.98],
       [-0.84, -0.56, -0.84, -0.84, -0.39,  1.  ,  1.  ,  1.  ,  0.98],
       [-0.84, -0.56, -0.84, -0.84, -0.39,  1.  ,  1.  ,  1.  ,  0.98],
       [-0.8 , -0.48, -0.8 , -0.8 , -0.3 ,  0.98,  0.98,  0.98,  1.  ]])



In [77]:

    
r2[np.tril_indices_from(r2[:5, :5], -1)]









    Out[77]:





array([ 0.85,  1.  ,  0.85,  1.  ,  0.85,  1.  ,  0.72,  0.97,  0.72,  0.72])



In [78]:

    
r2[np.tril_indices_from(r2[5:, 5:], -1)]









    Out[78]:





array([ 0.85,  1.  ,  0.85,  1.  ,  0.85,  1.  ])

Exercise 4

Part 2 is similar to previous questions
Part 3 is Googling
Part 4: defining the query vector

Follow explanation here

k = 10
T, s, D = sparsesvd(csc_matrix(df), k=100)

doc = {'mystery': open('mystery.txt').read()}
terms = tf_idf(doc)
query_terms = df.join(terms).fillna(0)['mystery']
q = query_terms.T.dot(T.T.dot(np.diag(1.0/s)))

ranked_docs = df.columns[np.argsort(cosine_dist(q, x))][::-1]



In [ ]: