In [3]:

    
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Data Generation



In [4]:

    
np.random.seed(10)
p, q = (np.random.rand(i, 2) for i in (4, 5))
p_big, q_big = (np.random.rand(i, 80) for i in (100, 120))

print(p, "\n\n", q)









    



[[0.77132064 0.02075195]
 [0.63364823 0.74880388]
 [0.49850701 0.22479665]
 [0.19806286 0.76053071]] 

 [[0.16911084 0.08833981]
 [0.68535982 0.95339335]
 [0.00394827 0.51219226]
 [0.81262096 0.61252607]
 [0.72175532 0.29187607]]






    Out[4]:





1.4142135623730951

Solution



In [9]:

    
def naive(p, q):
    x = []
    for i in range(len(p)):
        x.append([])
        for j in range(len(q)):
            x.append(np.sqrt((p[i][0] - q[i][0]) + (p[i][1] - q[i][1])))
    return x

Use matching indices

Instead of iterating through indices, one can use them directly to parallelize the operations with Numpy.



In [10]:

    
rows, cols = np.indices((p.shape[0], q.shape[0]))
print(rows, end='\n\n')
print(cols)









    



[[0 0 0 0 0]
 [1 1 1 1 1]
 [2 2 2 2 2]
 [3 3 3 3 3]]

[[0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]]



In [11]:

    
print(p[rows.ravel()], end='\n\n')
print(q[cols.ravel()])









    



[[0.77132064 0.02075195]
 [0.77132064 0.02075195]
 [0.77132064 0.02075195]
 [0.77132064 0.02075195]
 [0.77132064 0.02075195]
 [0.63364823 0.74880388]
 [0.63364823 0.74880388]
 [0.63364823 0.74880388]
 [0.63364823 0.74880388]
 [0.63364823 0.74880388]
 [0.49850701 0.22479665]
 [0.49850701 0.22479665]
 [0.49850701 0.22479665]
 [0.49850701 0.22479665]
 [0.49850701 0.22479665]
 [0.19806286 0.76053071]
 [0.19806286 0.76053071]
 [0.19806286 0.76053071]
 [0.19806286 0.76053071]
 [0.19806286 0.76053071]]

[[0.16911084 0.08833981]
 [0.68535982 0.95339335]
 [0.00394827 0.51219226]
 [0.81262096 0.61252607]
 [0.72175532 0.29187607]
 [0.16911084 0.08833981]
 [0.68535982 0.95339335]
 [0.00394827 0.51219226]
 [0.81262096 0.61252607]
 [0.72175532 0.29187607]
 [0.16911084 0.08833981]
 [0.68535982 0.95339335]
 [0.00394827 0.51219226]
 [0.81262096 0.61252607]
 [0.72175532 0.29187607]
 [0.16911084 0.08833981]
 [0.68535982 0.95339335]
 [0.00394827 0.51219226]
 [0.81262096 0.61252607]
 [0.72175532 0.29187607]]



In [19]:

    
def with_indices(p, q):
    diff = p[rows.ravel()] - q[cols.ravel()]
    return np.sqrt(diff.dot(diff.T))

Use a library

scipy is the equivalent of matlab toolboxes and have a lot to offer. Actually the pairwise computation is part of the library through the spatial module.



In [20]:

    
from scipy.spatial.distance import cdist

def scipy_version(p, q):
    return cdist(p, q)

Numpy Magic



In [21]:

    
def tensor_broadcasting(p, q):
    return np.sqrt(np.sum((p[:,np.newaxis,:]-q[np.newaxis,:,:])**2, axis=2))

Compare methods



In [22]:

    
methods = [naive, with_indices, scipy_version, tensor_broadcasting]
timers = []
for f in methods:
    r = %timeit -o f(p_big, q_big)
    timers.append(r)









    



/Users/cosminrusu/miniconda3/envs/ml-ex01/lib/python3.7/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in sqrt
  






    



35.3 ms ± 710 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)






    



/Users/cosminrusu/miniconda3/envs/ml-ex01/lib/python3.7/site-packages/ipykernel_launcher.py:3: RuntimeWarning: invalid value encountered in sqrt
  This is separate from the ipykernel package so we can avoid doing imports until






    



21.9 µs ± 504 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
794 µs ± 19.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.72 ms ± 35.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)



In [23]:

    
plt.figure(figsize=(10,6))
plt.bar(np.arange(len(methods)), [r.best*1000 for r in timers], log=False)  # Set log to True for logarithmic scale
plt.xticks(np.arange(len(methods))+0.2, [f.__name__ for f in methods], rotation=30)
plt.xlabel('Method')
plt.ylabel('Time (ms)')
plt.show()



In [ ]: