Distance and similarity functions



In [ ]:

    
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
import seaborn as sns

import time

%matplotlib inline

A brief introduction to NumPy: http://www.numpy.org/ — for more references, see http://docs.scipy.org/doc/numpy/reference/index.html

Why numpy?



In [ ]:

    
def trad_version(n=10000000):
    """Time an element-wise addition of two sequences using a plain Python loop.

    Parameters
    ----------
    n : int, optional
        Number of elements in each operand. Defaults to 10000000, the size
        used by the original demonstration.

    Returns
    -------
    float
        Wall-clock seconds (difference of two ``time.time()`` readings)
        spent building the operands and adding them element by element.
    """
    t1 = time.time()
    X = range(n)
    Y = range(n)
    Z = []
    # The explicit index loop is deliberate: it is the pure-Python baseline
    # that the vectorised NumPy version is compared against.
    for i in range(len(X)):
        Z.append(X[i] + Y[i])
    return time.time() - t1

def numpy_version(n=10000000):
    """Time an element-wise addition of two NumPy arrays.

    Parameters
    ----------
    n : int, optional
        Number of elements in each operand. Defaults to 10000000, the size
        used by the original demonstration.

    Returns
    -------
    float
        Wall-clock seconds (difference of two ``time.time()`` readings)
        spent creating the two arrays and adding them.
    """
    t1 = time.time()
    X = np.arange(n)
    Y = np.arange(n)
    # The sum is computed only so that it can be timed; its value is not used.
    Z = X + Y
    return time.time() - t1


# Run both implementations once and report the elapsed seconds side by side.
traditional_time = trad_version()
numpy_time = numpy_version()
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Traditional time = " + str(traditional_time))
print("Numpy time       = " + str(numpy_time))

Arrays in numpy



In [ ]:

    
# 1-dimensional arrays
x = np.array([2, 5, 18, 14, 4])
print("\n Deterministic 1-dimensional array \n")
print(x)

# np.random.rand(5) draws 5 values; the result changes on every run.
x = np.random.rand(5)
print("\n Random 1-dimensional array \n")
print(x)

# 2-dimensional arrays: a list of equal-length rows becomes a matrix.
x = np.array([[2, 5, 18, 14, 4], [12, 15, 1, 2, 8]])
print("\n Deterministic 2-dimensional array \n")
print(x)

x = np.random.rand(5, 5)
print("\n Random 2-dimensional array \n")
print(x)
# .shape is the (rows, columns) tuple of the array.
print(x.shape)

Manipulating and aggregating arrays



In [ ]:

    
x = np.random.rand(5)
print(x)
# Adding a scalar applies the addition to every element of the array.
print(x + 1)

Aggregates



In [ ]:

    
x = np.random.rand(2, 4)
print(x)
# Without an axis the aggregate runs over every element and yields a scalar.
# axis=0 reduces over the rows (one value per column);
# axis=1 reduces over the columns (one value per row).
print(np.mean(x))
print(np.mean(x, axis=0))
print(np.std(x))
print(np.std(x, axis=1))
print(np.median(x))
print(np.median(x, axis=1))
print(np.sum(x))
print(np.sum(x, axis=1))
print(np.prod(x))
print(np.prod(x, axis=1))

Generating synthetic data

Random data are the simplest data one can generate. Data following other distributions can be generated with the functions documented at the link below.

http://docs.scipy.org/doc/numpy/reference/routines.random.html



In [ ]:

    
# Two rows of 15 random real values; A[0, :] and A[1, :] are the
# vectors compared in the distance examples that follow.
A = np.random.random_sample((2, 15))
print(A)



In [ ]:

    
# Two rows of 15 random integers drawn from {0, 1} (the upper bound 2 is exclusive).
B = np.random.randint(2, size=(2, 15))
print(B)

Euclidean distance



In [ ]:

    
# Euclidean distance between the two rows of A, written out step by step:
# subtract, square each difference, sum, then take the square root.
D = np.sqrt(np.sum(np.square(A[0, :] - A[1, :])))
print(D)



In [ ]:

    
def my_euclidean_dist(x, y):
    """Return the Euclidean distance between ``x`` and ``y``.

    Parameters
    ----------
    x, y : array_like
        Numeric vectors of matching (or broadcastable) shape. Lists, tuples
        and NumPy arrays of any real dtype are accepted.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((x - y) ** 2))``, summed over every element, so
        multi-dimensional inputs also reduce to a single scalar.
    """
    # Convert to float before subtracting: plain Python lists have no
    # element-wise "-", boolean arrays reject "-", and unsigned integer
    # arrays would wrap around on negative differences.
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.sqrt(np.sum(np.square(diff)))



In [ ]:

    
# Apply the same distance function to the two rows of A and the two rows of B.
d1 = my_euclidean_dist(A[0, :], A[1, :])
d2 = my_euclidean_dist(B[0, :], B[1, :])
print(d1)
print(d2)

We can become more effective and efficient by using scikit-learn, a library for data mining, data analysis and machine learning: http://scikit-learn.org/stable/

Generating data with specific structure using sklearn.datasets



In [ ]:

    
# 100 two-dimensional points grouped around 3 centres; random_state=0 makes
# the sample reproducible. y holds the cluster label (0, 1 or 2) of each point.
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=2,
                          center_box=(-30.0, 30.0), random_state=0)
# A single formatted string prints the same text on Python 2 and Python 3
# (a multi-argument print(...) would print a tuple on Python 2).
print("{} {} {}".format(X.shape, y.shape, type(y)))

# One scatter series per cluster label, each in its own colour.
for label, marker in ((1, 'bo'), (0, 'go'), (2, 'ro')):
    in_cluster = (y == label)
    plt.plot(X[in_cluster, 0], X[in_cluster, 1], marker)



In [ ]:

    
# Pairwise Euclidean distances between all rows of X (a square matrix).
euclidean_dists = metrics.euclidean_distances(X)

# Reorder rows and columns by sorted label so that points sharing a label
# become adjacent in the distance matrix.
z = y
idx = np.argsort(z)
rearranged_dists = euclidean_dists[np.ix_(idx, idx)]

# Show both matrices side by side: original order on the left,
# label-sorted order on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))
heatmap_options = dict(xticklabels=False, yticklabels=False, linewidths=0,
                       square=True, cbar=False)
sns.heatmap(euclidean_dists, ax=ax1, **heatmap_options)
sns.heatmap(rearranged_dists, ax=ax2, **heatmap_options)

Another way of generating data using sklearn.datasets



In [ ]:

    
# A 300 x 50 matrix with 2 biclusters plus noise; shuffle=False keeps the
# generated row/column order, and random_state=0 makes the draw reproducible.
data, rows, columns = sk_data.make_biclusters(
    shape=(300, 50), n_clusters=2, noise=0.5,
    shuffle=False, random_state=0)
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
print(type(data))
print(data.shape)



In [ ]:

    
# Flatten the matrix into one long vector so the histogram covers every entry.
n_rows, n_cols = data.shape
newd = data.reshape(n_rows * n_cols)
plt.hist(newd)



In [ ]:

    
# Binarise the matrix at 40: entries below the threshold become 0, the rest 1.
# NOTE: this overwrites `data` in place. Re-running the cell on the already
# binarised matrix maps every entry to 0, because 0 and 1 are both below 40.
below_threshold = data < 40
at_or_above_threshold = data >= 40
data[below_threshold] = 0
data[at_or_above_threshold] = 1
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)



In [ ]:

    
# Shuffle the rows, then the columns. An index vector is shuffled alongside
# the data each time, so row_idx / col_idx record which original row / column
# ended up at each position.
n_rows, n_cols = data.shape
rows_shuffled, row_idx = sk.utils.shuffle(
    data, np.arange(n_rows), random_state=100)
# shuffle() permutes along the first axis, so transpose to shuffle the columns
# and transpose back afterwards.
cols_shuffled_t, col_idx = sk.utils.shuffle(
    rows_shuffled.T, np.arange(n_cols), random_state=100)
shuffled_data = cols_shuffled_t.T
sns.heatmap(shuffled_data, xticklabels=False, yticklabels=False, linewidths=0)



In [ ]:

    
# Jaccard is a boolean metric. Passing a boolean matrix makes that explicit and
# avoids the data-conversion warning that recent scikit-learn versions emit
# when given float input (expected per the scikit-learn docs; confirm for the
# installed version). For a 0/1 matrix the cast does not change the distances.
jacc_dists = metrics.pairwise_distances(data.astype(bool), Y=None, metric='jaccard', n_jobs=1)

sns.heatmap(jacc_dists, xticklabels=False, yticklabels=False, linewidths=0)



In [ ]:

    
# Jaccard distances between the rows of the shuffled matrix. The boolean cast
# makes the metric's boolean semantics explicit; for a 0/1 matrix it does not
# change the distances.
jacc_dists = metrics.pairwise_distances(shuffled_data.astype(bool), Y=None, metric='jaccard', n_jobs=1)

# row_idx[k] is the original position of shuffled row k, so argsort(row_idx)
# is the inverse permutation: entry i is where original row i now sits.
# This replaces a list.index() lookup per row (quadratic) and no longer
# overwrites the label vector `y` with an unrelated list.
inverse_row_order = np.argsort(row_idx)

# Put rows and columns of the distance matrix back into the original row order.
rearranged_dists = jacc_dists[inverse_row_order, :][:, inverse_row_order]

sns.heatmap(rearranged_dists, xticklabels=False, yticklabels=False, linewidths=0)

We can compute pairwise distances using the sklearn.metrics functions summarized here: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics



In [1]:

    
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    """Load the notebook's custom stylesheet and wrap it for display.

    Reads ``../theme/custom.css`` (resolved against the current working
    directory) and returns its contents wrapped in an ``HTML`` object.
    Any error raised by ``open`` (for example, a missing file) propagates
    to the caller.
    """
    # The context manager closes the file handle as soon as the contents are
    # read, instead of leaving it open until garbage collection.
    with open("../theme/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()









    Out[1]: