Distance and similarity functions


In [ ]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
import seaborn as sns

import time

%matplotlib inline

For a brief introduction to numpy, see http://www.numpy.org/; for more detailed references, see http://docs.scipy.org/doc/numpy/reference/index.html

Why numpy?


In [ ]:
def trad_version(n=10000000):
    """Time an element-wise sum of two integer sequences using a plain Python loop.

    Parameters
    ----------
    n : int, optional
        Number of elements in each sequence. Defaults to 10000000, the size
        that used to be hard-coded, so calling ``trad_version()`` is unchanged.

    Returns
    -------
    float
        Elapsed wall-clock time in seconds, measured with ``time.time()``.
        The measurement includes creating the two input sequences.
    """
    t1 = time.time()
    X = range(n)
    Y = range(n)
    Z = []
    # Deliberately index-based and append-driven: this is the slow baseline
    # that the vectorised numpy version is compared against.
    for i in range(len(X)):
        Z.append(X[i] + Y[i])
    return time.time() - t1

def numpy_version(n=10000000):
    """Time an element-wise sum of two integer arrays using numpy vectorisation.

    Parameters
    ----------
    n : int, optional
        Number of elements in each array. Defaults to 10000000, the size
        that used to be hard-coded, so calling ``numpy_version()`` is unchanged.

    Returns
    -------
    float
        Elapsed wall-clock time in seconds, measured with ``time.time()``.
        The measurement includes creating the two input arrays.
    """
    t1 = time.time()
    X = np.arange(n)
    Y = np.arange(n)
    # A single vectorised addition replaces the explicit Python loop.
    Z = X + Y
    return time.time() - t1


# Run each implementation once and report the elapsed seconds side by side.
traditional_time = trad_version()
numpy_time = numpy_version()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Traditional time = " + str(traditional_time))
print("Numpy time       = " + str(numpy_time))

Arrays in numpy


In [ ]:
# 1-dimensional arrays
x = np.array([2, 5, 18, 14, 4])
print("\n Deterministic 1-dimensional array \n")
print(x)

x = np.random.rand(5)
print("\n Random 1-dimensional array \n")
print(x)

# 2-dimensional arrays
x = np.array([[2, 5, 18, 14, 4], [12, 15, 1, 2, 8]])
print("\n Deterministic 2-dimensional array \n")
print(x)

x = np.random.rand(5, 5)
print("\n Random 2-dimensional array \n")
print(x)
# .shape is the (rows, columns) tuple of the array, here (5, 5).
print(x.shape)

Manipulating and aggregating arrays


In [ ]:
x = np.random.rand(5)
print(x)
# Adding a scalar applies it to every element of the array.
print(x + 1)

Aggregates


In [ ]:
x = np.random.rand(2, 4)
print(x)
# Without an axis the aggregate runs over all entries and yields one number.
# axis=0 collapses the rows (one value per column);
# axis=1 collapses the columns (one value per row).
print(np.mean(x))
print(np.mean(x, axis=0))
print(np.std(x))
print(np.std(x, axis=1))
print(np.median(x))
print(np.median(x, axis=1))
print(np.sum(x))
print(np.sum(x, axis=1))
print(np.prod(x))
print(np.prod(x, axis=1))

Generating synthetic data

Random data are the simplest data one can generate. Data following other distributions can be generated with the functions documented at the link below.

http://docs.scipy.org/doc/numpy/reference/routines.random.html


In [ ]:
# Two rows of 15 uniform samples drawn from [0, 1).
# np.random.rand(2, 15) is the shorthand spelling of this same call.
A = np.random.random_sample((2, 15))
print(A)

In [ ]:
# Random binary matrix: randint(2, ...) draws integers from {0, 1}.
B = np.random.randint(2, size=(2, 15))
print(B)

Euclidean distance


In [ ]:
# Euclidean distance between the two rows of A:
# square root of the summed squared coordinate differences.
D = np.sqrt(np.sum(np.square(A[0, :] - A[1, :])))
print(D)

In [ ]:
def my_euclidean_dist(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Parameters
    ----------
    x, y : array_like
        Real-valued numeric inputs whose shapes can be broadcast together.
        Plain Python sequences are accepted as well as numpy arrays.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((x - y) ** 2))`` taken over all elements.
    """
    # Convert to float before subtracting: this accepts plain lists and stops
    # unsigned-integer inputs from wrapping around (e.g. uint8 0 - 1 == 255).
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.sqrt(np.sum(np.square(x - y)))

In [ ]:
# Distance between the two rows of the real-valued matrix A and of the binary matrix B.
d1 = my_euclidean_dist(A[0, :], A[1, :])
d2 = my_euclidean_dist(B[0, :], B[1, :])
print(d1)
print(d2)

We can become more effective and efficient by using scikit-learn, a set of libraries for data mining, data analysis and machine learning: http://scikit-learn.org/stable/

Generating data with specific structure using sklearn.datasets


In [ ]:
# 100 two-dimensional points grouped around 3 centres; y holds the centre label of each point.
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=2,
                          center_box=(-30.0, 30.0), random_state=0)
# Formatting into one string keeps the output space-separated on both
# Python 2 and Python 3 (a parenthesised multi-argument print would show a tuple on Python 2).
print("{} {} {}".format(X.shape, y.shape, type(y)))

# One colour per label: blue for 1, green for 0, red for 2.
plt.plot(X[y == 1, 0], X[y == 1, 1], 'bo')
plt.plot(X[y == 0, 0], X[y == 0, 1], 'go')
plt.plot(X[y == 2, 0], X[y == 2, 1], 'ro')

In [ ]:
# Full pairwise Euclidean distance matrix between the rows of X.
euclidean_dists = metrics.euclidean_distances(X)

# Sort the points by label, then apply the same ordering to the rows and
# the columns of the distance matrix.
z = y
idx = np.argsort(z)
rearranged_dists = euclidean_dists[np.ix_(idx, idx)]

# Left: distances in the original order. Right: distances in label-sorted order.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))
sns.heatmap(
    euclidean_dists,
    xticklabels=False, yticklabels=False, linewidths=0,
    ax=ax1, square=True, cbar=False,
)
sns.heatmap(
    rearranged_dists,
    xticklabels=False, yticklabels=False, linewidths=0,
    ax=ax2, square=True, cbar=False,
)

Another way of generating data using sklearn.datasets


In [ ]:
# 300 x 50 matrix containing 2 biclusters; shuffle=False leaves rows and
# columns in the order the generator produced them.
data, rows, columns = sk_data.make_biclusters(
    shape=(300, 50), n_clusters=2, noise=0.5,
    shuffle=False, random_state=0)
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
print(type(data))
print(data.shape)

In [ ]:
# Flatten the matrix into one long vector so that every entry
# contributes to a single histogram of values.
newd = data.reshape(-1)
plt.hist(newd)

In [ ]:
# Binarise `data` in place: entries below 40 become 0, entries at or above 40 become 1.
# Both masks are taken before any write, so they describe the original values.
# NOTE: running this cell a second time zeroes everything, because the 1s
# written here are themselves below 40.
below = data < 40
at_or_above = data >= 40
data[below] = 0
data[at_or_above] = 1
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)

In [ ]:
# Index vectors that are shuffled together with the data, so that the
# applied permutations are recorded in row_idx and col_idx.
row_order = np.arange(data.shape[0])
col_order = np.arange(data.shape[1])

# Shuffle the rows first ...
shuffled_data, row_idx = sk.utils.shuffle(data, row_order, random_state=100)
# ... then the columns: shuffle the rows of the transpose and transpose back.
shuffled_data, col_idx = sk.utils.shuffle(shuffled_data.T, col_order, random_state=100)
shuffled_data = shuffled_data.T

sns.heatmap(shuffled_data, xticklabels=False, yticklabels=False, linewidths=0)

In [ ]:
# Pairwise Jaccard distances between the rows of `data`.
jacc_dists = metrics.pairwise_distances(
    data,
    Y=None,
    metric='jaccard',
    n_jobs=1,
)

sns.heatmap(
    jacc_dists,
    xticklabels=False,
    yticklabels=False,
    linewidths=0,
)

In [ ]:
# Pairwise Jaccard distances between the rows of the shuffled matrix.
jacc_dists = metrics.pairwise_distances(shuffled_data, Y=None, metric='jaccard', n_jobs=1)

# row_idx[k] is the original index of shuffled row k. For each original row i
# we need its position in row_idx, i.e. the inverse permutation. The argsort of
# a permutation is exactly that inverse, and it replaces the quadratic
# `row_idx.tolist().index(i)` lookup with a single sort.
y = np.argsort(row_idx).tolist()

# Put rows and columns back into the original (unshuffled) row order.
rearranged_dists = jacc_dists[y, :][:, y]

sns.heatmap(rearranged_dists, xticklabels=False, yticklabels=False, linewidths=0)

We can compute pairwise distances using the sklearn.metrics functions summarized here: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics


In [1]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    """Read the notebook's custom stylesheet and wrap it for display.

    Returns
    -------
    IPython.core.display.HTML
        The contents of ``../theme/custom.css`` wrapped in an ``HTML`` object.

    Raises
    ------
    IOError
        If the stylesheet cannot be opened (``OSError`` on Python 3).
    """
    # The context manager closes the file handle even if reading fails;
    # the previous bare open(...).read() never closed it explicitly.
    with open("../theme/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()


Out[1]: