In [ ]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
import seaborn as sns
import time
%matplotlib inline
In [ ]:
def trad_version():
    # Element-wise addition of two large sequences using an explicit Python loop
    t1 = time.time()
    X = range(10000000)
    Y = range(10000000)
    Z = []
    for i in range(len(X)):
        Z.append(X[i] + Y[i])
    return time.time() - t1

def numpy_version():
    # The same element-wise addition, vectorized with numpy arrays
    t1 = time.time()
    X = np.arange(10000000)
    Y = np.arange(10000000)
    Z = X + Y
    return time.time() - t1

traditional_time = trad_version()
numpy_time = numpy_version()
print("Traditional time = " + str(traditional_time))
print("Numpy time = " + str(numpy_time))
In [ ]:
# 1-dimensional arrays
x = np.array([2, 5, 18, 14, 4])
print("\n Deterministic 1-dimensional array \n")
print(x)

x = np.random.rand(5)
print("\n Random 1-dimensional array \n")
print(x)

# 2-dimensional arrays
x = np.array([[2, 5, 18, 14, 4], [12, 15, 1, 2, 8]])
print("\n Deterministic 2-dimensional array \n")
print(x)

x = np.random.rand(5, 5)
print("\n Random 2-dimensional array \n")
print(x)
print(x.shape)
In [ ]:
x = np.random.rand(5)
print(x)
print(x + 1)  # the scalar 1 is broadcast to every element
In [ ]:
x = np.random.rand(2, 4)
print(x)
# Aggregates over the whole array, or along a given axis
# (axis=0: down the columns, axis=1: across the rows)
print(np.mean(x))
print(np.mean(x, 0))
print(np.std(x))
print(np.std(x, 1))
print(np.median(x))
print(np.median(x, 1))
print(np.sum(x))
print(np.sum(x, 1))
print(np.prod(x))
print(np.prod(x, 1))
Uniform random data are the simplest data one can generate. Data following other distributions can be generated with the numpy.random routines documented here:
http://docs.scipy.org/doc/numpy/reference/routines.random.html
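For instance, here is a minimal sketch of sampling from a couple of other distributions; the distribution parameters and sample sizes below are illustrative assumptions, not part of this notebook.
In [ ]:
# Samples from a few other distributions (illustrative parameters)
normal_samples = np.random.randn(1000)                 # standard normal N(0, 1)
gaussian_samples = np.random.normal(5, 2, 1000)        # normal with mean 5, std 2
binomial_samples = np.random.binomial(10, 0.3, 1000)   # 10 trials, success probability 0.3
print(np.mean(normal_samples), np.std(normal_samples))
print(np.mean(gaussian_samples), np.std(gaussian_samples))
print(np.mean(binomial_samples))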
In [ ]:
# Two points in 15 dimensions with uniformly random real coordinates
A = np.random.rand(2, 15)
print(A)
In [ ]:
# Two random binary (0/1) vectors of length 15
B = np.random.randint(2, size=(2, 15))
print(B)
In [ ]:
# Euclidean distance between the two rows of A
D = np.sqrt(np.sum(np.square(A[0, :] - A[1, :])))
print(D)
In [ ]:
def my_euclidean_dist(x, y):
    return np.sqrt(np.sum(np.square(x - y)))
In [ ]:
d1 = my_euclidean_dist(A[0, :], A[1, :])
d2 = my_euclidean_dist(B[0, :], B[1, :])
print(d1)
print(d2)
Generating data with specific structure using sklearn.datasets
In [ ]:
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=2,
                          center_box=(-30.0, 30.0), random_state=0)
print(X.shape, y.shape, type(y))
# Scatter-plot each of the three clusters with a different color
plt.plot(X[y == 1, 0], X[y == 1, 1], 'bo')
plt.plot(X[y == 0, 0], X[y == 0, 1], 'go')
plt.plot(X[y == 2, 0], X[y == 2, 1], 'ro')
In [ ]:
euclidean_dists = metrics.euclidean_distances(X)
# print(euclidean_dists.shape)

# Reorder the points by cluster label so that points in the same cluster are adjacent
z = y
idx = np.argsort(z)
rearranged_dists = euclidean_dists[idx, :][:, idx]

# Plot the original and the rearranged distance matrices side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 10))
sns.heatmap(euclidean_dists, xticklabels=False, yticklabels=False, linewidths=0, ax=ax1, square=True, cbar=False)
sns.heatmap(rearranged_dists, xticklabels=False, yticklabels=False, linewidths=0, ax=ax2, square=True, cbar=False)
Another way of generating data using sklearn.datasets
In [ ]:
data, rows, columns = sk_data.make_biclusters(
    shape=(300, 50), n_clusters=2, noise=0.5,
    shuffle=False, random_state=0)
#data[data>5] = 1
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
print(type(data))
print(data.shape)
In [ ]:
# Flatten the matrix into a single vector and look at the distribution of its values
newd = np.reshape(data, data.shape[0] * data.shape[1])
plt.hist(newd)
In [ ]:
# Threshold the data to obtain a binary (0/1) matrix
data[data < 40] = 0
data[data >= 40] = 1
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
In [ ]:
# Shuffle the rows and then the columns, keeping track of both permutations
shuffled_data, row_idx = sk.utils.shuffle(data, np.arange(data.shape[0]), random_state=100)
shuffled_data, col_idx = sk.utils.shuffle(shuffled_data.T, np.arange(data.shape[1]), random_state=100)
shuffled_data = shuffled_data.T
sns.heatmap(shuffled_data, xticklabels=False, yticklabels=False, linewidths=0)
In [ ]:
# Pairwise Jaccard distances between the rows of the binary matrix
jacc_dists = metrics.pairwise_distances(data, Y=None, metric='jaccard', n_jobs=1)
sns.heatmap(jacc_dists, xticklabels=False, yticklabels=False, linewidths=0)
In [ ]:
jacc_dists = metrics.pairwise_distances(shuffled_data, Y=None, metric='jaccard', n_jobs=1)
# Invert the row permutation to restore the original (clustered) ordering
y = [row_idx.tolist().index(i) for i in range(len(row_idx))]
rearranged_dists = jacc_dists[y, :][:, y]
sns.heatmap(rearranged_dists, xticklabels=False, yticklabels=False, linewidths=0)
We can compute pairwise distances using the sklearn.metrics functions summarized here: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
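For instance, here is a minimal sketch using metrics.pairwise_distances with a couple of alternative metrics; the choice of cosine and manhattan below is purely illustrative.
In [ ]:
# Pairwise distances under a few different metrics (illustrative choices)
cosine_dists = metrics.pairwise_distances(X, metric='cosine')
manhattan_dists = metrics.pairwise_distances(X, metric='manhattan')
print(cosine_dists.shape, manhattan_dists.shape)
# euclidean_distances(X) and pairwise_distances(X, metric='euclidean') give the same matrix
print(np.allclose(metrics.euclidean_distances(X), metrics.pairwise_distances(X, metric='euclidean')))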
In [1]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    styles = open("../theme/custom.css", "r").read()
    return HTML(styles)
css_styling()
Out[1]: