Content under Creative Commons Attribution license CC-BY 4.0, code under MIT license (c)2014 Sergio Rojas (srojas@usb.ve) and Erik A Christensen (erikcny@aol.com).
This chapter deals with tools appropriate for data mining, and explores modules such as scipy.stats (for statistics), scipy.spatial (for spatial data structures and algorithms), and scipy.cluster (for clustering and vector quantization).
In [1]:
%matplotlib inline
Please check the book for a full explanation of what the following code does and what its output represents.
In [2]:
import scipy.misc
from scipy.stats import signaltonoise
from scipy.stats import norm # Gaussian distribution
lena=scipy.misc.lena().astype(float)
lena+= norm.rvs(loc=0,scale=16,size=lena.shape)
In [3]:
signaltonoise(lena,axis=None)
Out[3]:
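Both scipy.misc.lena and scipy.stats.signaltonoise were removed in later SciPy releases, so the two cells above may not run as written on a recent installation. As a minimal sketch, the same quantity (the mean of the array divided by its standard deviation) can be computed directly, assuming the lena array from the previous cell is available:
snr = lena.mean() / lena.std()   # ratio of mean to standard deviation over the whole array
print(snr)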
In [4]:
import numpy
from scipy.stats import pareto
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (15.0, 5.0)
x=numpy.linspace(1,10,1000)
plt.subplot(131); plt.plot(pareto.pdf(x,5))
plt.subplot(132); plt.plot(pareto.cdf(x,5))
plt.subplot(133); plt.plot(pareto.rvs(5,size=1000))
plt.show()
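In the cell above, plot receives only the y values, so the horizontal axis shows sample indices rather than x. A small variant (same computations, only the plotting calls change) passes x explicitly so that the density and distribution functions are drawn against the x values themselves:
plt.subplot(121); plt.plot(x, pareto.pdf(x,5))   # density against x
plt.subplot(122); plt.plot(x, pareto.cdf(x,5))   # cumulative distribution against x
plt.show()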
Please check the book for a full explanation of what the following code does and what its output represents.
In [5]:
import numpy
data = numpy.array([[113,105,130,101,138,118,87,116,75,96, \
122,103,116,107,118,103,111,104,111,89,78,100,89,85,88], \
[137,105,133,108,115,170,103,145,78,107, \
84,148,147,87,166,146,123,135,112,93,76,116,78,101,123]])
In [6]:
dataDiff = data[1,:]-data[0,:]
dataDiff.mean(), dataDiff.std()
Out[6]:
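As a sketch of what the upcoming one-sample t-test computes, the t statistic can be reproduced by hand from the mean of the differences and their sample standard deviation (note the ddof=1 here, unlike the population standard deviation printed above); this snippet is an illustration, not part of the book's listing:
n = len(dataDiff)
t_manual = dataDiff.mean() / (dataDiff.std(ddof=1) / numpy.sqrt(n))
print(t_manual)   # should agree with the t statistic returned by ttest_1samp below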
In [7]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (15.0, 5.0)
plt.hist(dataDiff)
plt.show()
In [8]:
from scipy.stats import ttest_1samp
t_stat,p_value=ttest_1samp(dataDiff,0.0)
In [9]:
print (p_value/2.0)
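Dividing the two-sided p-value returned by ttest_1samp by two gives the one-sided p-value. A minimal sketch of the corresponding decision rule, assuming the conventional 0.05 significance level (the alpha value is an assumption, not from the book):
alpha = 0.05   # assumed significance level
if t_stat > 0 and p_value/2.0 < alpha:
    print("reject the null hypothesis: the mean difference is positive")
else:
    print("fail to reject the null hypothesis")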
Please check the book for a full explanation of what the following code does and what its output represents.
In [10]:
from scipy.stats import norm # Gaussian distribution
mean,std=norm.fit(dataDiff)
In [11]:
plt.hist(dataDiff, normed=1)
x=numpy.linspace(dataDiff.min(),dataDiff.max(),1000)
pdf=norm.pdf(x,mean,std)
plt.plot(x,pdf)
Out[11]:
In [12]:
from scipy.stats import gaussian_kde
pdf = gaussian_kde(dataDiff)
pdf = pdf.evaluate(x)
plt.hist(dataDiff, normed=1)
plt.plot(x,pdf,'k')
plt.savefig("hist2.png")
plt.show()
In [13]:
plt.hist(dataDiff, normed=1)
plt.plot(x,pdf,'k.-',label='Kernel fit')
plt.plot(x,norm.pdf(x,mean,std),'r',label='Normal fit')
plt.legend()
plt.savefig("hist3.png")
plt.show()
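One informal way to compare the two fits above is a Kolmogorov-Smirnov test of the data against the fitted normal distribution; this check is a sketch that goes beyond the book's listing:
from scipy.stats import kstest
D, ks_pvalue = kstest(dataDiff, 'norm', args=(mean, std))   # test against the fitted normal
print(D, ks_pvalue)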
Please check the book for a full explanation of what the following code does and what its output represents.
In [14]:
import numpy
from scipy.spatial.distance import minkowski
Square=numpy.mgrid[-1.1:1.1:512j,-1.1:1.1:512j]
X=Square[0]; Y=Square[1]
f=lambda x,y,p: minkowski([x,y],[0.0,0.0],p)<=1.0
Ball=lambda p:numpy.vectorize(f)(X,Y,p)
In [16]:
import matplotlib.pylab as plt
plt.imshow(Ball(3), cmap = plt.cm.gray)
plt.axis('off')
plt.subplots_adjust(left=0.0127,bottom=0.0164,right=0.987,top=0.984)
plt.show() # IT WILL TAKE SOME TIME FOR THE PLOT TO SHOW UP
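As a quick check of the metric itself, the Minkowski distance with p=1 and p=2 reduces to the Manhattan (cityblock) and Euclidean distances, respectively; the values below are easy to verify by hand:
from scipy.spatial.distance import minkowski, cityblock, euclidean
print(minkowski([1.0,1.0], [0.0,0.0], 1), cityblock([1,1], [0,0]))    # both equal 2.0
print(minkowski([1.0,1.0], [0.0,0.0], 2), euclidean([1,1], [0,0]))    # both equal sqrt(2)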
In [17]:
import scipy.stats
from scipy.spatial.distance import pdist
V=scipy.stats.randint.rvs(0.4,3,size=(5,4))-1
print (V)
In [18]:
pdist(V,metric='cityblock')
Out[18]:
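pdist returns the pairwise distances in condensed form, that is, the upper triangle of the distance matrix flattened into a vector. A short sketch to expand it into the full symmetric distance matrix uses squareform:
from scipy.spatial.distance import pdist, squareform
D = squareform(pdist(V, metric='cityblock'))   # 5x5 symmetric matrix with zeros on the diagonal
print(D)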
In [19]:
from scipy.spatial.distance import cdist
V=scipy.stats.randint.rvs(0.4, 2, size=(3,4)).astype(bool)
W=scipy.stats.randint.rvs(0.4, 3, size=(2,4)).astype(bool)
cdist(V,W,'jaccard')
Out[19]:
In [20]:
from scipy.spatial import KDTree
data=scipy.stats.randint.rvs(0.4,10,size=(10,4))
print (data)
In [21]:
tree=KDTree(data)
tree.query([0,0,0,0])
Out[21]:
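query with a single point returns the distance to, and the index of, its nearest neighbor in the tree. Two common variants are requesting several neighbors and collecting all points within a given radius; the values k=3 and r=10.0 below are arbitrary choices for illustration:
distances, indices = tree.query([0,0,0,0], k=3)   # three nearest neighbors
print(distances, indices)
print(tree.query_ball_point([0,0,0,0], r=10.0))   # indices of points within radius 10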
Please check the book for a full explanation of what the following code does and what its output represents.
In [22]:
import numpy
from scipy.stats import norm
from numpy import array,vstack
data=norm.rvs(0,0.3,size=(10000,2))
inside_ball=numpy.hypot(data[:,0],data[:,1])<1.0
data=data[inside_ball]
data = vstack((data, data+array([1,1]),data+array([-1,1])))
In [23]:
from scipy.cluster.vq import *
centroids, distortion = kmeans(data,3)
cluster_assignment, distances = vq(data,centroids)
In [24]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8.0, 6.0)
plt.plot(data[cluster_assignment==0,0], data[cluster_assignment==0,1], 'ro')
plt.plot(data[cluster_assignment==1,0], data[cluster_assignment==1,1], 'b+')
plt.plot(data[cluster_assignment==2,0], data[cluster_assignment==2,1], 'k.')
plt.show()
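The SciPy documentation recommends whitening the observations (rescaling each feature to unit variance) before running k-means. A minimal sketch of that preprocessing step, not used in the run above:
from scipy.cluster.vq import whiten, kmeans, vq
whitened = whiten(data)                            # rescale each column to unit variance
centroids_w, distortion_w = kmeans(whitened, 3)
assignment_w, _ = vq(whitened, centroids_w)
print(numpy.bincount(assignment_w))                # size of each cluster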
Please check the book for a full explanation of what the following code does and what its output represents.
In [25]:
import numpy
file=open("dentitio.dat","r") # open the file
lines=file.readlines() # read each line in memory
file.close() # close the file
mammals=[] # this stores the names
dataset=numpy.zeros((len(lines),8)) # this stores the data
for index,line in enumerate(lines):
    mammals.append( line[0:27].rstrip(" ").capitalize() )
    for tooth in range(8):
        dataset[index,tooth]=int(line[27+tooth])
In [26]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10.0, 20.0)
from scipy.cluster.hierarchy import linkage, dendrogram
Z=linkage(dataset)
dendrogram(Z, labels=mammals, orientation="right")
plt.show()
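To go from the dendrogram to a flat partition, the linkage matrix can be cut into a fixed number of clusters with fcluster; in the sketch below the number of clusters (3) is an arbitrary choice:
from scipy.cluster.hierarchy import fcluster
flat_clusters = fcluster(Z, t=3, criterion='maxclust')   # assign each mammal to one of 3 clusters
for name, label in zip(mammals, flat_clusters):
    print(label, name)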