模块使用

1 scipy.io

读取矩阵数据



In [1]:

    
import numpy as np
from scipy import io as spio
a = np.ones((3,3))
spio.savemat('file.mat',{'a':a})
data = spio.loadmat('file.mat',struct_as_record=True)
data['a']









    Out[1]:





array([[ 1.,  1.,  1.],
       [ 1.,  1.,  1.],
       [ 1.,  1.,  1.]])

读取图像



In [2]:

    
from scipy import misc
misc.imread('fname.png')









    Out[2]:





array([[[255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        ..., 
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255]],

       [[255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        ..., 
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255]],

       [[255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        ..., 
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255]],

       ..., 
       [[255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        ..., 
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255]],

       [[255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        ..., 
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255]],

       [[255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        ..., 
        [255, 255, 255, 255],
        [255, 255, 255, 255],
        [255, 255, 255, 255]]], dtype=uint8)



In [7]:

    
import matplotlib.pyplot as plt
plt.imread('fname.png')









    Out[7]:





array([[[ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        ..., 
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.]],

       [[ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        ..., 
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.]],

       [[ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        ..., 
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.]],

       ..., 
       [[ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        ..., 
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.]],

       [[ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        ..., 
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.]],

       [[ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        ..., 
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.],
        [ 1.,  1.,  1.,  1.]]], dtype=float32)

文本文件
numpy.loadtxt() / numpy.savetxt()
txt/csv文件
numpy.genfromtxt()/numpy.recfromcsv()
二进制文件
numpy.load() / numpy.save()

2 scipy.linalg

计算行列式



In [8]:

    
from scipy import linalg
arr= np.array([[1,2],
              [3,4]])
linalg.det(arr)









    Out[8]:





-2.0



In [9]:

    
arr = np.array([[3,2],
               [6,4]])
linalg.det(arr)









    Out[9]:





6.661338147750939e-16



In [10]:

    
linalg.det(np.ones(3,4))









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-10-b81c09915c43> in <module>()
----> 1 linalg.det(np.ones(3,4))

/Users/gaufung/Anaconda/anaconda/lib/python2.7/site-packages/numpy/core/numeric.pyc in ones(shape, dtype, order)
    188 
    189     """
--> 190     a = empty(shape, dtype, order)
    191     multiarray.copyto(a, 1, casting='unsafe')
    192     return a

TypeError: data type not understood

计算逆矩阵



In [12]:

    
arr = np.array([[1,2],[3,4]])
iarr = linalg.inv(arr)
iarr









    Out[12]:





array([[-2. ,  1. ],
       [ 1.5, -0.5]])



In [14]:

    
# 验证
np.allclose(np.dot(arr,iarr),np.eye(2))









    Out[14]:





True



In [15]:

    
# 奇异矩阵求逆抛出异常
arr = np.array([[3,2],[6,4]])
linalg.inv(arr)









    



---------------------------------------------------------------------------
LinAlgError                               Traceback (most recent call last)
<ipython-input-15-260bf2b0df41> in <module>()
      1 # 奇异矩阵求逆抛出异常
      2 arr = np.array([[3,2],[6,4]])
----> 3 linalg.inv(arr)

/Users/gaufung/Anaconda/anaconda/lib/python2.7/site-packages/scipy/linalg/basic.pyc in inv(a, overwrite_a, check_finite)
    685         inv_a, info = getri(lu, piv, lwork=lwork, overwrite_lu=1)
    686     if info > 0:
--> 687         raise LinAlgError("singular matrix")
    688     if info < 0:
    689         raise ValueError('illegal value in %d-th argument of internal '

LinAlgError: singular matrix

奇异值分解



In [17]:

    
arr = np.arange(9).reshape((3,3)) + np.diag([1,0,1])
uarr,spec,vharr = linalg.svd(arr)
spec









    Out[17]:





array([ 14.88982544,   0.45294236,   0.29654967])



In [18]:

    
sarr =  np.diag(spec)
svd_mat = uarr.dot(sarr).dot(vharr)
np.allclose(svd_mat,arr)









    Out[18]:





True

SVD常用于统计和信号处理领域。其他的一些标准分解方法(QR, LU, Cholesky, Schur) 在 scipy.linalg 中也能够找到。

3 优化



In [19]:

    
from scipy import optimize
def f(x):
    return x**2 + 10*np.sin(x)
x = np.arange(-10,10,0.1)
plt.plot(x,f(x))
plt.show()

此函数有一个全局最小值，约为-1.3，含有一个局部最小值，约为3.8.
在寻找最小值的过程中，确定初始值，用梯度下降的方法，bfgs是一个很好的方法。



In [20]:

    
optimize.fmin_bfgs(f,0)









    



Optimization terminated successfully.
         Current function value: -7.945823
         Iterations: 5
         Function evaluations: 18
         Gradient evaluations: 6






    Out[20]:





array([-1.30644012])



In [21]:

    
# 但是方法的缺陷是陷入局部最优解
optimize.fmin_bfgs(f,5)









    



Optimization terminated successfully.
         Current function value: 8.315586
         Iterations: 5
         Function evaluations: 18
         Gradient evaluations: 6






    Out[21]:





array([ 3.83746711])

可以在一个区间中找到一个最小值



In [23]:

    
xmin_local = optimize.fminbound(f,0,10)
xmin_local









    Out[23]:





3.8374671194983834

寻找函数的零点



In [24]:

    
# guess 1 
root = optimize.fsolve(f,1)
root









    Out[24]:





array([ 0.])



In [25]:

    
# guess -2.5
root = optimize.fsolve(f,-2.5)
root









    Out[25]:





array([-2.47948183])

曲线拟合

从函数f中采样得到一些含有噪声的数据



In [26]:

    
xdata = np.linspace(-10,10,num=20)
ydata = f(xdata)+np.random.randn(xdata.size)

我们已经知道函数的形式$x^2+\sin(x)$,但是每一项的系数不清楚，因此进行拟合处理



In [33]:

    
def f2(x,a,b):
    return a*x**2 + b*np.sin(x)
guess=[3,2]
params,params_covariance = optimize.curve_fit(f2, xdata, ydata, guess)
params









    Out[33]:





array([  1.00123877,  10.36873314])

绘制结果



In [41]:

    
x = np.arange(-10,10,0.1)
def f(x):
    return x**2 + 10 * np.sin(x)
grid = (-10,10,0.1)
xmin_global = optimize.brute(f,(grid,))
xmin_local = optimize.fminbound(f,0,10)
root = optimize.fsolve(f,1)
root2 = optimize.fsolve(f,-2.5)
xdata = np.linspace(-10,10,num=20)
np.random.seed(1234)
ydata = f(xdata)+np.random.randn(xdata.size)
def f2(x,a,b):
    return a*x**2 + b * np.sin(x)
guess=[2,2]
params,_ =optimize.curve_fit(f2,xdata,ydata,guess)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(x,f(x),'b-',label='f(x)')
ax.plot(x,f2(x,*params),'r--',label='Curve fit result')
xmins = np.array([xmin_global[0],xmin_local])
ax.plot(xmins,f(xmins),'go',label='minize')
roots = np.array([root,root2])
ax.plot(roots,f(roots),'kv',label='Roots')
ax.legend()
ax.set_xlabel('x')
ax.set_ylabel('f(x)')
plt.show()

4 统计

直方图和概率密度统计



In [42]:

    
a = np.random.normal(size=1000)
bins = np.arange(-4,5)
bins









    Out[42]:





array([-4, -3, -2, -1,  0,  1,  2,  3,  4])



In [44]:

    
histogram = np.histogram(a,bins=bins,normed=True)[0]
bins = 0.5*(bins[1:]+bins[:-1])
bins









    Out[44]:





array([-3.5, -2.5, -1.5, -0.5,  0.5,  1.5,  2.5,  3.5])



In [46]:

    
from scipy import stats
b =stats.norm.pdf(bins)
plt.plot(bins,histogram)
plt.plot(bins,b)
plt.show()

百分位数

百分位是累计概率分布函数的一个估计



In [47]:

    
np.median(a)









    Out[47]:





0.0070206524754117226



In [48]:

    
stats.scoreatpercentile(a,50)









    Out[48]:





0.0070206524754117226



In [49]:

    
stats.scoreatpercentile(a,90)









    Out[49]:





1.3003779670735938

统计检验

统计检验的结果常用作一个决策指标。例如，如果我们有两组观察点，它们都来自高斯过程，我们可以使用 T-检验来判断两组观察点是都显著不同:



In [50]:

    
a = np.random.normal(0,1,size=100)
b = np.random.normal(0,1,size=10)
stats.ttest_ind(a,b)









    Out[50]:





Ttest_indResult(statistic=-1.1408989974867128, pvalue=0.25643553551908738)

返回结果分成连个部分

T检验统计量
使用检验的统计量的值
P值
如果结果接近1，表明符合预期，接近0表明不符合预期。

5 插值



In [62]:

    
measured_time = np.linspace(0,1,10)
noise = (np.random.random(10)*2-1) *1e-1
measure = np.sin(2*np.pi*measured_time) + noise
from scipy.interpolate import interp1d
linear_interp = interp1d(measured_time, measure)
computed_time = np.linspace(0, 1, 50)
linear_results = linear_interp(computed_time)
cublic_interp = interp1d(measured_time, measure, kind='cubic')
cublic_results = cublic_interp(computed_time)
plt.plot(measured_time,measure,'o',label='points')
plt.plot(computed_time,linear_results,'r-',label='linear interp')
plt.plot(computed_time,cublic_results,'y-',label='cublic interp')
plt.legend()
3 plt.show()

练习

温度曲线拟合

阿拉斯加每个月温度的最大值和最小值数据见下表：

最小值	最大值	最小值	最大值
-62	17	-9	37
-59	19	-13	37
-56	21	-25	31
-46	28	-46	23
-32	33	-52	19
-18	38	-48	18

要求

绘制温度图像
拟合出一条函数曲线
使用scipy.optimize.curvie_fit()来拟合函数
画出函数图像。
判断最大值和最小值的偏置是否合理



In [25]:

    
import numpy as np
import matplotlib.pyplot as plt
months = np.arange(1,13)
mins = [-62,-59,-56,-46,-32,-18,-9,-13,-25,-46,-52,-48]
maxes = [17,19,21,28,33,38,37,37,31,23,19,18]
fig,ax = plt.subplots()
plt.plot(months,mins,'b-',label='min')
plt.plot(months,maxes,'r-',label='max')
plt.ylim(-80,80)
plt.xlim(0.5,12.5)
plt.xlabel('month')
plt.ylabel('temperature')
plt.xticks([1,2,3,4,5,6,7,8,9,10,11,12],
           ['Jan.','Feb.','Mar.','Apr.','May.','Jun.','Jul.','Aug.','Sep,','Oct.','Nov.','Dec.'])
plt.legend()
plt.title('Alaska temperature')
plt.show()

从图像上来看，温度的最高值和最低值都符合二次函数的特点，$y = at^2+bt+c$,其中$c$为时间$t$的偏置。



In [41]:

    
from scipy import optimize
def f(t,a,b,c):
    return a * t**2+b*t+c
guess = [-1,8,50]
params_min,_ = optimize.curve_fit(f,months,mins,guess)
params_max,_ = optimize.curve_fit(f,months,maxes,guess)
times = np.linspace(1,12,30)
plt.plot(times,f(times,*params_min),'b--',label='min_fit')
plt.plot(times,f(times,*params_max),'r--',label='max_fit')
plt.plot(months,mins,'bo',label='min')
plt.plot(months,maxes,'ro',label='max')
plt.ylim(-80,80)
plt.xlim(0.5,12.5)
plt.xlabel('month')
plt.ylabel('temperature')
plt.xticks([1,2,3,4,5,6,7,8,9,10,11,12],
           ['Jan.','Feb.','Mar.','Apr.','May.','Jun.','Jul.','Aug.','Sep,','Oct.','Nov.','Dec.'])
plt.title('Alaska temperature')
plt.show()

温度最高值拟合效果较好，但温度最低值拟合效果不太好

求解最小值

驼峰函数 $$f(x,y)=(4-2.1x^2+\frac{x^4}{3})x^2+xy+(4y^2-4)y^2$$

限制变量范围: $-2<x<2,-1<y<1$
使用 numpy.meshgrid() 和 pylab.imshow() 目测最小值所在区域
使用 scipy.optimize.fmin_bfgs() 或者其他的用于可以求解多维函数最小值的算法



In [44]:

    
import numpy as np
from scipy import optimize
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


def sixhump(x):
    return (4 - 2.1*x[0]**2 + x[0]**4 / 3.) * x[0]**2 + x[0] * x[1] + (-4 + 4*x[1]**2) * x[1] **2

x = np.linspace(-2, 2)
y = np.linspace(-1, 1)
xg, yg = np.meshgrid(x, y)

#plt.figure()  # simple visualization for use in tutorial
#plt.imshow(sixhump([xg, yg]))
#plt.colorbar()

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
surf = ax.plot_surface(xg, yg, sixhump([xg, yg]), rstride=1, cstride=1,
                       cmap=plt.cm.jet, linewidth=0, antialiased=False)

ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('f(x, y)')
ax.set_title('Six-hump Camelback function')
plt.show()



In [53]:

    
min1 = optimize.fmin_bfgs(sixhump,[0,-0.5])
min2 = optimize.fmin_bfgs(sixhump,[0,0.5])
min3 = optimize.fmin_bfgs(sixhump,[-1.4,1.0])









    



Optimization terminated successfully.
         Current function value: -1.031628
         Iterations: 6
         Function evaluations: 36
         Gradient evaluations: 9
Optimization terminated successfully.
         Current function value: -1.031628
         Iterations: 6
         Function evaluations: 36
         Gradient evaluations: 9
Optimization terminated successfully.
         Current function value: -0.215464
         Iterations: 8
         Function evaluations: 48
         Gradient evaluations: 12



In [55]:

    
local1 = sixhump(min1)
local2 = sixhump(min2)
local3 = sixhump(min3)
print local1,local2,local3









    



-1.03162845349 -1.03162845349 -0.215463824384