In [2]:
from numpy import cos,sin #避免使用
import numpy as np #np.method()
In [11]:
r1 = range(5)
r2 = np.arange(5)
r3 = xrange(5)
In [3]:
print r1,r2,r3
In [4]:
for i in r1:
print i,
print '\n'
for i in r2:
print i,
print '\n'
for i in r3:
print i,
In [5]:
print type(r1),type(r2),type(r3)
In [3]:
print np.arange(0,5),np.arange(5,0),np.arange(5,0,-1)
In [4]:
np.arange #Tab
Out[4]:
In [6]:
np.arange?
In [7]:
def vecadd(n):
a = range(n)
b = range(n)
c = []
for i in range(n):
a[i] = i*2
b[i] = i+3
c.append(a[i]+b[i])
return c
def np_vecadd(n):
a = np.arange(n)*2
b = np.arange(n)+3
c = a+b
return c
N = 10000000
为了比较性能,使用ipython的“魔法函数”timeit,或者datetime库的计时函数:
In [8]:
%timeit vecadd(N)
In [9]:
%timeit np_vecadd(N)
In [12]:
import datetime
t1 = datetime.datetime.now()
result = vecadd(N)
t2 = datetime.datetime.now()
t2-t1
Out[12]:
In [13]:
np.arange(5)+3 #ufunc or broadcasting
Out[13]:
In [14]:
np.arange(5)*2
Out[14]:
In [15]:
a,b,c = 0,2,11
print np.linspace(a,b,c),'\n'*2,np.arange(c)/float(c-1)*(b-a)+a
In [12]:
#使用np.ndarray类型的r2
print r2,type(r2),r2.dtype,r2.shape
In [13]:
#Immunity of Mutables
arr1 = np.array([np.arange(5),np.arange(5)])
arr2 = np.array([r2,r2])
arr3 = [r2,r2]
print arr1,'\n'*2,arr2,'\n'*2,arr3
r2[2] = 10
print arr1,'\n'*2,arr2,'\n'*2,arr3
r2[2] = 2
#不需要关心类型是否可变,在np.array创建过程中就已经建立独立拷贝
In [18]:
print arr1,arr1.shape
In [19]:
print arr1[0,0],arr1[1,2],arr1.dtype
In [20]:
print np.float64(arr1) #正确
In [21]:
arr1.dtype = np.float64 #错误
print arr1
arr1.dtype = np.int64
print arr1
In [22]:
# bool : 单个bit的布尔变量,仅存True或者False
# inti : 当前平台的整数类型,int32或int64
import platform
platform.architecture()
Out[22]:
In [24]:
i = np.float64(3)
print i
In [25]:
c = np.complex128(1+2j)
print c.real,c.imag
In [26]:
print i.dtype.itemsize,c.dtype.itemsize
In [27]:
sku = np.dtype([('sku_id',np.int32),('desc',np.str_,50),('value',np.float64)])
print sku
sku2 = np.dtype({'names':['sku_id','desc','value'],'formats':['<i4','S50','<f8']})
print sku2
In [28]:
online_shop = np.array([(1,'apple',2.3),(2.1,5,5),(3,'banana',True)],dtype=sku)
print online_shop
In [30]:
arr = np.arange(12)
print arr,arr.shape
In [31]:
print arr[3:5],arr[-5:-2],arr[3:28:5]
In [32]:
arr = arr.reshape(3,2,2)
print arr
In [33]:
arr.resize((3,4))
print arr,'\n',arr.ndim,'\n',arr.shape
In [34]:
print arr.T,'\n'*2,arr.transpose()
In [20]:
arrflat = arr.flat
In [21]:
print arrflat
In [22]:
print np.array([item for item in arrflat],dtype=np.int64)
In [23]:
print arr.flatten(order='C'),'\n'*2, arr.flatten(order='F'),'\n'*2,type(arr.flatten())
In [38]:
print arr.ravel(),'\n'*2,type(arr.ravel())
In [24]:
print np.vstack((arr,arr)),'\n'*2,np.row_stack((arr,arr)),'\n'*2, np.concatenate((arr,arr),axis=0)
In [25]:
print np.hstack((arr,arr)),'\n'*2,np.column_stack((arr,arr)),'\n'*2, np.concatenate((arr,arr),axis=1)
In [26]:
print np.dstack((arr,arr))
In [35]:
arr.resize((3,4,1))
In [36]:
print arr,'\n'*2,np.concatenate((arr,arr),axis=2)
In [37]:
print arr,'\n'*2,np.tile(arr,(1,1,3))
In [38]:
arr2 = np.arange(8).reshape(2,2,2)
print arr2
In [39]:
print np.vsplit(arr2,2), '\n'*2, np.split(arr2,2,axis=0)
In [40]:
print np.hsplit(arr2,2), '\n'*2, np.split(arr2,2,axis=1)
In [48]:
print np.dsplit(arr2,2), '\n'*2, np.split(arr2,2,axis=2)
In [41]:
arr_random = np.random.randint(1,6,15)
print arr_random
In [42]:
print arr_random.max(),arr_random.min(),arr_random.mean(),arr_random.var(),arr_random.std()
Ufunc类函数返回逐个计算值:
很多Ufunc即Universal Functional在Python中是以C语言级别实现的,效率高,建议使用。
In [43]:
print np.log(arr_random),'\n'*2,np.exp(arr_random),'\n'*2,arr_random>2,'\n'*2,np.mod(arr_random,2)
过滤值:类似SAS的Compress(其实是反过来)
In [52]:
arr_random.compress(arr_random>2)
Out[52]:
过滤值:类似R语法
In [44]:
arr_random[arr_random>2]
Out[44]:
类似R的语法中,用于过滤条件的np.array是可以进行布尔操作的:
In [45]:
print arr_random[(arr_random>1) & (arr_random<4)],
print arr_random[(arr_random>1) ^ (arr_random<4)]
Python的Clip,对值进行裁剪
In [46]:
arr_random.clip(2,4)
Out[46]:
可以方便的进行时间序列操作——差分
In [47]:
arr_random = np.random.rand(10)
print arr_random,'\n'*2,np.diff(arr_random)
差分其实是一种最简单的卷积:
In [49]:
w = np.array([1,-1])
len = 2
print arr_random,'\n'*2,np.convolve(w,arr_random),'\n'*2,np.convolve(w,arr_random)[len-1:-len+1]
差分的反向操作——累计加和(累积乘法类似):
In [50]:
diff = np.convolve(w,arr_random)[0:-1]
print diff
print diff.cumsum(),'\n'*2,diff.cumprod()
排序时间到了:给出排序结果,本地排序,并保留原数组下标位置
In [54]:
arr_random = np.random.rand(10)
In [55]:
print np.sort(arr_random)
In [56]:
print arr_random
In [58]:
arr_random.sort()
print arr_random
In [63]:
arr_random = np.random.rand(10)
使用enumerate和sorted排序:
关于更多匿名函数(Lambda)使用,请参考Python函数式编程,也就是Functional Programming相关内容。
In [64]:
s = sorted(enumerate(arr_random),key=lambda x:x[1],reverse=False)
print s
In [65]:
zip(*s)[0]
Out[65]:
In [66]:
print type(enumerate(arr_random))
for idx,value in enumerate(arr_random):
print idx,value
print zip(xrange(10),arr_random)
In [67]:
print np.lexsort((np.arange(10),arr_random)) #接受一个参数,传入元组
In [68]:
print np.argsort(arr_random) #相当于lexsort参数元组第一列给下标
In [63]:
arr2d_random = np.random.rand(10).reshape(5,2)
arr2d_zeros = np.zeros_like(arr2d_random)
arr2d_ones = np.ones_like(arr2d_random)
In [64]:
print arr2d_random,'\n'*2,arr2d_zeros,'\n'*2,arr2d_ones
直接填充常用值:
In [71]:
print np.zeros((5,2)), '\n'*2 , np.ones((5,2)),'\n'*2,np.eye(3)
善于利用函数者,可以使用函数,传入下标作为参数,生成二维数组:
”首相卡梅隆“函数:
In [72]:
def CameronThePrimeMinister(i,j):
return (i+1)*(j+1)
print np.fromfunction(CameronThePrimeMinister,(9,9))
习惯R者,使用broadcasting,生成二维数组,更具体的原理和应用在后续介绍:
9 * 1的二维数组和长度为9的一维数组broadcasting两次,互相补齐对方较短的维度:
In [59]:
x = np.arange(1,10)
print x.reshape(-1,1),'\n'*2,x,'\n'*2,x.reshape(-1,1) * x
更进一步缩减代码,生成正交栅格用于广播的方法,支持灵活的运算:
状如切片,功能如range/linspace,实数第三参数代表步长,虚数第三参数代表总长度
In [60]:
x,y = np.ogrid[0:1:6j,0:1:6j]
print x,y
print np.exp(-x**2-y**2)
习惯数据库的笛卡尔积者,可以用outer关键字进行某些运算,支持传入iterable更能节省空间和时间:
In [61]:
np.multiply.outer(xrange(1,10),xrange(1,10))
Out[61]:
二维数组和矩阵有一些不同:
矩阵可以方便的求逆、直接做乘法,二维数组的*只能作为np.multiply而不是np.dot:
In [65]:
cov = np.cov(arr2d_random.T)
print cov
In [66]:
stdiag = np.diag(np.sqrt(np.diag(cov)))
print stdiag
In [67]:
invstdiag = np.array(np.mat(stdiag).I)
print invstdiag
In [69]:
#错误
invstdiag*cov*invstdiag
Out[69]:
In [70]:
#正确
np.mat(stdiag).I*np.mat(cov)*np.mat(stdiag).I
Out[70]:
In [71]:
#验证
np.corrcoef(arr2d_random.T)
Out[71]:
In [72]:
x = np.random.rand(30).reshape(15,2)
y = x[:,0]*0.7 - x[:,1]*0.2 + 0.1+0.1*np.random.rand(15)
In [73]:
print y,'\n'*2,x
In [75]:
Xcvr = np.mat(np.hstack((np.ones(15).reshape(15,1),x)))
print Xcvr
给出模型参数的估计:$\hat{\beta} = (X^TX)^{-1}X^Ty $
In [76]:
H = Xcvr*(Xcvr.T*Xcvr).I*Xcvr.T
betahats = np.dot((Xcvr.T*Xcvr).I*Xcvr.T,y)
preds = np.dot(H,y)
print betahats,'\n'*2,preds
对照statsmodels的OLS结果:
In [77]:
import statsmodels.formula.api as sm
model = sm.OLS(y,Xcvr).fit()
model.summary()
Out[77]:
In [78]:
rand_relation = (np.random.rand(100)>0.7).reshape(10,10)
rand_relation.dtype=np.int8
In [88]:
rand_relation
Out[88]:
In [89]:
rand_relation = rand_relation-np.diag(np.diag(rand_relation))
In [79]:
rand_relation
Out[79]:
In [80]:
invrowsum = 1/(np.sum(rand_relation,axis=1)+1e-12)
print invrowsum
diag_invrowsum = np.diag(invrowsum)
In [81]:
tpm = np.dot(diag_invrowsum,rand_relation+1e-12/10)
tpm = (tpm>1e-12/10)*tpm
print tpm
In [82]:
import scipy.sparse as ssp
sp_tpm = ssp.csr_matrix(tpm)
In [83]:
sp_tpm
Out[83]:
In [95]:
for i in xrange(10):
sp_tpm = sp_tpm.dot(sp_tpm)
print repr(sp_tpm)
sp_tpm_a = sp_tpm.toarray()
print sp_tpm_a,'\n'*2,sp_tpm_a[0,:]
In [96]:
a = np.random.rand(10).reshape(5,2)
print a
np.save('a.npy',a)
a_copy = np.load('a.npy')
print a_copy
保存多个变量时,等号前面是变量的key,从结果r里面可以取到。
如果不使用key来保存a,那么通过“arr_0”,"arr_1"来取到。
In [97]:
b = range(5)
np.savez('a_and_b.npz',a,bdata=b)
r = np.load('a_and_b.npz')
print r["arr_0"],'\n'*2,r["bdata"]
最后,对txt,csv,tsv的读写:
csv意为comma separated values
tsv意为tab separated values
In [98]:
np.savetxt("a.txt",a,delimiter = '\t')
a_copy2 = np.loadtxt("a.txt",delimiter = '\t')
print a_copy2
In [ ]: