Numpy

从表格数据创建数组
Numpy提供了genfromtxt函数,可以从表格数据中创建数组;数据存放到Numpy数组之后,后续的数据处理就轻松得多。


In [1]:
# Use StringIO as an in-memory stand-in for a text file
import numpy as np
try:
    # Python 2: this StringIO accepts plain (byte) str literals
    from StringIO import StringIO
except ImportError:
    # Python 3: the StringIO module was removed; io.StringIO replaces it
    from io import StringIO
# Three comma-separated rows of integers, as they would appear in a CSV file
in_data = StringIO("10,20,30\n45,65,23\n33,54,62")

In [2]:
# Read the comma-separated text with genfromtxt into an integer NumPy array
data = np.genfromtxt(
    in_data,
    dtype=int,
    delimiter=",",
)
data


Out[2]:
array([[10, 20, 30],
       [45, 65, 23],
       [33, 54, 62]])

In [3]:
# Drop the columns we do not need: usecols keeps only columns 0 and 1
in_data = StringIO("10,20,30\n45,65,23\n33,54,62")
data = np.genfromtxt(
    in_data,
    dtype=int,
    delimiter=",",
    usecols=(0, 1),
)
data


Out[3]:
array([[10, 20],
       [45, 65],
       [33, 54]])

In [4]:
# Give the columns explicit names; the result is a structured array
in_data = StringIO("10,20,30\n45,65,23\n33,54,62")
data = np.genfromtxt(
    in_data,
    dtype=int,
    delimiter=',',
    names="a,b,c",
)
data


Out[4]:
array([(10, 20, 30), (45, 65, 23), (33, 54, 62)], 
      dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<i4')])

也可以将第一行作为列名


In [5]:
# names=True takes the column names from the first line of the input
in_data = StringIO("a,b,c\n10,20,30\n45,65,23\n33,54,62")
data = np.genfromtxt(
    in_data,
    dtype=int,
    delimiter=',',
    names=True,
)
data


Out[5]:
array([(10, 20, 30), (45, 65, 23), (33, 54, 62)], 
      dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<i4')])

对列进行预处理


In [6]:
# First, look at the result when no preprocessing is applied.
# The raw rows are:
#   30kg,inr2000,31.13,56.45,1
#   45kg,inr3000,34.34,346.2,2

in_data = StringIO('30kg,inr2000,31.13,56.45,1\n45kg,inr3000,34.34,346.2,2')
# With the default float dtype, the '30kg' / 'inr2000' fields come back as nan
data = np.genfromtxt(
    in_data,
    delimiter=',',
)
data


Out[6]:
array([[    nan,     nan,   31.13,   56.45,    1.  ],
       [    nan,     nan,   34.34,  346.2 ,    2.  ]])

可以看到输出结果中有nan出现的情况,这不是我们想要的结果

因此处理这样的数据时,我们需要进行预处理


In [7]:
import numpy as np
try:
    # Python 2: this StringIO accepts plain (byte) str literals
    from StringIO import StringIO
except ImportError:
    # Python 3: the StringIO module was removed; io.StringIO replaces it
    from io import StringIO

# Sample data set: column 0 has a 'kg' suffix, column 1 an 'inr' prefix
in_data = StringIO('30kg,inr2000,31.13,56.45,1\n45kg,inr3000,34.34,346.2,2')


def _as_text(field):
    """Return *field* as text; genfromtxt may pass converters bytes or str."""
    if isinstance(field, bytes):
        return field.decode('latin1')
    return field


def strip_func_1(x):
    """Convert a weight field such as '30kg' to a float."""
    text = _as_text(x).strip()
    # Remove the exact 'kg' suffix; rstrip('kg') would instead strip any
    # run of trailing 'k'/'g' characters.
    if text.endswith('kg'):
        text = text[:-len('kg')]
    return float(text)


def strip_func_2(x):
    """Convert a price field such as 'inr2000' to a float."""
    text = _as_text(x).strip()
    # Remove the exact 'inr' prefix; lstrip('inr') would instead strip any
    # run of leading 'i'/'n'/'r' characters.
    if text.startswith('inr'):
        text = text[len('inr'):]
    return float(text)


# Map each column index to the converter that preprocesses that column
convert_funcs = {0: strip_func_1, 1: strip_func_2}

# Hand the converter dictionary to genfromtxt
data = np.genfromtxt(in_data, delimiter=',', converters=convert_funcs)
data


Out[7]:
array([[  3.00000000e+01,   2.00000000e+03,   3.11300000e+01,
          5.64500000e+01,   1.00000000e+00],
       [  4.50000000e+01,   3.00000000e+03,   3.43400000e+01,
          3.46200000e+02,   2.00000000e+00]])

当数据中有缺失值时


In [8]:
# The middle field of the second row is empty (a missing value)
in_data = StringIO('10,20,30\n23,,34\n36,31,76')


def miss_func(x):
    """Convert a field to float, substituting -1 when the field is empty."""
    stripped = x.strip()
    return float(stripped or -1)


# Apply the converter to column 1 only
data = np.genfromtxt(in_data, delimiter=',', converters={1: miss_func})
data


Out[8]:
array([[ 10.,  20.,  30.],
       [ 23.,  -1.,  34.],
       [ 36.,  31.,  76.]])

Numpy 是一个用来高效地处理数组和矩阵的Python库。
大多数Python科学计算库都在内部使用Numpy来处理数组和矩阵运算。


In [1]:
import numpy as np
# Create an array from a Python list
a_list = [1, 2, 3]
an_array = np.array(a_list)
# Single-argument print(...) gives the same output on Python 2 and Python 3
print(an_array)
# Create the array again, this time with an explicit element type
an_array = np.array(a_list, dtype=float)
print(an_array)


[1 2 3]
[ 1.  2.  3.]

In [2]:
# Create a matrix from a list of row lists
a_listoflist = [[1, 2, 3], [5, 6, 7], [8, 9, 10]]
a_matrix = np.matrix(a_listoflist, dtype=float)
# Single-argument print(...) gives the same output on Python 2 and Python 3
print(a_matrix)


[[  1.   2.   3.]
 [  5.   6.   7.]
 [  8.   9.  10.]]

In [3]:
# Total number of elements in the matrix
# (print(...) with one argument works on both Python 2 and Python 3)
print(a_matrix.size)


9

In [4]:
# Number of dimensions of the matrix
# (print(...) with one argument works on both Python 2 and Python 3)
print(a_matrix.ndim)


2

In [6]:
# Rows and columns of the matrix, as a (rows, cols) tuple
# (print(...) with one argument works on both Python 2 and Python 3)
print(a_matrix.shape)


(3, 3)

In [9]:
def display_shape(a):
    """Print an array followed by its size, dimension count and shape.

    Args:
        a: An object exposing ``size``, ``ndim`` and ``shape`` attributes,
            such as a NumPy array or matrix.
    """
    # Every call passes print() a single pre-formatted argument, so the
    # output is identical whether print is a statement (Python 2) or a
    # function (Python 3).
    print(a)
    print('Number of elements in a=%d' % (a.size))
    print('Number of dimensions in a = %d' % (a.ndim))
    # Wrap the shape in a 1-tuple so that %s formats the shape tuple itself
    print('Rows and cols in a %s' % (a.shape,))

numpy创建数组的方式


In [10]:
# Build the array with np.arange; the stop value (10) is not included
created_array = np.arange(1, 10, dtype=float)
display_shape(created_array)


[ 1.  2.  3.  4.  5.  6.  7.  8.  9.]
Number of elements in a=9
Number of dimensions in a = 1
Rows and cols in a (9,)

In [11]:
# Build the array with np.linspace: the number of samples in the range is
# chosen up front. The default of 50 samples is spelled out here.
created_array = np.linspace(1, 10, num=50)
display_shape(created_array)


[  1.           1.18367347   1.36734694   1.55102041   1.73469388
   1.91836735   2.10204082   2.28571429   2.46938776   2.65306122
   2.83673469   3.02040816   3.20408163   3.3877551    3.57142857
   3.75510204   3.93877551   4.12244898   4.30612245   4.48979592
   4.67346939   4.85714286   5.04081633   5.2244898    5.40816327
   5.59183673   5.7755102    5.95918367   6.14285714   6.32653061
   6.51020408   6.69387755   6.87755102   7.06122449   7.24489796
   7.42857143   7.6122449    7.79591837   7.97959184   8.16326531
   8.34693878   8.53061224   8.71428571   8.89795918   9.08163265
   9.26530612   9.44897959   9.63265306   9.81632653  10.        ]
Number of elements in a=50
Number of dimensions in a = 1
Rows and cols in a (50,)

In [12]:
# Build the array with np.logspace: 50 samples (the default, spelled out
# here) spaced evenly on a log scale from 10**1 to 10**10
created_array = np.logspace(1, 10, num=50, base=10.0)
display_shape(created_array)


[  1.00000000e+01   1.52641797e+01   2.32995181e+01   3.55648031e+01
   5.42867544e+01   8.28642773e+01   1.26485522e+02   1.93069773e+02
   2.94705170e+02   4.49843267e+02   6.86648845e+02   1.04811313e+03
   1.59985872e+03   2.44205309e+03   3.72759372e+03   5.68986603e+03
   8.68511374e+03   1.32571137e+04   2.02358965e+04   3.08884360e+04
   4.71486636e+04   7.19685673e+04   1.09854114e+05   1.67683294e+05
   2.55954792e+05   3.90693994e+05   5.96362332e+05   9.10298178e+05
   1.38949549e+06   2.12095089e+06   3.23745754e+06   4.94171336e+06
   7.54312006e+06   1.15139540e+07   1.75751062e+07   2.68269580e+07
   4.09491506e+07   6.25055193e+07   9.54095476e+07   1.45634848e+08
   2.22299648e+08   3.39322177e+08   5.17947468e+08   7.90604321e+08
   1.20679264e+09   1.84206997e+09   2.81176870e+09   4.29193426e+09
   6.55128557e+09   1.00000000e+10]
Number of elements in a=50
Number of dimensions in a = 1
Rows and cols in a (50,)

In [13]:
# np.arange with an explicit step of 2
created_array = np.arange(1, 10, 2, dtype=int)
display_shape(created_array)


[1 3 5 7 9]
Number of elements in a=5
Number of dimensions in a = 1
Rows and cols in a (5,)

In [14]:
# Create a special 3x3 matrix in which every element is 1
ones_matrix = np.ones(shape=(3, 3))
display_shape(ones_matrix)


[[ 1.  1.  1.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
Number of elements in a=9
Number of dimensions in a = 2
Rows and cols in a (3, 3)

In [15]:
# Create a 3x3 matrix in which every element is 0
zeros_matrix = np.zeros(shape=(3, 3))
display_shape(zeros_matrix)


[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
Number of elements in a=9
Number of dimensions in a = 2
Rows and cols in a (3, 3)

In [16]:
# Identity matrix: ones on the main diagonal (k=0), zeros everywhere else
identity_matrix = np.eye(
    N=3,
    M=3,
    k=0,
)
display_shape(identity_matrix)


[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]]
Number of elements in a=9
Number of dimensions in a = 2
Rows and cols in a (3, 3)

参数k控制值为1的对角线的位置:k=0表示主对角线,k为正数时对角线向上偏移,k为负数时对角线向下偏移


In [17]:
# k=1 places the ones on the first diagonal above the main diagonal
identity_matrix = np.eye(
    N=3,
    M=3,
    k=1,
)
display_shape(identity_matrix)


[[ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 0.  0.  0.]]
Number of elements in a=9
Number of dimensions in a = 2
Rows and cols in a (3, 3)

reshape函数可以控制数组的形态


In [18]:
# Lay the nine values 0..8 out as a 3x3 array
a_matrix = np.reshape(np.arange(9), (3, 3))
display_shape(a_matrix)


[[0 1 2]
 [3 4 5]
 [6 7 8]]
Number of elements in a=9
Number of dimensions in a = 2
Rows and cols in a (3, 3)

ravel和flatten函数可以将矩阵转化为一维数组

随机数


In [19]:
# Draw 10 random integers; randint's upper bound (100) is exclusive, so the
# values fall in the range 1..99
general_random_numbers = np.random.randint(1, 100, size=10)
# Single-argument print(...) gives the same output on Python 2 and Python 3
print(general_random_numbers)


[43 37 68 28 70 43 44 13 17 95]

In [20]:
# Draw 10 samples with np.random.normal (mean loc=0.2, std. dev. scale=0.2).
# NOTE: despite the variable's name, these samples come from a normal
# distribution, not a uniform one.
uniform_rnd_numbers = np.random.normal(loc=0.2, scale=0.2, size=10)
# Single-argument print(...) gives the same output on Python 2 and Python 3
print(uniform_rnd_numbers)


[ 0.31560127  0.38196501 -0.09393321  0.39450662  0.36126174 -0.02476259
  0.13728167  0.19001759  0.24057992 -0.27137875]

In [21]:
# A tuple passed as size yields a 3x3 array of normally distributed samples
uniform_rnd_numbers = np.random.normal(loc=0.2, scale=0.2, size=(3, 3))
# Single-argument print(...) gives the same output on Python 2 and Python 3
print(uniform_rnd_numbers)


[[ 0.33783552  0.21471375  0.60085734]
 [ 0.10925539 -0.0676383   0.09488882]
 [ 0.02735518  0.16188257  0.09601844]]