In [1]:
# Build an in-memory text stream that stands in for a CSV file.
import numpy as np

# ``StringIO.StringIO`` only exists on Python 2; fall back to ``io.StringIO``
# so this cell also runs on Python 3.
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

in_data = StringIO("10,20,30\n45,65,23\n33,54,62")
In [2]:
# Read the data with NumPy's genfromtxt and build a NumPy array from it.
# Rewind the stream first: reading it leaves the position at the end, so
# without this a second run of the cell would find nothing left to parse.
in_data.seek(0)
data = np.genfromtxt(in_data, dtype=int, delimiter=",")
data
Out[2]:
In [3]:
# Drop the columns we do not need: ``usecols`` keeps only the listed ones.
in_data = StringIO("10,20,30\n45,65,23\n33,54,62")
wanted_columns = (0, 1)
data = np.genfromtxt(
    in_data,
    delimiter=",",
    dtype=int,
    usecols=wanted_columns,
)
data
Out[3]:
In [4]:
# Give the columns explicit names; the result is a structured array whose
# fields can be accessed by name.
in_data = StringIO("10,20,30\n45,65,23\n33,54,62")
data = np.genfromtxt(
    in_data,
    names="a,b,c",
    delimiter=',',
    dtype=int,
)
data
Out[4]:
也可以将第一行作为列名
In [5]:
# With ``names=True`` the first line of the input supplies the column names.
in_data = StringIO("a,b,c\n10,20,30\n45,65,23\n33,54,62")
data = np.genfromtxt(
    in_data,
    names=True,
    delimiter=',',
    dtype=int,
)
data
Out[5]:
In [6]:
# First, look at the result when no preprocessing is applied.
# The raw rows are:
#   30kg,inr2000,31.13,56.45,1
#   45kg,inr3000,34.34,346.2,2
# The first two fields carry unit text, so they cannot be parsed as floats.
in_data = StringIO('30kg,inr2000,31.13,56.45,1\n45kg,inr3000,34.34,346.2,2')
data = np.genfromtxt(
    in_data,
    delimiter=',',
)
data
Out[6]:
可以看到输出结果中有nan出现的情况,这不是我们想要的结果
因此处理这样的数据时,我们需要进行预处理
In [7]:
import numpy as np

# ``StringIO.StringIO`` only exists on Python 2; fall back to ``io.StringIO``
# so this cell also runs on Python 3.
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO


def _as_text(field):
    """Return ``field`` as text, decoding it first when it is a byte string.

    Depending on the Python/NumPy version, genfromtxt hands converters either
    bytes or str; normalising here lets the converters below strip with plain
    string arguments in both cases.
    """
    if isinstance(field, bytes):
        return field.decode('ascii')
    return field


def strip_func_1(x):
    """Convert a weight field such as ``'30kg'`` to a float."""
    # rstrip removes any trailing run of the characters 'k'/'g' (not a literal
    # suffix); that is sufficient because the rest of the field is numeric.
    return float(_as_text(x).rstrip('kg'))


def strip_func_2(x):
    """Convert a price field such as ``'inr2000'`` to a float."""
    # lstrip removes any leading run of the characters 'i'/'n'/'r'.
    return float(_as_text(x).lstrip('inr'))


# Define a data set whose first two columns carry unit text.
in_data = StringIO('30kg,inr2000,31.13,56.45,1\n45kg,inr3000,34.34,346.2,2')

# Map column index -> preprocessing function for that column.
convert_funcs = {0: strip_func_1, 1: strip_func_2}

# Hand the converters to genfromtxt so the fields are cleaned while parsing.
data = np.genfromtxt(in_data, delimiter=',', converters=convert_funcs)
data
Out[7]:
当数据中有缺失值时
In [8]:
# The middle field of the second row is empty.
in_data = StringIO('10,20,30\n23,,34\n36,31,76')


def miss_func(x):
    """Convert a field to float, substituting -1 when it is blank."""
    stripped = x.strip()
    if not stripped:
        return float(-1)
    return float(stripped)


# Apply the converter to column 1 only.
data = np.genfromtxt(in_data, delimiter=',', converters={1: miss_func})
data
Out[8]:
NumPy 是一个用于高效处理数组和矩阵的 Python 库。
大多数 Python 科学计算库都在内部使用 NumPy 来处理数组和矩阵运算。
In [1]:
import numpy as np

# Create an array from a Python list; the dtype is inferred from the values.
a_list = [1, 2, 3]
an_array = np.array(a_list)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(an_array)

# Create the array again, this time with an explicit data type.
an_array = np.array(a_list, dtype=float)
print(an_array)
In [2]:
# Create a matrix from a list of lists (one inner list per row).
a_listoflist = [[1, 2, 3], [5, 6, 7], [8, 9, 10]]
a_matrix = np.matrix(a_listoflist, dtype=float)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(a_matrix)
In [3]:
# Total number of elements in the matrix.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(a_matrix.size)
In [4]:
# Number of dimensions of the matrix.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(a_matrix.ndim)
In [6]:
# Shape of the matrix as a (rows, columns) tuple.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(a_matrix.shape)
In [9]:
def display_shape(a):
    """Print an array followed by its element count, rank and shape.

    Args:
        a: Any object exposing ``size``, ``ndim`` and ``shape`` attributes
            (for example a NumPy array or matrix).

    Returns:
        None. The information is written to standard output.
    """
    # Every call passes a single argument to print(...), which produces the
    # same output on Python 2 and Python 3.
    print(a)
    print('Number of elements in a=%d' % a.size)
    print('Number of dimensions in a = %d' % a.ndim)
    # Wrap the shape in a one-element tuple so a tuple-valued shape is
    # formatted as a whole instead of being unpacked by the % operator.
    print('Rows and cols in a %s' % (a.shape,))
In [10]:
# Build an array with np.arange: floats starting at 1 and stopping before 10.
created_array = np.arange(1, 10, dtype=float)
display_shape(created_array)
In [11]:
# Build an array with np.linspace.
# With linspace we state how many samples we want inside the given range;
# 50 is the default, spelled out here for clarity.
created_array = np.linspace(1, 10, num=50)
display_shape(created_array)
In [12]:
# Build an array with np.logspace: 50 samples (the default, spelled out here)
# spaced evenly on a log scale from 10**1 to 10**10.
created_array = np.logspace(1, 10, num=50, base=10.0)
display_shape(created_array)
In [13]:
# np.arange with an explicit step: integers from 1, in steps of 2, below 10.
created_array = np.arange(1, 10, 2, dtype=int)
display_shape(created_array)
In [14]:
# Create a special 3x3 matrix in which every element is 1.
ones_matrix = np.ones(shape=(3, 3))
display_shape(ones_matrix)
In [15]:
# Create a 3x3 matrix in which every element is 0.
zeros_matrix = np.zeros(shape=(3, 3))
display_shape(zeros_matrix)
In [16]:
# Identity matrix: 3x3 with ones on the main diagonal (k=0), zeros elsewhere.
identity_matrix = np.eye(3, M=3, k=0)
display_shape(identity_matrix)
参数 k 指定值为 1 的对角线相对于主对角线的偏移量:k=0 表示主对角线,k>0 表示其上方的对角线,k<0 表示其下方的对角线。
In [17]:
# k=1 places the ones on the first diagonal above the main diagonal.
identity_matrix = np.eye(3, M=3, k=1)
display_shape(identity_matrix)
reshape函数可以控制数组的形态
In [18]:
# Lay the integers 0..8 out as a 3x3 array.
a_matrix = np.arange(9).reshape((3, 3))
display_shape(a_matrix)
ravel 和 flatten 函数可以将矩阵转化为一维数组
In [19]:
# Draw 10 random integers from the half-open range [1, 100): the upper
# bound passed to randint is exclusive, so 100 itself is never produced.
general_random_numbers = np.random.randint(1, 100, size=10)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(general_random_numbers)
In [20]:
# Draw 10 samples from a normal (Gaussian) distribution with mean 0.2 and
# standard deviation 0.2.
# NOTE: despite the variable name, these samples are normally distributed,
# not uniformly distributed; the name is kept unchanged in case other cells
# refer to it.
uniform_rnd_numbers = np.random.normal(loc=0.2, scale=0.2, size=10)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(uniform_rnd_numbers)
In [21]:
# Same normal distribution (mean 0.2, standard deviation 0.2), but drawn as
# a 3x3 array by passing a shape tuple as ``size``.
# NOTE: despite the variable name, these samples are normally distributed,
# not uniformly distributed.
uniform_rnd_numbers = np.random.normal(loc=0.2, scale=0.2, size=(3, 3))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(uniform_rnd_numbers)