In [1]:
from numpy import *
import operator
def createDataSet():
    """Build the four-point toy training set used to try out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 numpy array of
        sample coordinates and ``labels`` is a list with the class name
        (``'A'`` or ``'B'``) of the corresponding row of ``group``.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classNames = ['A', 'A', 'B', 'B']
    return array(samples), classNames
先了解程序清单 2-1 要用到的几个函数
`tile` 函数将数组沿横向及纵向重复(复制)若干次,得到新的数组。例如 `tile(a, (2, 3))` 表示把 `a` 在行方向重复 2 次、在列方向重复 3 次。
In [2]:
from numpy import *
tile(1, 3)
Out[2]:
In [3]:
tile(2.5,(2,4))
Out[3]:
In [4]:
tile([1,3],(2,3))
Out[4]:
In [5]:
a=[1,3]
a=array(a)
tile(a,(2,3))
Out[5]:
`**` 是幂运算
In [6]:
3**2
Out[6]:
In [7]:
a=array([[1, 2], [3, 4]])
a**2
Out[7]:
In [8]:
b=mat([[1, 2], [3, 4]])
b**2
Out[8]:
可见幂运算对array来说是element-wise的,而对于matrix来说,就是矩阵的乘法做幂。
同样地,array的乘法是element-wise的,matrix的乘法就是线性代数的矩阵乘法
矩阵除法要用 `linalg.solve(A, B)`:它求解线性方程组 AX = B,即返回 A⁻¹B(相当于用 A 左除 B)。
In [9]:
linalg.solve(b**2,b)
Out[9]:
.I 求矩阵的逆
.T 求矩阵的转置
In [10]:
b.I
Out[10]:
In [11]:
b.T
Out[11]:
sum
求和
In [12]:
a=array([[1, 2],[3, 4]])
a.sum()
Out[12]:
In [13]:
a.sum(0)
Out[13]:
In [14]:
a.sum(1)
Out[14]:
`sum(0)` 按列求和,`sum(1)` 按行求和。
`min()` 和 `max()` 两个函数同样遵循这一约定:参数 0 对应按列,参数 1 对应按行。
In [15]:
a.min()
Out[15]:
In [16]:
a.min(0)
Out[16]:
In [17]:
a.min(1)
Out[17]:
dict.get(x,0)
在字典中查找指定的键对应的值,若找不到则返回第二个参数的值
In [18]:
d={'a':1,'b':2,'c':3,'d':4}
d.get('b')
Out[18]:
In [19]:
d.get('e')
该函数的签名为 `get(key, default=None)`:第二个参数默认为 `None`,因此若不指定第二个参数且找不到指定的键,函数返回 `None`(所以上面的单元格没有输出)。
In [20]:
d.get('e',5)
Out[20]:
Python 2.7 中,`dict.iteritems()` 返回迭代器,`dict.items()` 返回字典内容的复制(一个由键值对组成的列表)。
Python 3 中,`dict.items()` 返回可迭代的视图对象(不再复制),而 `dict.iteritems()` 该函数在 Python 3 中不存在了。
我用的是 Python 3,所以下面的代码中,我用的是 `dict.items()`。
`operator.itemgetter` 函数可以获取一个对象指定序号的数据。
`operator.itemgetter` 获取的不是值,而是一个函数,通过该函数作用到对象上才能获取值。
一般该函数用在 `sorted` 函数中。
使用前需要 `import operator` 模块。
In [21]:
a={'a':3,'b':2,'c':5,'d':1}
import operator
sorted(a.items(),key=operator.itemgetter(1),reverse=False)
Out[21]:
以上方法可以对字典按值排序。
`reverse=False` 时排序从小到大,`reverse=True` 则是从大到小。
In [22]:
sorted(a.items(),key=operator.itemgetter(0),reverse=False)
Out[22]:
以上方法按键排序
程序清单 2-1
In [23]:
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distance is Euclidean. When two classes receive the same number of votes,
    the class whose vote was seen first (i.e. from the nearer neighbour) wins,
    because the sort below is stable.

    Args:
        inX: Feature vector to classify; must broadcast against the rows of
            ``dataSet`` (e.g. shape ``(d,)`` or ``(1, d)``).
        dataSet: 2-D numpy array of training samples, one row per sample.
        labels: Sequence of class labels, ``labels[i]`` belonging to
            ``dataSet[i]``.
        k: Number of neighbours that vote. Values larger than the number of
            training rows are clamped to that number instead of failing.

    Returns:
        The label with the most votes among the nearest neighbours.

    Raises:
        IndexError: If ``k`` is smaller than 1, since no neighbour can vote.
    """
    dataSetSize = dataSet.shape[0]
    # Broadcasting subtracts every training row from inX without first
    # materialising a tiled copy of inX.
    diffMat = inX - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    sortedDistIndicies = distances.argsort()
    # Asking for more neighbours than there are samples used to run off the
    # end of the index array; use every available sample instead.
    neighbours = k if k < dataSetSize else dataSetSize
    classCount = {}
    for i in range(neighbours):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Stable sort by vote count, highest first.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
程序清单 2-2
In [24]:
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must contain at least four tab-separated fields: the
    first three are read as floating-point features and the last one as an
    integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array and ``classLabelVector`` is a list of ``n``
        ints, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: If a feature or label field cannot be converted to a number.
    """
    # The context manager closes the handle even when parsing fails.
    with open(filename) as fr:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no record; dropping them here keeps the matrix correctly sized.
        arrayOfLines = [line.strip() for line in fr if line.strip()]
    returnMat = zeros((len(arrayOfLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOfLines):
        listFromLine = line.split('\t')
        # Convert explicitly rather than relying on numpy's string-to-float cast.
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
In [25]:
datingDataMat, datingLabels = file2matrix('Ch02/datingTestSet2.txt')
In [26]:
datingDataMat
Out[26]:
In [27]:
datingLabels[0:20]
Out[27]:
In [28]:
%matplotlib inline
In [29]:
import matplotlib
import matplotlib.pyplot as plt
# Scatter column 0 of the dating data against column 2; both the marker size
# and the marker colour are scaled by the numeric class label of each row.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(
    datingDataMat[:, 0],
    datingDataMat[:, 2],
    s=10.0 * array(datingLabels),
    c=255.0 * array(datingLabels),
)
plt.show()
In [30]:
def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Each value is transformed as ``(value - column_min) / column_range``.
    A column whose values are all equal has a range of zero; dividing by it
    would fill the column with NaN, so such a column is divided by 1 instead
    and therefore normalises to all zeros.

    Args:
        dataset: 2-D numpy array, one sample per row and one feature per
            column. It is not modified.

    Returns:
        tuple: ``(normDataset, ranges, minVals)`` where ``normDataset`` has
        the same shape as ``dataset``, ``ranges`` holds the per-column divisor
        that was applied (``max - min``, or 1 for a constant column) and
        ``minVals`` holds the per-column minimum. New samples can be scaled
        the same way with ``(sample - minVals) / ranges``.
    """
    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    # Guard against 0/0 for constant columns; `ranges` is a fresh array, so
    # editing it in place does not touch the caller's data.
    ranges[ranges == 0] = 1
    # Broadcasting applies the per-column offset and divisor to every row.
    normDataset = (dataset - minVals) / ranges
    return normDataset, ranges, minVals
In [31]:
normMat, ranges, minVals = autoNorm(datingDataMat)
In [32]:
normMat
Out[32]:
In [33]:
ranges
Out[33]:
In [34]:
minVals
Out[34]:
In [35]:
def datingClassTest(hoRatio=0.1, filename='Ch02/datingTestSet2.txt', k=3):
    """Measure the kNN error rate on a hold-out slice of the dating data.

    The file is loaded with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are used as test samples and the
    remaining rows as the training set. One line is printed per test sample,
    followed by the overall error rate.

    Args:
        hoRatio: Fraction of the rows held out for testing. It must select at
            least one row, otherwise the final error-rate division fails.
        filename: Path of the tab-separated data file.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. Results are printed.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # The training slice is the same for every test sample, so build it once
    # instead of re-slicing the label list on each iteration.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, k)
        print("the classifier came back with %d,the real answer is %d" %(classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is %f" %(errorCount / float(numTestVecs)))
In [36]:
datingClassTest()
In [37]:
def classifyPerson():
    """Interactively predict how much the user would like a person.

    Prompts for three numeric features, normalises them with the minimum and
    range obtained from the dating data set, classifies the result with the
    three nearest neighbours and prints the matching verdict.

    Returns:
        None. The verdict is printed.
    """
    verdicts = ['not at all', 'in small doses', 'in large doses']
    gameShare = float(input("percentage of time spent playing video games?"))
    flierMiles = float(input("frequent flier miles earned consumed per year?"))
    iceCreamLiters = float(input("liters of ice cream consumed per year?"))

    features, knownLabels = file2matrix('Ch02/datingTestSet2.txt')
    scaledFeatures, spans, lows = autoNorm(features)

    # NOTE(review): the feature order (miles, games, ice cream) presumably
    # mirrors the column order of the data file — confirm against the file.
    rawQuery = array([flierMiles, gameShare, iceCreamLiters])
    scaledQuery = (rawQuery - lows) / spans
    predicted = classify0(scaledQuery, scaledFeatures, knownLabels, 3)

    # Labels are used as 1-based positions into the verdict list.
    print("you will probably like this person " + str(verdicts[predicted - 1]))
In [38]:
classifyPerson()
In [39]:
def img2vector(filename):
    """Flatten a 32x32 text image into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are read, and each
    character is converted with ``int`` (presumably 0/1 pixel values — confirm
    against the data files). Row ``i``, column ``j`` of the image is stored at
    index ``32 * i + j``.

    Args:
        filename: Path of the text image to read.

    Returns:
        numpy array of shape ``(1, 1024)`` holding the pixel values as floats.

    Raises:
        IndexError: If a line has fewer than 32 characters or the file has
            fewer than 32 lines.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a line is malformed; this
    # matters because the function is called once per image file.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
In [40]:
testVector = img2vector('Ch02/digits/trainingDigits/0_13.txt')
In [41]:
testVector[0, 0:31]
Out[41]:
In [42]:
from os import listdir
def _loadDigitDir(dirName):
    """Load every digit image in ``dirName`` into a matrix plus its labels.

    File names are expected to look like ``<digit>_<sample>.txt``; the part
    before the first underscore is parsed as the integer class label.

    Returns:
        tuple: ``(fileMat, fileLabels)`` where row ``i`` of the ``(n, 1024)``
        array ``fileMat`` is the image of the i-th directory entry and
        ``fileLabels[i]`` is its label.
    """
    fileList = listdir(dirName)
    fileLabels = []
    fileMat = zeros((len(fileList), 1024))
    for i, fileNameStr in enumerate(fileList):
        fileStr = fileNameStr.split('.')[0]
        fileLabels.append(int(fileStr.split('_')[0]))
        fileMat[i, :] = img2vector('%s/%s' % (dirName, fileNameStr))
    return fileMat, fileLabels


def handwritingClassTest(trainingDir='Ch02/digits/trainingDigits', testDir='Ch02/digits/testDigits', k=3):
    """Evaluate the kNN classifier on the handwritten-digit image files.

    Every file in ``trainingDir`` becomes a training sample. Every file in
    ``testDir`` is classified with ``classify0``; each misclassification is
    printed, followed by the number of errors and the error rate.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images. It must contain at least
            one file, otherwise the final error-rate division fails.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. Results are printed.
    """
    trainingMat, hwLabels = _loadDigitDir(trainingDir)
    testMat, testLabels = _loadDigitDir(testDir)
    mTest = len(testLabels)
    errorCount = 0
    for i in range(mTest):
        classNumStr = testLabels[i]
        classifierResult = classify0(testMat[i, :], trainingMat, hwLabels, k)
        if classifierResult != classNumStr:
            print("the classifier came back with %d, the real answer is %d " %(classifierResult, classNumStr))
            errorCount += 1
    print("the total number of errors is %d" %(errorCount))
    print("the total error rate is %f" %(errorCount / float(mTest)))
In [43]:
handwritingClassTest()
In [ ]: