scikit-learn 机器学习
In [3]:
# Load the bundled iris dataset from scikit-learn (150 samples, 4 features).
from sklearn import datasets
iris = datasets.load_iris()
# Take the first 10 rows of the feature matrix
iris.data[0:10]
Out[3]:
要查看每一朵花卉所属的种类,可以访问 iris 的 target 属性
In [4]:
# Integer class label (0, 1, or 2) for each of the 150 samples.
iris.target
Out[4]:
总共150个数据,可能取值为0,1,2。分别表示不同鸢尾花的种类
In [5]:
# Human-readable species names corresponding to the 0/1/2 class codes.
iris.target_names
Out[5]:
In [32]:
import pandas as pd

# Build a labelled DataFrame from the raw iris arrays.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
frame = pd.DataFrame(iris.data, columns=feature_names)

# Map the numeric class codes (0/1/2) onto readable species names.
category_dict = dict(zip([0, 1, 2], ['setosa', 'versicolor', 'virginica']))
frame['kinds'] = pd.Series(iris.target).map(category_dict)

# Per-species summary statistics for every feature.
frame.groupby('kinds').describe()
Out[32]:
In [104]:
# Histograms of all four features for the first species group.
# NOTE(review): matplotlib.pyplot is only imported in a later cell, so a
# Restart-&-Run-All would fail here; import it locally as well.
import matplotlib.pyplot as plt

groups = list(frame.groupby('kinds'))
current_group = groups[0]
group_name, group_frame = current_group

plt.figure()
plt.suptitle(group_name)
# Top row gets a title, bottom row an x-label (matches the original layout).
layout = [(221, 'petal_length', 'r', plt.title),
          (222, 'petal_width', 'g', plt.title),
          (223, 'sepal_length', 'b', plt.xlabel),
          (224, 'sepal_width', 'y', plt.xlabel)]
for position, column, color, label_fn in layout:
    plt.subplot(position)
    plt.hist(group_frame[column], color=color)
    label_fn(column)
plt.show()
In [105]:
# Histograms of all four features for the second species group.
# NOTE(review): matplotlib.pyplot is only imported in a later cell, so a
# Restart-&-Run-All would fail here; import it locally as well.
import matplotlib.pyplot as plt

groups = list(frame.groupby('kinds'))
current_group = groups[1]
group_name, group_frame = current_group

plt.figure()
plt.suptitle(group_name)
# Top row gets a title, bottom row an x-label (matches the original layout).
layout = [(221, 'petal_length', 'r', plt.title),
          (222, 'petal_width', 'g', plt.title),
          (223, 'sepal_length', 'b', plt.xlabel),
          (224, 'sepal_width', 'y', plt.xlabel)]
for position, column, color, label_fn in layout:
    plt.subplot(position)
    plt.hist(group_frame[column], color=color)
    label_fn(column)
plt.show()
In [106]:
# Histograms of all four features for the third species group.
# NOTE(review): matplotlib.pyplot is only imported in a later cell, so a
# Restart-&-Run-All would fail here; import it locally as well.
import matplotlib.pyplot as plt

groups = list(frame.groupby('kinds'))
current_group = groups[2]
group_name, group_frame = current_group

plt.figure()
plt.suptitle(group_name)
# Top row gets a title, bottom row an x-label (matches the original layout).
layout = [(221, 'petal_length', 'r', plt.title),
          (222, 'petal_width', 'g', plt.title),
          (223, 'sepal_length', 'b', plt.xlabel),
          (224, 'sepal_width', 'y', plt.xlabel)]
for position, column, color, label_fn in layout:
    plt.subplot(position)
    plt.hist(group_frame[column], color=color)
    label_fn(column)
plt.show()
In [9]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpathes  # NOTE(review): unused in this cell — candidate for removal
from sklearn import datasets

# Scatter the samples in the sepal-length / sepal-width plane, colored by species.
iris = datasets.load_iris()
x = iris.data[:, 0]  # sepal length
y = iris.data[:, 1]  # sepal width
species = iris.target
# Pad the axis limits by half a unit on every side.
x_min, x_max = x.min() - 0.5, x.max() + 0.5
y_min, y_max = y.min() - 0.5, y.max() + 0.5
plt.figure()
plt.title(u'Iris Dataset - classification by sepal sizes')  # fixed "Datase" typo
plt.scatter(x, y, c=species)
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())  # hide tick marks — only the shape of the clusters matters
plt.yticks(())
plt.show()
In [10]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpathes  # NOTE(review): unused in this cell — candidate for removal
from sklearn import datasets

# Scatter the samples in the petal-length / petal-width plane, colored by species.
iris = datasets.load_iris()
x = iris.data[:, 2]  # petal length
y = iris.data[:, 3]  # petal width
species = iris.target
# Pad the axis limits by half a unit on every side.
x_min, x_max = x.min() - 0.5, x.max() + 0.5
y_min, y_max = y.min() - 0.5, y.max() + 0.5
plt.figure()
plt.title(u'Iris Dataset - classification by petal sizes')  # fixed "Datase" typo
plt.scatter(x, y, c=species)
plt.xlabel('petal length')
plt.ylabel('petal width')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())  # hide tick marks — only the shape of the clusters matters
plt.yticks(())
plt.show()
In [39]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
from sklearn import datasets

iris = datasets.load_iris()
species = iris.target

# Project the 4-D measurements onto the first three principal components.
pca = PCA(n_components=3)
x_reduced = pca.fit_transform(iris.data)
# print() form runs under both Python 2 and 3 (the original Py2-only
# statement form broke on Python 3); also fixed the "Convariance" typo.
print('Feature Covariance Matrix:')
print(pca.get_covariance())

# 3-D scatter of the samples in PCA space, colored by species.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')  # direct Axes3D(fig) is deprecated
ax.set_title('Iris Dataset by PCA', size=14)
ax.scatter(x_reduced[:, 0], x_reduced[:, 1], x_reduced[:, 2], c=species)
ax.set_xlabel('first eigenvector')
ax.set_ylabel('second eigenvector')
ax.set_zlabel('third eigenvector')
# Hide tick labels; the ax.w_?axis attributes were removed in matplotlib 3.8.
ax.set_xticklabels(())
ax.set_yticklabels(())
ax.set_zticklabels(())
plt.show()
针对已有的数据,创建一个分类器,给定一种鸢尾花的测量数据,判断该鸢尾花属于哪一种类别。k-邻近算法思想非常简单,对于已有的数据,构建一个分类器,对于未知的花卉数据,判断与其最近的k个已有花卉数据的种类,根据少数服从多数的原则,判断未知花卉的种类。在这里一个非常重要的概念是距离,为了方便,我们选择欧式距离作为距离度量。$$d=\sqrt{\Delta petal\_length^2 + \Delta petal\_width^2 + \Delta sepal\_length^2 + \Delta sepal\_width^2}$$ 为了验证模型的正确性,对于已有的150条数据,选择 $70\%$ 作为训练数据(training data),剩下的为验证数据(testing data)。
In [108]:
import numpy as np

# Fixed seed so the random train/test split is reproducible.
np.random.seed(0)
iris = datasets.load_iris()
x = iris.data
y = iris.target

# Shuffle the 150 sample indices; hold out the last 45 as the test set.
i = np.random.permutation(len(iris.data))
x_train, y_train = x[i[:-45]], y[i[:-45]]
x_test, y_test = x[i[-45:]], y[i[-45:]]

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
Out[108]:
In [109]:
# Predicted class labels for the 45 held-out test samples.
knn.predict(x_test)
Out[109]:
In [110]:
# Ground-truth labels for the test samples, for comparison with the predictions.
y_test
Out[110]:
In [112]:
# Element-wise difference between predictions and truth: non-zero = misclassified.
result = knn.predict(x_test) - (y_test)
result
Out[112]:
正确率为96%,再绘制出分界面
In [113]:
from matplotlib.colors import ListedColormap

# Fit a KNN classifier on the two sepal features and paint its decision regions.
x = iris.data[:, :2]
y = iris.target
x_min, x_max = x[:, 0].min() - .5, x[:, 0].max() + .5
y_min, y_max = x[:, 1].min() - .5, x[:, 1].max() + .5

cmap_light = ListedColormap(['#AAAAFF', '#AAFFAA', '#FFAAAA'])
h = .02  # mesh step size
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

knn = KNeighborsClassifier()
knn.fit(x, y)

# Classify every mesh point, then reshape back onto the grid.
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = knn.predict(grid_points).reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.show()
In [114]:
from matplotlib.colors import ListedColormap

# Same decision-region plot, this time on the two petal features.
x = iris.data[:, 2:4]
y = iris.target
x_min, x_max = x[:, 0].min() - .5, x[:, 0].max() + .5
y_min, y_max = x[:, 1].min() - .5, x[:, 1].max() + .5

cmap_light = ListedColormap(['#AAAAFF', '#AAFFAA', '#FFAAAA'])
h = .02  # mesh step size
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

knn = KNeighborsClassifier()
knn.fit(x, y)

# Classify every mesh point, then reshape back onto the grid.
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = knn.predict(grid_points).reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.show()
SVM(support vector machines) 是一系列机器学习方法, 可分为SVR(support vector regression,支持向量回归)和SVC(support vector classification,支持向量分类)
In [6]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# Eleven 2-D points: the first six belong to class 0, the last five to class 1.
x = np.array([[1, 3], [1, 2], [1, 1.5], [1.5, 2], [2, 3], [2.5, 1.5],
              [2, 1], [3, 1], [3, 2], [3.5, 1], [3.5, 3]])
y = [0 if idx < 6 else 1 for idx in range(len(x))]
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, alpha=0.9)
plt.show()
In [8]:
# Fit a linear SVM and shade its two decision regions.
svc = svm.SVC(kernel='linear').fit(x, y)
# Evaluate the decision function on a 200x200 grid over [0, 4] x [0, 4].
X, Y = np.mgrid[0:4:200j, 0:4:200j]
Z = svc.decision_function(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# BUG FIX: contour() expects plural keyword names (colors/linestyles/levels);
# the singular forms were invalid and had no effect.
plt.contour(X, Y, Z, colors=['k'], linestyles=['-'], levels=[0])
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, alpha=0.9)
plt.show()
In [10]:
# BUG FIX: predict() expects a 2-D array of samples; wrap the single point.
svc.predict([[1.5, 2.5]])
Out[10]:
In [11]:
# BUG FIX: predict() expects a 2-D array of samples; wrap the single point.
svc.predict([[2.5, 1]])
Out[11]:
正则化参数C对结果的影响
C越大,泛化能力越小
C越小,泛化能力越大
In [14]:
# Linear SVM with C=1; draw the margin lines and circle the support vectors.
svc = svm.SVC(kernel='linear', C=1).fit(x, y)
X, Y = np.mgrid[0:4:200j, 0:4:200j]
Z = svc.decision_function(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# BUG FIX: contour() expects plural keyword names (colors/linestyles/levels);
# the singular forms were invalid and had no effect.
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
            levels=[-1, 0, 1])
# Hollow circles mark the support vectors.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none')
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, alpha=0.9)
plt.show()
In [15]:
# Linear SVM with smaller C=0.1 (stronger regularization, wider margin).
svc = svm.SVC(kernel='linear', C=0.1).fit(x, y)
X, Y = np.mgrid[0:4:200j, 0:4:200j]
Z = svc.decision_function(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# BUG FIX: contour() expects plural keyword names (colors/linestyles/levels);
# the singular forms were invalid and had no effect.
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
            levels=[-1, 0, 1])
# Hollow circles mark the support vectors.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none')
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, alpha=0.9)
plt.show()
In [16]:
# Degree-3 polynomial kernel on the same toy data.
svc = svm.SVC(kernel='poly', C=1, degree=3).fit(x, y)
X, Y = np.mgrid[0:4:200j, 0:4:200j]
Z = svc.decision_function(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# BUG FIX: contour() expects plural keyword names (colors/linestyles/levels);
# the singular forms were invalid and had no effect.
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
            levels=[-1, 0, 1])
# Hollow circles mark the support vectors.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none')
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, alpha=0.9)
plt.show()
In [17]:
# RBF kernel with gamma=3 on the same toy data.
svc = svm.SVC(kernel='rbf', C=1, gamma=3).fit(x, y)
X, Y = np.mgrid[0:4:200j, 0:4:200j]
Z = svc.decision_function(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z > 0, alpha=0.4)
# BUG FIX: contour() expects plural keyword names (colors/linestyles/levels);
# the singular forms were invalid and had no effect.
plt.contour(X, Y, Z, colors=['k', 'k', 'k'], linestyles=['--', '-', '--'],
            levels=[-1, 0, 1])
# Hollow circles mark the support vectors.
plt.scatter(svc.support_vectors_[:, 0], svc.support_vectors_[:, 1],
            s=120, facecolors='none')
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, alpha=0.9)
plt.show()
In [22]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# Decision regions of a linear SVM on the two sepal features of iris.
iris = datasets.load_iris()
x = iris.data[:, :2]
y = iris.target

svc = svm.SVC(kernel='linear', C=1.0).fit(x, y)

x_min, x_max = x[:, 0].min() - .5, x[:, 0].max() + .5
y_min, y_max = x[:, 1].min() - .5, x[:, 1].max() + .5
h = 0.05  # mesh step (removed the dead "h = 0.5" that this shadowed)
X, Y = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = svc.predict(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z, alpha=0.4)
plt.contour(X, Y, Z, colors='k')
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()
In [23]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# Decision regions of a degree-3 polynomial SVM on the two sepal features.
iris = datasets.load_iris()
x = iris.data[:, :2]
y = iris.target

svc = svm.SVC(kernel='poly', C=1.0, degree=3).fit(x, y)

x_min, x_max = x[:, 0].min() - .5, x[:, 0].max() + .5
y_min, y_max = x[:, 1].min() - .5, x[:, 1].max() + .5
h = 0.05  # mesh step (removed the dead "h = 0.5" that this shadowed)
X, Y = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = svc.predict(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z, alpha=0.4)
plt.contour(X, Y, Z, colors='k')
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()
In [24]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# Decision regions of an RBF-kernel SVM (gamma=3) on the two sepal features.
iris = datasets.load_iris()
x = iris.data[:, :2]
y = iris.target

svc = svm.SVC(kernel='rbf', C=1.0, gamma=3).fit(x, y)

x_min, x_max = x[:, 0].min() - .5, x[:, 0].max() + .5
y_min, y_max = x[:, 1].min() - .5, x[:, 1].max() + .5
h = 0.05  # mesh step (removed the dead "h = 0.5" that this shadowed)
X, Y = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = svc.predict(np.c_[X.ravel(), Y.ravel()])
Z = Z.reshape(X.shape)
plt.contourf(X, Y, Z, alpha=0.4)
plt.contour(X, Y, Z, colors='k')
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()
In [ ]: