In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the optdigits TRAINING set from the UCI ML repository:
# 64 pixel-intensity feature columns + 1 label column (digit 0-9), no header row.
# NOTE(review): the variable name has a typo ("traiin"), but later cells
# reference it by this name, so it is kept unchanged.
digits_traiin = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra', header=None)

In [3]:
# Load the optdigits TEST set (same layout: 64 features + 1 label, no header).
digits_test = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes', header=None)

In [4]:
# Split the training frame: the first 64 columns are the pixel features,
# column 64 holds the digit label.
X_train = digits_traiin.iloc[:, :64]
Y_train = digits_traiin[64]

In [5]:
# Same split for the test frame.  NOTE(review): "X_tesy" is a typo for
# "X_test", but downstream cells use this name, so it must stay.
X_tesy = digits_test.iloc[:, :64]
Y_test = digits_test[64]

In [6]:
# Visualize the 2-D distribution of the handwritten-digit image features after PCA compression
from sklearn.decomposition import PCA

In [7]:
# Initialize a PCA that compresses the high-dimensional (64-D) feature
# vectors down to 2 dimensions, for plotting.
estimator = PCA(n_components=2)

In [8]:
# Fit the 2-component PCA on the training features and project them
# onto the two principal components.
X_pca = estimator.fit_transform(X_train)

In [9]:
def plot_pca_scatter():
    """Scatter-plot the PCA-reduced (2-D) training features, one color per digit 0-9.

    Reads the module-level ``X_pca`` (n_samples x 2 projection) and
    ``Y_train`` (digit labels); shows the figure via ``plt.show()``.
    """
    colors = ['black','blue','purple','yellow','white','red','lime','cyan','orange','gray']
    # Hoist the label array out of the loop: the original recomputed
    # Y_train.as_matrix() twice per iteration, and Series.as_matrix() is
    # deprecated (removed in pandas >= 1.0); .values is the compatible form.
    labels = Y_train.values
    for i in range(len(colors)):
        mask = labels == i
        plt.scatter(X_pca[mask, 0], X_pca[mask, 1], c=colors[i])
    # Legend entries are in the same order as the per-digit scatter calls.
    plt.legend(np.arange(0, 10).astype(str))
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.show()
plot_pca_scatter()


/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/collections.py:548: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == 'face':

In [10]:
# Train on PCA-compressed features instead, keeping 20 components.
# NOTE(review): this rebinds the name `estimator`, shadowing the
# 2-component PCA fitted above; later cells rely on this rebinding.
estimator = PCA(n_components=20)

In [11]:
# Fit the 20-component PCA on the training features and transform them.
pca_X_train = estimator.fit_transform(X_train)

In [12]:
# Apply the SAME training-fitted PCA to the test features
# (transform only -- never re-fit on test data).
pca_X_test = estimator.transform(X_tesy)

In [13]:
from sklearn.svm import LinearSVC

In [14]:
# Baseline: train a linear SVM on the raw 64-pixel features and
# predict labels for the test set.  (Python 2 print statement.)
svc = LinearSVC()
svc.fit(X_train, Y_train)
y_pred = svc.predict(X_tesy)
print y_pred


[0 1 1 ..., 8 9 8]

In [15]:
# A second linear SVM, to be trained on the 20-D PCA-compressed features.
pca_svc = LinearSVC()

In [16]:
# Fit on the PCA-compressed training features.
pca_svc.fit(pca_X_train, Y_train)


Out[16]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [17]:
# Predict test labels from the PCA-compressed test features.
pca_y_pred = pca_svc.predict(pca_X_test)

In [18]:
from sklearn.metrics import classification_report

In [19]:
y_pred = y_pred.reshape(-1,1)
pca_y_pred = pca_y_pred.reshape(-1,1)
print svc.score(X_tesy, Y_test)
print classification_report(Y_test, y_pred, target_names=np.arange(10).astype(str))
print pca_svc.score(pca_X_test, Y_test)
print classification_report(Y_test, pca_y_pred, target_names=np.arange(10).astype(str))


0.927100723428
             precision    recall  f1-score   support

          0       0.99      0.98      0.99       178
          1       0.75      0.98      0.85       182
          2       0.99      0.95      0.97       177
          3       0.98      0.92      0.95       183
          4       0.95      0.96      0.95       181
          5       0.89      0.96      0.93       182
          6       0.99      0.97      0.98       181
          7       0.98      0.91      0.94       179
          8       0.87      0.80      0.83       174
          9       0.95      0.84      0.89       180

avg / total       0.93      0.93      0.93      1797

0.923761825264
             precision    recall  f1-score   support

          0       0.97      0.97      0.97       178
          1       0.90      0.89      0.90       182
          2       0.97      0.98      0.97       177
          3       0.96      0.90      0.93       183
          4       0.94      0.97      0.96       181
          5       0.87      0.96      0.91       182
          6       0.97      0.97      0.97       181
          7       0.97      0.89      0.93       179
          8       0.86      0.82      0.84       174
          9       0.83      0.89      0.86       180

avg / total       0.93      0.92      0.92      1797