In [1]:
%matplotlib inline
from matplotlib import pyplot as plt, cm
from skimage import io
from skimage import data, segmentation, filters, color, img_as_float, img_as_ubyte, exposure, feature, measure, morphology
from skimage.color import rgb2gray
from skimage.feature import hog
from skimage.morphology import square
import cv2
import numpy as np
from glob import glob
import os
from sklearn.metrics import classification_report
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from time import time

In [ ]:
# plan 1
# train data : captcha 1000 (feature : HOG)
# test data : 200 captcha images (feature : HOG)
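
In [ ]:
# Illustration (added sketch, not part of the original pipeline): with the hog() parameters
# used in the cells below, each 28x28 digit becomes a 36-dimensional descriptor
# (2x2 cells of 14x14 pixels, 1x1-cell blocks, 9 orientations -> 4 * 9 = 36 values).
demo_digit = np.zeros((28, 28))  # hypothetical blank digit, only to show the output shape
demo_fd = hog(demo_digit, orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1))
print demo_fd.shape  # (36,)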

In [38]:
# Turn the 200 new images into test data
p = "./data_test"
md5list = glob(os.path.join(p, "*.png"))
md5list = [os.path.split(fname)[1] for fname in md5list]
print "the number of files is %s" %len(md5list)

features = []
labels = []

t0 = time()

# Preprocess each captcha and turn it into a numpy array, MNIST-style
for fname in md5list:
    label = os.path.split(fname)[1].split("_")[1][:5]
    im = io.imread(os.path.join(p, fname))
    h, w, _ = im.shape

    # Whiten pixels whose R, G, and B values are all equal (grayscale background/noise)
    for y in range(h):
        for x in range(w):
            if im[y][x][0] == im[y][x][1] == im[y][x][2]:
                im[y][x][0] = 255
                im[y][x][1] = 255
                im[y][x][2] = 255

    im_gray = rgb2gray(im)
    im_gray = img_as_ubyte(im_gray)
    im_gray = morphology.opening(im_gray, square(2))
    im_gray_equalize = exposure.equalize_hist(im_gray)

    threshold = filters.threshold_otsu(im_gray_equalize).copy()
    threshold = im_gray_equalize < threshold
    threshold = img_as_ubyte(threshold)

    bw = morphology.closing(im_gray_equalize < threshold, square(3))
    cleared = bw.copy()

    im_th = cleared
    # Find the external contours, take their bounding boxes, and sort them left to right
    ctrs, hier = cv2.findContours(img_as_ubyte(im_th.copy()), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rects = [cv2.boundingRect(ctr) for ctr in ctrs]
    rects = sorted(rects, key=lambda tup: tup[0])

    # Skip captchas that do not segment into exactly 5 character regions
    if len(rects) != 5:
        continue


    for rect, l in zip(rects, label):
        # Draw the rectangles
        cv2.rectangle(threshold, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (0, 255, 0), 1) 

        # Make the rectangular region around the digit
        roi = threshold[rect[1]:rect[1]+rect[3], rect[0]:rect[0]+rect[2]]
        roi = cv2.resize(roi, (28, 28), interpolation=cv2.INTER_AREA)
        roi = morphology.closing(roi, square(4))
        
        features.append(roi.ravel())
        labels.append([l])

features = np.array(features, 'int16')
labels = np.array(labels, 'int').ravel()

# Print the shapes of features and labels
print features.shape
print labels.shape
print "elapsed time : ", round(time()-t0, 3), "s"

t0 = time()
list_hog_fd = []
for feature in features:
    fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')
print "escape time : ", round(time()-t0, 3), "s"

classifiers = glob("./pkl/hog/skt/*.pkl")

for classifier in classifiers:
    clf = joblib.load(classifier)
    print clf
    print classification_report(labels, clf.predict(hog_features))
    print accuracy_score(labels, clf.predict(hog_features))
    print confusion_matrix(labels, clf.predict(hog_features))
    print "=" * 100


the number of files is 200
(1000, 784)
(1000,)
elapsed time :  13.673 s
elapsed time :  0.936 s
BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
             precision    recall  f1-score   support

          0       0.92      0.79      0.85       103
          1       0.76      0.65      0.70       130
          2       0.86      0.73      0.79       132
          3       0.62      0.45      0.52        94
          4       0.81      0.88      0.84       134
          5       0.71      0.72      0.71       109
          6       0.82      0.32      0.46        57
          7       0.79      0.91      0.85       148
          8       0.45      0.83      0.58        93

avg / total       0.76      0.73      0.73      1000

0.731
[[ 81   1   2   3   9   0   0   1   6]
 [  0  85   1  10   0   0   1  31   2]
 [  0   5  97   4   0   6   0   2  18]
 [  0   1   0  42   1   9   2   0  39]
 [  1   4   2   1 118   2   0   0   6]
 [  3   6   3   4   6  78   0   1   8]
 [  2   7   4   0  10   6  18   0  10]
 [  1   1   2   0   1   3   0 135   5]
 [  0   2   2   4   1   6   1   0  77]]
====================================================================================================
DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=5,
            random_state=None, splitter='best')
             precision    recall  f1-score   support

          0       0.88      0.90      0.89       103
          1       0.95      0.98      0.97       130
          2       0.92      0.91      0.92       132
          3       0.92      0.83      0.87        94
          4       0.96      0.94      0.95       134
          5       0.84      0.84      0.84       109
          6       0.58      0.75      0.66        57
          7       0.96      0.91      0.94       148
          8       0.92      0.88      0.90        93

avg / total       0.90      0.90      0.90      1000

0.897
[[ 93   2   0   0   1   1   3   2   1]
 [  1 128   0   0   0   0   1   0   0]
 [  1   0 120   1   2   2   4   1   1]
 [  1   3   1  78   0   6   3   0   2]
 [  1   0   3   0 126   2   2   0   0]
 [  1   0   2   3   1  92   7   2   1]
 [  4   1   4   2   1   0  43   0   2]
 [  3   1   0   1   0   2   6 135   0]
 [  1   0   0   0   0   5   5   0  82]]
====================================================================================================
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')
             precision    recall  f1-score   support

          0       0.95      0.92      0.94       103
          1       0.93      1.00      0.96       130
          2       0.97      0.92      0.94       132
          3       0.97      0.95      0.96        94
          4       0.99      0.96      0.97       134
          5       0.96      0.91      0.93       109
          6       0.69      0.86      0.77        57
          7       0.97      0.97      0.97       148
          8       0.94      0.94      0.94        93

avg / total       0.95      0.94      0.94      1000

0.941
[[ 95   2   0   0   0   0   3   2   1]
 [  0 130   0   0   0   0   0   0   0]
 [  1   1 121   1   0   0   6   1   1]
 [  0   1   0  89   0   2   1   0   1]
 [  1   2   1   1 128   0   1   0   0]
 [  0   1   1   0   1  99   5   1   1]
 [  2   2   1   1   0   0  49   0   2]
 [  1   0   0   0   0   2   2 143   0]
 [  0   1   1   0   0   0   4   0  87]]
====================================================================================================
LinearSVC(C=100.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)
             precision    recall  f1-score   support

          0       0.93      0.92      0.93       103
          1       0.95      1.00      0.97       130
          2       0.93      0.92      0.92       132
          3       0.92      0.93      0.92        94
          4       0.98      0.95      0.96       134
          5       0.94      0.87      0.90       109
          6       0.67      0.72      0.69        57
          7       0.96      0.95      0.96       148
          8       0.88      0.91      0.89        93

avg / total       0.92      0.92      0.92      1000

0.922
[[ 95   2   0   1   0   0   2   1   2]
 [  0 130   0   0   0   0   0   0   0]
 [  0   0 121   1   0   2   4   2   2]
 [  0   1   1  87   0   2   1   0   2]
 [  1   1   2   0 127   0   2   0   1]
 [  2   1   1   1   0  95   6   1   2]
 [  4   0   3   4   2   0  41   1   2]
 [  0   1   1   0   1   1   2 141   1]
 [  0   1   1   1   0   1   3   1  85]]
====================================================================================================
LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
             precision    recall  f1-score   support

          0       0.93      0.92      0.93       103
          1       0.94      1.00      0.97       130
          2       0.93      0.93      0.93       132
          3       0.92      0.93      0.92        94
          4       0.98      0.95      0.97       134
          5       0.96      0.87      0.91       109
          6       0.68      0.75      0.72        57
          7       0.96      0.95      0.96       148
          8       0.89      0.91      0.90        93

avg / total       0.93      0.93      0.93      1000

0.926
[[ 95   2   0   1   0   0   2   1   2]
 [  0 130   0   0   0   0   0   0   0]
 [  0   1 123   1   0   0   5   1   1]
 [  0   1   1  87   0   2   0   0   3]
 [  1   1   2   0 127   0   2   0   1]
 [  2   1   1   1   0  95   6   1   2]
 [  4   1   3   4   1   0  43   1   0]
 [  0   1   1   0   1   1   2 141   1]
 [  0   0   1   1   0   1   3   2  85]]
====================================================================================================
Pipeline(steps=[('rbm', BernoulliRBM(batch_size=10, learning_rate=0.001, n_components=200, n_iter=20,
       random_state=None, verbose=True)), ('logistic', LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001))])
             precision    recall  f1-score   support

          0       0.71      0.92      0.81       103
          1       0.82      0.88      0.85       130
          2       0.96      0.92      0.94       132
          3       0.93      0.86      0.90        94
          4       0.98      0.96      0.97       134
          5       0.86      0.78      0.82       109
          6       0.00      0.00      0.00        57
          7       0.83      0.97      0.89       148
          8       0.77      0.91      0.83        93

avg / total       0.81      0.85      0.83      1000

0.853
[[ 95   3   0   0   0   0   0   2   3]
 [  0 115   0   0   0   0   0  15   0]
 [  0   5 121   1   0   1   0   3   1]
 [  0   2   2  81   0   7   0   0   2]
 [  0   4   2   0 128   0   0   0   0]
 [  8   6   0   1   0  85   0   4   5]
 [ 28   3   1   2   1   3   0   5  14]
 [  1   1   0   0   1   1   0 143   1]
 [  1   2   0   2   0   2   0   1  85]]
====================================================================================================
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=10, n_estimators=70, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)
             precision    recall  f1-score   support

          0       0.98      0.92      0.95       103
          1       0.94      1.00      0.97       130
          2       0.98      0.92      0.95       132
          3       0.98      0.94      0.96        94
          4       0.99      0.96      0.97       134
          5       0.93      0.89      0.91       109
          6       0.66      0.86      0.75        57
          7       0.97      0.97      0.97       148
          8       0.89      0.91      0.90        93

avg / total       0.94      0.94      0.94      1000

0.937
[[ 95   2   0   0   0   1   3   1   1]
 [  0 130   0   0   0   0   0   0   0]
 [  0   0 122   1   0   0   5   1   3]
 [  0   1   0  88   0   1   2   0   2]
 [  0   0   1   1 128   1   2   1   0]
 [  0   3   0   0   0  97   6   1   2]
 [  2   2   1   0   0   1  49   0   2]
 [  0   0   0   0   1   2   2 143   0]
 [  0   1   1   0   0   1   5   0  85]]
====================================================================================================
SVC(C=100.0, cache_size=1000, class_weight=None, coef0=0.0, degree=3,
  gamma=0.125, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.98      0.93      0.96       103
          1       0.93      1.00      0.96       130
          2       0.98      0.94      0.96       132
          3       0.92      0.96      0.94        94
          4       0.98      0.96      0.97       134
          5       0.97      0.91      0.94       109
          6       0.70      0.86      0.77        57
          7       0.98      0.97      0.97       148
          8       0.94      0.90      0.92        93

avg / total       0.95      0.94      0.94      1000

0.943
[[ 96   3   0   0   0   0   2   1   1]
 [  0 130   0   0   0   0   0   0   0]
 [  0   0 124   1   1   0   5   1   0]
 [  0   1   0  90   0   1   0   0   2]
 [  0   1   1   2 128   0   2   0   0]
 [  0   2   0   1   0  99   5   1   1]
 [  2   2   1   2   1   0  49   0   0]
 [  0   0   0   1   1   1   1 143   1]
 [  0   1   0   1   0   1   6   0  84]]
====================================================================================================

In [ ]:
# plan 2
# train data : captcha 1000 (feature : none, raw scaled pixels)
# test data : 200 captcha images (feature : none, raw scaled pixels)
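
In [ ]:
# Illustration with a hypothetical toy array (added sketch, not part of the original pipeline):
# plan 2 feeds the raw 28x28 = 784 pixel values, scaled column-wise by the scale() helper
# defined in the next cell. Because the denominator is max + eps rather than (max - min) + eps,
# a column only spans the full [0, 1] range when its minimum is 0, which holds for these
# 0/255 pixel features.
toy = np.array([[0., 255.], [255., 0.]])
print (toy - np.min(toy, axis=0)) / (np.max(toy, axis=0) + 0.001)  # ~[[0, 1], [1, 0]]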

In [40]:
# Turn the 200 new images into test data
p = "./data_test"
md5list = glob(os.path.join(p, "*.png"))
md5list = [os.path.split(fname)[1] for fname in md5list]
print "the number of files is %s" %len(md5list)

features = []
labels = []

t0 = time()

# Preprocess each captcha and turn it into a numpy array, MNIST-style
for fname in md5list:
    label = os.path.split(fname)[1].split("_")[1][:5]
    im = io.imread(os.path.join(p, fname))
    h, w, _ = im.shape

    # Whiten pixels whose R, G, and B values are all equal (grayscale background/noise)
    for y in range(h):
        for x in range(w):
            if im[y][x][0] == im[y][x][1] == im[y][x][2]:
                im[y][x][0] = 255
                im[y][x][1] = 255
                im[y][x][2] = 255

    im_gray = rgb2gray(im)
    im_gray = img_as_ubyte(im_gray)
    im_gray = morphology.opening(im_gray, square(2))
    im_gray_equalize = exposure.equalize_hist(im_gray)

    threshold = filters.threshold_otsu(im_gray_equalize).copy()
    threshold = im_gray_equalize < threshold
    threshold = img_as_ubyte(threshold)

    bw = morphology.closing(im_gray_equalize < threshold, square(3))
    cleared = bw.copy()

    im_th = cleared
    ctrs, hier = cv2.findContours(img_as_ubyte(im_th.copy()), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rects = [cv2.boundingRect(ctr) for ctr in ctrs]
    rects = sorted(rects, key=lambda tup: tup[0])

    if len(rects) != 5:
        continue


    for rect, l in zip(rects, label):
        # Draw the rectangles
        cv2.rectangle(threshold, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (0, 255, 0), 1) 

        # Make the rectangular region around the digit
        roi = threshold[rect[1]:rect[1]+rect[3], rect[0]:rect[0]+rect[2]]
        roi = cv2.resize(roi, (28, 28), interpolation=cv2.INTER_AREA)
        roi = morphology.closing(roi, square(4))
        
        features.append(roi.ravel())
        labels.append([l])

features = np.array(features, 'int16')
labels = np.array(labels, 'int').ravel()

# Print the shapes of features and labels
print features.shape
print labels.shape
print "elapsed time : ", round(time()-t0, 3), "s"

t0 = time()
def scale(X, eps = 0.001):
    # scale the data points so that the columns of the feature space
    # (i.e. the predictors) are within the range [0, 1]
    return (X - np.min(X, axis = 0)) / (np.max(X, axis = 0) + eps)

features = features.astype("float32")
features = scale(features)

print "escape time : ", round(time()-t0, 3), "s"

classifiers = glob("./pkl/scale/skt/*.pkl")

for classifier in classifiers:
    clf = joblib.load(classifier)
    print clf
    print classification_report(labels, clf.predict(features))
    print accuracy_score(labels, clf.predict(features))
    print confusion_matrix(labels, clf.predict(features))
    print "=" * 100


the number of files is 200
(1000, 784)
(1000,)
elapsed time :  13.622 s
elapsed time :  0.014 s
BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
             precision    recall  f1-score   support

          0       0.97      0.92      0.95       103
          1       0.87      0.95      0.90       130
          2       0.99      0.90      0.94       132
          3       0.99      0.93      0.96        94
          4       1.00      0.95      0.97       134
          5       0.93      0.85      0.89       109
          6       0.49      0.84      0.62        57
          7       0.98      0.97      0.97       148
          8       0.98      0.86      0.91        93

avg / total       0.94      0.92      0.92      1000

0.915
[[ 95   1   0   0   0   0   5   1   1]
 [  0 123   0   0   0   0   7   0   0]
 [  1   4 119   1   0   1   5   1   0]
 [  0   1   0  87   0   0   6   0   0]
 [  0   2   1   0 127   0   4   0   0]
 [  0   5   0   0   0  93  10   1   0]
 [  2   4   0   0   0   2  48   0   1]
 [  0   1   0   0   0   3   1 143   0]
 [  0   1   0   0   0   1  11   0  80]]
====================================================================================================
DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=50,
            random_state=None, splitter='best')
             precision    recall  f1-score   support

          0       0.95      0.90      0.93       103
          1       0.87      0.99      0.92       130
          2       0.95      0.89      0.92       132
          3       0.87      0.91      0.89        94
          4       0.91      0.94      0.92       134
          5       0.93      0.84      0.88       109
          6       0.71      0.81      0.75        57
          7       0.97      0.95      0.96       148
          8       0.99      0.88      0.93        93

avg / total       0.92      0.91      0.91      1000

0.911
[[ 93   2   0   2   1   0   3   1   1]
 [  0 129   0   1   0   0   0   0   0]
 [  0   2 117   4   3   0   4   2   0]
 [  0   3   0  86   3   1   0   1   0]
 [  1   2   1   2 126   1   1   0   0]
 [  0   5   2   2   2  92   5   1   0]
 [  2   4   0   1   2   2  46   0   0]
 [  1   0   2   1   0   2   2 140   0]
 [  1   2   1   0   2   1   4   0  82]]
====================================================================================================
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')
             precision    recall  f1-score   support

          0       0.94      0.92      0.93       103
          1       0.87      0.99      0.92       130
          2       0.98      0.91      0.94       132
          3       0.97      0.97      0.97        94
          4       0.99      0.96      0.97       134
          5       0.95      0.89      0.92       109
          6       0.82      0.82      0.82        57
          7       0.97      0.97      0.97       148
          8       0.90      0.92      0.91        93

avg / total       0.94      0.94      0.94      1000

0.937
[[ 95   2   0   0   1   1   2   1   1]
 [  0 129   0   0   0   0   0   0   1]
 [  1   4 120   3   0   0   0   2   2]
 [  0   1   0  91   0   0   1   0   1]
 [  1   1   1   0 128   0   1   0   2]
 [  0   5   0   0   0  97   2   2   3]
 [  3   5   1   0   0   1  47   0   0]
 [  0   1   0   0   0   2   1 144   0]
 [  1   1   1   0   0   1   3   0  86]]
====================================================================================================
LinearSVC(C=10.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)
             precision    recall  f1-score   support

          0       0.93      0.92      0.93       103
          1       0.93      0.99      0.96       130
          2       0.97      0.91      0.94       132
          3       0.97      0.93      0.95        94
          4       0.98      0.96      0.97       134
          5       0.96      0.89      0.92       109
          6       0.69      0.86      0.77        57
          7       0.95      0.96      0.96       148
          8       0.91      0.91      0.91        93

avg / total       0.94      0.93      0.93      1000

0.932
[[ 95   1   0   0   0   1   3   2   1]
 [  0 129   0   0   0   0   0   0   1]
 [  2   1 120   1   0   0   5   2   1]
 [  1   2   0  87   1   2   0   1   0]
 [  0   2   0   1 128   0   1   0   2]
 [  0   1   2   1   0  97   7   1   0]
 [  2   2   1   0   0   0  49   1   2]
 [  0   1   0   0   0   1   3 142   1]
 [  2   0   1   0   2   0   3   0  85]]
====================================================================================================
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
             precision    recall  f1-score   support

          0       0.96      0.92      0.94       103
          1       0.96      0.99      0.97       130
          2       0.96      0.92      0.94       132
          3       0.97      0.94      0.95        94
          4       0.96      0.96      0.96       134
          5       0.95      0.89      0.92       109
          6       0.72      0.86      0.78        57
          7       0.95      0.97      0.96       148
          8       0.93      0.94      0.93        93

avg / total       0.94      0.94      0.94      1000

0.939
[[ 95   0   0   0   1   1   3   2   1]
 [  0 129   0   0   0   0   0   0   1]
 [  1   1 122   1   0   1   4   1   1]
 [  1   1   0  88   1   2   0   1   0]
 [  0   1   1   0 129   0   1   0   2]
 [  0   0   2   1   1  97   6   2   0]
 [  2   2   1   0   0   0  49   1   2]
 [  0   1   0   0   1   1   2 143   0]
 [  0   0   1   1   1   0   3   0  87]]
====================================================================================================
Pipeline(steps=[('rbm', BernoulliRBM(batch_size=10, learning_rate=0.001, n_components=200, n_iter=80,
       random_state=None, verbose=True)), ('logistic', LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001))])
             precision    recall  f1-score   support

          0       0.97      0.92      0.95       103
          1       0.96      0.99      0.98       130
          2       0.95      0.92      0.93       132
          3       0.98      0.95      0.96        94
          4       0.98      0.96      0.97       134
          5       0.91      0.90      0.90       109
          6       0.68      0.82      0.75        57
          7       0.97      0.97      0.97       148
          8       0.91      0.92      0.92        93

avg / total       0.94      0.94      0.94      1000

0.937
[[ 95   0   1   0   1   1   3   1   1]
 [  0 129   0   0   0   0   0   1   0]
 [  1   0 121   1   0   1   5   1   2]
 [  0   1   0  89   0   0   3   0   1]
 [  0   1   1   0 129   2   1   0   0]
 [  0   2   1   0   0  98   5   1   2]
 [  2   0   1   1   1   3  47   0   2]
 [  0   0   1   0   0   2   2 143   0]
 [  0   1   1   0   1   1   3   0  86]]
====================================================================================================
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=20, n_estimators=80, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)
             precision    recall  f1-score   support

          0       0.98      0.92      0.95       103
          1       0.95      1.00      0.97       130
          2       0.98      0.92      0.95       132
          3       0.98      0.95      0.96        94
          4       0.98      0.96      0.97       134
          5       0.90      0.89      0.89       109
          6       0.68      0.88      0.76        57
          7       0.97      0.97      0.97       148
          8       0.96      0.92      0.94        93

avg / total       0.95      0.94      0.94      1000

0.942
[[ 95   1   0   0   1   1   3   1   1]
 [  0 130   0   0   0   0   0   0   0]
 [  0   0 122   2   0   1   4   2   1]
 [  0   1   1  89   0   2   1   0   0]
 [  0   0   1   0 129   2   2   0   0]
 [  0   2   0   0   0  97   8   1   1]
 [  2   2   0   0   0   2  50   0   1]
 [  0   0   0   0   0   2   2 144   0]
 [  0   1   0   0   1   1   4   0  86]]
====================================================================================================
SVC(C=10.0, cache_size=1000, class_weight=None, coef0=0.0, degree=3,
  gamma=0.03125, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.98      0.92      0.95       103
          1       0.98      1.00      0.99       130
          2       0.98      0.92      0.95       132
          3       0.95      0.96      0.95        94
          4       0.99      0.96      0.98       134
          5       0.89      0.93      0.91       109
          6       0.68      0.86      0.76        57
          7       0.99      0.97      0.98       148
          8       0.98      0.95      0.96        93

avg / total       0.95      0.95      0.95      1000

0.947
[[ 95   0   0   1   0   2   3   1   1]
 [  0 130   0   0   0   0   0   0   0]
 [  0   0 122   1   0   3   5   1   0]
 [  0   1   1  90   0   1   1   0   0]
 [  0   0   0   2 129   2   1   0   0]
 [  0   0   0   0   0 101   8   0   0]
 [  2   2   0   0   1   2  49   0   1]
 [  0   0   0   0   0   2   3 143   0]
 [  0   0   1   1   0   1   2   0  88]]
====================================================================================================

In [28]:
# plan 3
# train data : MNIST (feature : HOG)
# test data : 200 captcha images (feature : HOG)
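
In [ ]:
# Minimal sketch (an assumption, not the author's actual training code): the MNIST training
# features for plan 3 could be built with the same hog() parameters applied to the captcha
# digits below; fetch_mldata('MNIST original') is the classic sklearn way to pull MNIST.
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata('MNIST original')  # 70000 x 784 pixel matrix, digit labels in .target
mnist_hog = np.array([hog(img.reshape((28, 28)), orientations=9,
                          pixels_per_cell=(14, 14), cells_per_block=(1, 1))
                      for img in mnist.data], 'float64')
print mnist_hog.shape  # (70000, 36)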

In [23]:
# Turn the 200 new images into test data
p = "./data_test"
md5list = glob(os.path.join(p, "*.png"))
md5list = [os.path.split(fname)[1] for fname in md5list]
print "the number of files is %s" %len(md5list)

features = []
labels = []

t0 = time()

# Preprocess each captcha and turn it into a numpy array, MNIST-style
for fname in md5list:
    label = os.path.split(fname)[1].split("_")[1][:5]
    im = io.imread(os.path.join(p, fname))
    h, w, _ = im.shape

    # Whiten pixels whose R, G, and B values are all equal (grayscale background/noise)
    for y in range(h):
        for x in range(w):
            if im[y][x][0] == im[y][x][1] == im[y][x][2]:
                im[y][x][0] = 255
                im[y][x][1] = 255
                im[y][x][2] = 255

    im_gray = rgb2gray(im)
    im_gray = img_as_ubyte(im_gray)
    im_gray = morphology.opening(im_gray, square(2))
    im_gray_equalize = exposure.equalize_hist(im_gray)

    threshold = filters.threshold_otsu(im_gray_equalize).copy()
    threshold = im_gray_equalize < threshold
    threshold = img_as_ubyte(threshold)

    bw = morphology.closing(im_gray_equalize < threshold, square(3))
    cleared = bw.copy()

    im_th = cleared
    ctrs, hier = cv2.findContours(img_as_ubyte(im_th.copy()), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rects = [cv2.boundingRect(ctr) for ctr in ctrs]
    rects = sorted(rects, key=lambda tup: tup[0])

    if len(rects) != 5:
        continue


    for rect, l in zip(rects, label):
        # Draw the rectangles
        cv2.rectangle(threshold, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (0, 255, 0), 1) 

        # Make the rectangular region around the digit
        roi = threshold[rect[1]:rect[1]+rect[3], rect[0]:rect[0]+rect[2]]
        roi = cv2.resize(roi, (28, 28), interpolation=cv2.INTER_AREA)
        roi = morphology.closing(roi, square(4))
        
        features.append(roi.ravel())
        labels.append([l])

features = np.array(features, 'int16')
labels = np.array(labels, 'int').ravel()

# Print the shapes of features and labels
print features.shape
print labels.shape
print "elapsed time : ", round(time()-t0, 3), "s"

t0 = time()
list_hog_fd = []
for feature in features:
    fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')
print "escape time : ", round(time()-t0, 3), "s"


the number of files is 200
(1000, 784)
(1000,)
elapsed time :  10.812 s
elapsed time :  0.971 s

In [34]:
classifiers = glob("./pkl/hog/mnist/*.pkl")

for classifier in classifiers:
    clf = joblib.load(classifier)
    print clf
    print classification_report(labels, clf.predict(hog_features))
    print accuracy_score(labels, clf.predict(hog_features))
    print confusion_matrix(labels, clf.predict(hog_features))
    print "=" * 100


BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
             precision    recall  f1-score   support

          0       0.57      0.79      0.66       103
          1       0.00      0.00      0.00       130
          2       0.42      0.48      0.45       132
          3       0.23      0.44      0.30        94
          4       0.11      0.05      0.07       134
          5       0.11      0.06      0.07       109
          6       0.05      0.07      0.06        57
          7       0.56      0.72      0.63       148
          8       0.38      0.47      0.42        93

avg / total       0.28      0.35      0.31      1000

0.353
[[ 81   2   7   2   2   2   2   2   3]
 [  0   0  24   3  30   3   0  70   0]
 [  0   5  63  49   0   2   0   6   7]
 [  0   0   4  41   0  16   0   1  32]
 [ 38   2   8   5   7   0  68   1   5]
 [  6   6   5  66   1   6   3   2  14]
 [ 14   4   6   2   3  14   4   1   9]
 [  1   1  16   2  15   3   0 107   3]
 [  1   2  16  11   7   9   2   1  44]]
====================================================================================================
DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=10,
            random_state=None, splitter='best')
             precision    recall  f1-score   support

          0       0.66      0.80      0.72       103
          1       0.31      0.11      0.16       130
          2       0.63      0.30      0.41       132
          3       0.58      0.48      0.52        94
          4       0.57      0.51      0.54       134
          5       0.70      0.63      0.67       109
          6       0.25      0.16      0.19        57
          7       0.43      0.84      0.57       148
          8       0.05      0.09      0.07        93

avg / total       0.48      0.46      0.45      1000

0.46
[[ 82   3   0   2   1   0   4  10   1]
 [  4  14   0   2   1   0   0  33  76]
 [  1   4  40  14   1   7   0  47  18]
 [  0   1   3  45  27  11   0   7   0]
 [  2  12  11   1  69   0   1  31   7]
 [  6   5   2   6  10  69   3   5   3]
 [  9   3   4   1   5   3   9   5  18]
 [  2   1   0   0   1   3   1 124  16]
 [ 19   2   3   7   6   5  18  25   8]]
====================================================================================================
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=10, p=2, weights='uniform')
             precision    recall  f1-score   support

          0       0.55      0.93      0.69       103
          1       0.94      0.62      0.74       130
          2       0.94      0.78      0.85       132
          3       0.78      0.81      0.79        94
          4       0.95      0.93      0.94       134
          5       0.70      0.76      0.73       109
          6       0.08      0.02      0.03        57
          7       0.67      0.97      0.79       148
          8       0.32      0.20      0.25        93

avg / total       0.71      0.72      0.70      1000

0.725
[[ 96   0   0   0   0   0   0   4   3]
 [  0  80   0   0   2   0   0  48   0]
 [  0   0 103  17   1   1   0   4   6]
 [  0   0   0  76   1  12   0   3   2]
 [  0   4   2   0 124   0   1   2   1]
 [  8   0   2   5   1  83   0   2   8]
 [ 25   0   2   0   2   4   1   3  20]
 [  1   1   0   0   0   2   0 143   1]
 [ 44   0   0   0   0  17  10   3  19]]
====================================================================================================
LinearSVC(C=10.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)
             precision    recall  f1-score   support

          0       0.80      0.92      0.86       103
          1       0.00      0.00      0.00       130
          2       0.80      0.89      0.84       132
          3       0.99      0.73      0.84        94
          4       0.57      0.90      0.70       134
          5       0.49      0.84      0.62       109
          6       0.00      0.00      0.00        57
          7       0.64      0.97      0.77       148
          8       0.79      0.25      0.38        93

avg / total       0.58      0.66      0.59      1000

0.66
[[ 95   0   0   0   0   2   0   4   2]
 [  0   0   2   0  80   0   0  48   0]
 [  1   0 117   1   2   5   0   5   1]
 [  0   0   6  69   5  11   0   2   1]
 [  4   2   2   0 121   1   1   3   0]
 [  4   0   4   0   4  92   0   5   0]
 [ 14   1  12   0   1  18   0  10   1]
 [  0   1   0   0   0   3   0 143   1]
 [  1   0   3   0   0  54   8   4  23]]
====================================================================================================
LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
             precision    recall  f1-score   support

          0       0.77      0.92      0.84       103
          1       0.00      0.00      0.00       130
          2       0.76      0.92      0.83       132
          3       0.95      0.74      0.83        94
          4       0.57      0.90      0.70       134
          5       0.57      0.78      0.66       109
          6       0.00      0.00      0.00        57
          7       0.64      0.97      0.77       148
          8       0.83      0.41      0.55        93

avg / total       0.58      0.67      0.60      1000

0.673
[[ 95   0   1   0   0   1   0   4   2]
 [  0   0   2   0  80   0   0  48   0]
 [  1   0 121   1   2   1   0   5   1]
 [  0   0   6  70   4  11   0   2   1]
 [  4   2   3   0 121   0   1   3   0]
 [  4   0   8   3   4  85   0   5   0]
 [ 19   1  14   0   1  10   0   9   3]
 [  0   1   1   0   0   2   0 143   1]
 [  1   0   3   0   0  39   8   4  38]]
====================================================================================================
Pipeline(steps=[('rbm', BernoulliRBM(batch_size=10, learning_rate=0.001, n_components=200, n_iter=20,
       random_state=None, verbose=True)), ('logistic', LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001))])
             precision    recall  f1-score   support

          0       0.37      0.93      0.53       103
          1       0.50      0.04      0.07       130
          2       0.67      0.20      0.30       132
          3       0.45      0.80      0.57        94
          4       0.33      0.01      0.03       134
          5       0.79      0.49      0.60       109
          6       0.00      0.00      0.00        57
          7       0.43      0.97      0.60       148
          8       0.14      0.01      0.02        93

avg / total       0.44      0.40      0.32      1000

0.401
[[ 96   1   0   0   0   0   0   4   2]
 [  0   5   0   0   0   0   0 125   0]
 [  6   0  26  78   0   2   0  19   1]
 [  3   1   2  75   1   3   0   7   2]
 [ 13   0   1   0   2   0 115   3   0]
 [ 32   0   2  10   1  53   0  11   0]
 [ 39   0   6   3   1   0   0   7   1]
 [  3   2   0   0   0   0   0 143   0]
 [ 66   1   2   1   1   9   0  12   1]]
====================================================================================================
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)
             precision    recall  f1-score   support

          0       0.58      0.93      0.71       103
          1       0.07      0.01      0.01       130
          2       0.94      0.88      0.91       132
          3       0.63      0.84      0.72        94
          4       0.75      0.95      0.84       134
          5       0.52      0.50      0.51       109
          6       0.00      0.00      0.00        57
          7       0.56      0.97      0.71       148
          8       0.53      0.22      0.31        93

avg / total       0.54      0.64      0.56      1000

0.637
[[ 96   1   0   0   0   0   0   4   2]
 [  0   1   0   0  36   0   0  93   0]
 [  1   4 116   5   1   1   0   2   2]
 [  0   0   1  79   2  10   0   1   1]
 [  0   1   1   1 127   0   0   3   1]
 [  4   4   2  39   1  54   0   4   1]
 [ 29   2   3   1   2   6   0   4  10]
 [  0   1   0   1   0   1   0 144   1]
 [ 36   1   0   0   0  32   0   4  20]]
====================================================================================================
SVC(C=100.0, cache_size=1000, class_weight=None, coef0=0.0, degree=3,
  gamma=0.25, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.59      0.93      0.72       103
          1       0.89      0.62      0.73       130
          2       0.89      0.82      0.85       132
          3       0.78      0.80      0.79        94
          4       0.92      0.89      0.90       134
          5       0.64      0.80      0.71       109
          6       0.23      0.05      0.09        57
          7       0.65      0.95      0.77       148
          8       0.36      0.13      0.19        93

avg / total       0.70      0.72      0.69      1000

0.721
[[ 96   0   1   0   0   1   0   3   2]
 [  0  80   0   0   2   0   0  48   0]
 [  0   0 108  16   3   1   0   3   1]
 [  0   0   0  75   2  12   0   4   1]
 [  0   8   2   0 119   0   0   3   2]
 [  3   0   6   5   2  87   0   5   1]
 [ 29   1   3   0   1   2   3   6  12]
 [  0   1   1   0   0   3   0 141   2]
 [ 36   0   1   0   0  31  10   3  12]]
====================================================================================================

In [28]:
# plan 4
# train data : MNIST (feature : none, raw scaled pixels)
# test data : 200 captcha images (feature : none, raw scaled pixels)
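
In [ ]:
# Minimal sketch (an assumption, not the author's actual training code): plan 4 would train on
# the raw MNIST pixels, scaled column-wise the same way the scale() helper below treats the
# captcha test features (MNIST loaded as in the plan 3 sketch above).
mnist_features = mnist.data.astype("float32")
mnist_features = (mnist_features - np.min(mnist_features, axis=0)) / (np.max(mnist_features, axis=0) + 0.001)
print mnist_features.shape  # (70000, 784)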

In [2]:
# Turn the 200 new images into test data
p = "./data_test"
md5list = glob(os.path.join(p, "*.png"))
md5list = [os.path.split(fname)[1] for fname in md5list]
print "the number of files is %s" %len(md5list)

features = []
labels = []

t0 = time()

# Preprocess each captcha and turn it into a numpy array, MNIST-style
for fname in md5list:
    label = os.path.split(fname)[1].split("_")[1][:5]
    im = io.imread(os.path.join(p, fname))
    h, w, _ = im.shape

    # Whiten pixels whose R, G, and B values are all equal (grayscale background/noise)
    for y in range(h):
        for x in range(w):
            if im[y][x][0] == im[y][x][1] == im[y][x][2]:
                im[y][x][0] = 255
                im[y][x][1] = 255
                im[y][x][2] = 255

    im_gray = rgb2gray(im)
    im_gray = img_as_ubyte(im_gray)
    im_gray = morphology.opening(im_gray, square(2))
    im_gray_equalize = exposure.equalize_hist(im_gray)

    threshold = filters.threshold_otsu(im_gray_equalize).copy()
    threshold = im_gray_equalize < threshold
    threshold = img_as_ubyte(threshold)

    bw = morphology.closing(im_gray_equalize < threshold, square(3))
    cleared = bw.copy()

    im_th = cleared
    ctrs, hier = cv2.findContours(img_as_ubyte(im_th.copy()), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rects = [cv2.boundingRect(ctr) for ctr in ctrs]
    rects = sorted(rects, key=lambda tup: tup[0])

    if len(rects) != 5:
        continue


    for rect, l in zip(rects, label):
        # Draw the rectangles
        cv2.rectangle(threshold, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (0, 255, 0), 1) 

        # Make the rectangular region around the digit
        roi = threshold[rect[1]:rect[1]+rect[3], rect[0]:rect[0]+rect[2]]
        roi = cv2.resize(roi, (28, 28), interpolation=cv2.INTER_AREA)
        roi = morphology.closing(roi, square(4))
        
        features.append(roi.ravel())
        labels.append([l])

features = np.array(features, 'int16')
labels = np.array(labels, 'int').ravel()

# Print the shapes of features and labels
print features.shape
print labels.shape
print "elapsed time : ", round(time()-t0, 3), "s"

t0 = time()
def scale(X, eps = 0.001):
    # scale the data points so that the columns of the feature space
    # (i.e. the predictors) are within the range [0, 1]
    return (X - np.min(X, axis = 0)) / (np.max(X, axis = 0) + eps)

features = features.astype("float32")
features = scale(features)

print "escape time : ", round(time()-t0, 3), "s"


the number of files is 200
(1000, 784)
(1000,)
elapsed time :  12.148 s
elapsed time :  0.015 s
/Users/dikien/anaconda/lib/python2.7/site-packages/skimage/util/dtype.py:107: UserWarning: Possible precision loss when converting from float64 to uint8
  "%s to %s" % (dtypeobj_in, dtypeobj))

In [3]:
classifiers = glob("./pkl/scale/mnist/*.pkl")

for classifier in classifiers:
    clf = joblib.load(classifier)
    print clf
    print classification_report(labels, clf.predict(features))
    print accuracy_score(labels, clf.predict(features))
    print confusion_matrix(labels, clf.predict(features))
    print "=" * 100


BernoulliNB(alpha=1, binarize=0.0, class_prior=None, fit_prior=True)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       103
          1       0.00      0.00      0.00       130
          2       0.15      0.92      0.26       132
          3       0.00      0.00      0.00        94
          4       0.71      0.04      0.07       134
          5       0.03      0.02      0.02       109
          6       0.00      0.00      0.00        57
          7       0.00      0.00      0.00       148
          8       0.00      0.00      0.00        93

avg / total       0.12      0.13      0.05      1000

0.129
[[  0   1  99   1   0   2   0   0   0]
 [  0   0  17  80   1  19   0   0  13]
 [  0   4 122   0   0   6   0   0   0]
 [  0   0  87   0   0   7   0   0   0]
 [  0   1 122   0   5   6   0   0   0]
 [  0   4 102   0   1   2   0   0   0]
 [  0   2  45   2   0   7   0   1   0]
 [  0   1 143   0   0   4   0   0   0]
 [  0   1  81   0   0  11   0   0   0]]
====================================================================================================
DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=5,
            random_state=None, splitter='best')
             precision    recall  f1-score   support

          0       0.69      0.88      0.78       103
          1       0.00      0.00      0.00       130
          2       0.15      0.22      0.18       132
          3       0.18      0.47      0.26        94
          4       0.00      0.00      0.00       134
          5       0.63      0.68      0.65       109
          6       0.19      0.53      0.28        57
          7       0.47      0.16      0.24       148
          8       0.01      0.01      0.01        93

avg / total       0.26      0.29      0.25      1000

0.293
[[91  0  2  2  2  4  1  1  0]
 [ 1  0 37 75  0  0  8  9  0]
 [ 3  0 29 62  8  2 20  4  4]
 [ 3  0  7 44  2 28  7  2  1]
 [ 4  0 85 27  0  0 14  2  2]
 [16  1  4  2  2 74  1  5  4]
 [ 8  0  5  2  0  5 30  3  4]
 [ 1  0 15 31  0  2  1 24 74]
 [ 4  0  6  3  0  2 76  1  1]]
====================================================================================================
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')
             precision    recall  f1-score   support

          0       0.30      0.96      0.46       103
          1       0.09      0.02      0.04       130
          2       0.58      0.91      0.71       132
          3       0.45      0.91      0.60        94
          4       0.00      0.00      0.00       134
          5       1.00      0.09      0.17       109
          6       0.75      0.05      0.10        57
          7       0.13      0.01      0.02       148
          8       0.25      0.54      0.34        93

avg / total       0.36      0.37      0.26      1000

0.373
[[ 99   2   0   0   1   0   0   0   1]
 [  1   3   1  16   1   0   0  12  96]
 [  4   4 120   1   0   0   1   0   2]
 [  6   0   0  86   1   0   0   1   0]
 [124   1   3   0   0   0   0   0   6]
 [ 21   4   1  69   2  10   0   0   2]
 [ 32   2   2   0   1   0   3   0  17]
 [  1  18  79  21   2   0   0   2  25]
 [ 38   1   0   0   4   0   0   0  50]]
====================================================================================================
LinearSVC(C=10.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       103
          1       0.00      0.00      0.00       130
          2       0.04      0.16      0.06       132
          3       0.00      0.00      0.00        94
          4       0.00      0.00      0.00       134
          5       0.07      0.05      0.06       109
          6       0.01      0.02      0.01        57
          7       0.00      0.00      0.00       148
          8       0.00      0.00      0.00        93

avg / total       0.01      0.03      0.01      1000

0.027
[[  0   0  99   1   0   1   0   1   1]
 [  0   0  17 102   0   0   0  11   0]
 [  0   0  21  50   0  33  28   0   0]
 [  0   0  86   0   0   7   1   0   0]
 [  0   0 122   0   0   9   0   1   2]
 [  0   0 101   1   0   5   1   1   0]
 [  0   0  46   3   0   5   1   2   0]
 [  0   0   5  45   0   1  97   0   0]
 [  0   0  86   0   0   7   0   0   0]]
====================================================================================================
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       103
          1       0.00      0.00      0.00       130
          2       0.03      0.08      0.04       132
          3       0.15      0.88      0.26        94
          4       0.00      0.00      0.00       134
          5       0.29      0.05      0.08       109
          6       0.09      0.04      0.05        57
          7       0.00      0.00      0.00       148
          8       0.00      0.00      0.00        93

avg / total       0.06      0.10      0.04      1000

0.101
[[  0   0  95   7   0   1   0   0   0]
 [  0   0  17  89   0   0   0  20   4]
 [  0   0  11  98   0   4  19   0   0]
 [  0   0  10  83   0   1   0   0   0]
 [  0   0 120   0   0   1   0   1  12]
 [  0   0   8  95   0   5   0   0   1]
 [  0   0  41  10   0   3   2   0   1]
 [  0   0   1 145   0   1   1   0   0]
 [  0   0  82  10   0   1   0   0   0]]
====================================================================================================
Pipeline(steps=[('rbm', BernoulliRBM(batch_size=10, learning_rate=0.01, n_components=200, n_iter=20,
       random_state=None, verbose=True)), ('logistic', LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001))])
             precision    recall  f1-score   support

          0       0.25      0.01      0.02       103
          1       0.00      0.00      0.00       130
          2       0.13      0.94      0.23       132
          3       0.00      0.00      0.00        94
          4       0.06      0.01      0.01       134
          5       0.00      0.00      0.00       109
          6       0.00      0.00      0.00        57
          7       0.00      0.00      0.00       148
          8       0.00      0.00      0.00        93

avg / total       0.05      0.13      0.03      1000

0.126
[[  1   0 101   0   1   0   0   0   0]
 [  0   0 120   9   1   0   0   0   0]
 [  0   0 124   4   4   0   0   0   0]
 [  0   0  94   0   0   0   0   0   0]
 [  1   0 132   0   1   0   0   0   0]
 [  0   0 104   0   5   0   0   0   0]
 [  0   0  55   0   2   0   0   0   0]
 [  2   0 144   1   1   0   0   0   0]
 [  0   0  92   0   1   0   0   0   0]]
====================================================================================================
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=80, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)
             precision    recall  f1-score   support

          0       0.50      0.93      0.65       103
          1       0.00      0.00      0.00       130
          2       0.48      0.84      0.61       132
          3       0.30      0.81      0.44        94
          4       0.14      0.01      0.01       134
          5       0.46      0.06      0.10       109
          6       1.00      0.04      0.07        57
          7       0.06      0.01      0.01       148
          8       0.28      0.86      0.42        93

avg / total       0.30      0.37      0.25      1000

0.373
[[ 96   0   2   0   1   0   0   1   3]
 [  1   0   0  77   1   0   0   3  48]
 [  3   0 111   1   0   0   0   4  13]
 [  3   0   0  76   1   7   0   0   7]
 [ 50   0   3   0   1   0   0   1  79]
 [ 14   0   3  76   1   6   0   4   5]
 [ 16   0   3   0   1   0   2   2  33]
 [  3   0 103  23   1   0   0   1  17]
 [  6   0   6   0   0   0   0   1  80]]
====================================================================================================
SVC(C=10.0, cache_size=1000, class_weight=None, coef0=0.0, degree=3,
  gamma=0.03125, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       103
          1       0.00      0.00      0.00       130
          2       0.13      0.97      0.23       132
          3       0.00      0.00      0.00        94
          4       0.00      0.00      0.00       134
          5       0.00      0.00      0.00       109
          6       0.00      0.00      0.00        57
          7       0.00      0.00      0.00       148
          8       0.00      0.00      0.00        93

avg / total       0.02      0.13      0.03      1000

0.128
[[  0   1 102   0   0   0   0   0   0]
 [  0   0 130   0   0   0   0   0   0]
 [  0   4 128   0   0   0   0   0   0]
 [  0   0  94   0   0   0   0   0   0]
 [  0   1 133   0   0   0   0   0   0]
 [  0   4 105   0   0   0   0   0   0]
 [  0   2  55   0   0   0   0   0   0]
 [  0   1 147   0   0   0   0   0   0]
 [  0   1  92   0   0   0   0   0   0]]
====================================================================================================
/Users/dikien/anaconda/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [53]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, labels=labels):
    # Plot a confusion matrix as a heat map, with tick labels in the sorted class order
    # that sklearn's confusion_matrix uses for its rows and columns
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    classes = sorted(set(labels))
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [54]:
# Confusion matrix of SVC on Plan1
# Turn the 200 new images into test data
p = "./data_test"
md5list = glob(os.path.join(p, "*.png"))
md5list = [os.path.split(fname)[1] for fname in md5list]
print "the number of files is %s" %len(md5list)

features = []
labels = []

t0 = time()

# Preprocess each captcha and turn it into a numpy array, MNIST-style
for fname in md5list:
    label = os.path.split(fname)[1].split("_")[1][:5]
    im = io.imread(os.path.join(p, fname))
    h, w, _ = im.shape

    # Whiten pixels whose R, G, and B values are all equal (grayscale background/noise)
    for y in range(h):
        for x in range(w):
            if im[y][x][0] == im[y][x][1] == im[y][x][2]:
                im[y][x][0] = 255
                im[y][x][1] = 255
                im[y][x][2] = 255

    im_gray = rgb2gray(im)
    im_gray = img_as_ubyte(im_gray)
    im_gray = morphology.opening(im_gray, square(2))
    im_gray_equalize = exposure.equalize_hist(im_gray)

    threshold = filters.threshold_otsu(im_gray_equalize).copy()
    threshold = im_gray_equalize < threshold
    threshold = img_as_ubyte(threshold)

    bw = morphology.closing(im_gray_equalize < threshold, square(3))
    cleared = bw.copy()

    im_th = cleared
    ctrs, hier = cv2.findContours(img_as_ubyte(im_th.copy()), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rects = [cv2.boundingRect(ctr) for ctr in ctrs]
    rects = sorted(rects, key=lambda tup: tup[0])

    if len(rects) != 5:
        continue


    for rect, l in zip(rects, label):
        # Draw the rectangles
        cv2.rectangle(threshold, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (0, 255, 0), 1) 

        # Make the rectangular region around the digit
        roi = threshold[rect[1]:rect[1]+rect[3], rect[0]:rect[0]+rect[2]]
        roi = cv2.resize(roi, (28, 28), interpolation=cv2.INTER_AREA)
        roi = morphology.closing(roi, square(4))
        
        features.append(roi.ravel())
        labels.append([l])

features = np.array(features, 'int16')
labels = np.array(labels, 'int').ravel()

# Print the shapes of features and labels
print features.shape
print labels.shape
print "elapsed time : ", round(time()-t0, 3), "s"

t0 = time()
list_hog_fd = []
for feature in features:
    fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')
print "escape time : ", round(time()-t0, 3), "s"

clf = joblib.load("./pkl/hog/skt/digits_SVC.pkl")

# Compute confusion matrix
cm = confusion_matrix(labels, clf.predict(hog_features))
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm, labels=labels)

# Normalize the confusion matrix by row (i.e by the number of samples
# in each class)
# cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# print('Normalized confusion matrix')
# print(cm_normalized)
# plt.figure()
# plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix', labels=labels)


the number of files is 200
(1000, 784)
(1000,)
elapsed time :  13.41 s
elapsed time :  0.953 s
Confusion matrix, without normalization
[[ 96   3   0   0   0   0   2   1   1]
 [  0 130   0   0   0   0   0   0   0]
 [  0   0 124   1   1   0   5   1   0]
 [  0   1   0  90   0   1   0   0   2]
 [  0   1   1   2 128   0   2   0   0]
 [  0   2   0   1   0  99   5   1   1]
 [  2   2   1   2   1   0  49   0   0]
 [  0   0   0   1   1   1   1 143   1]
 [  0   1   0   1   0   1   6   0  84]]

In [56]:
# Confusion matrix of KNeighbors on Plan3

# Turn the 200 new images into test data
p = "./data_test"
md5list = glob(os.path.join(p, "*.png"))
md5list = [os.path.split(fname)[1] for fname in md5list]
print "the number of files is %s" %len(md5list)

features = []
labels = []

t0 = time()

# Preprocess each captcha and turn it into a numpy array, MNIST-style
for fname in md5list:
    label = os.path.split(fname)[1].split("_")[1][:5]
    im = io.imread(os.path.join(p, fname))
    h, w, _ = im.shape

    # Whiten pixels whose R, G, and B values are all equal (grayscale background/noise)
    for y in range(h):
        for x in range(w):
            if im[y][x][0] == im[y][x][1] == im[y][x][2]:
                im[y][x][0] = 255
                im[y][x][1] = 255
                im[y][x][2] = 255

    im_gray = rgb2gray(im)
    im_gray = img_as_ubyte(im_gray)
    im_gray = morphology.opening(im_gray, square(2))
    im_gray_equalize = exposure.equalize_hist(im_gray)

    threshold = filters.threshold_otsu(im_gray_equalize).copy()
    threshold = im_gray_equalize < threshold
    threshold = img_as_ubyte(threshold)

    bw = morphology.closing(im_gray_equalize < threshold, square(3))
    cleared = bw.copy()

    im_th = cleared
    ctrs, hier = cv2.findContours(img_as_ubyte(im_th.copy()), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rects = [cv2.boundingRect(ctr) for ctr in ctrs]
    rects = sorted(rects, key=lambda tup: tup[0])

    if len(rects) != 5:
        continue


    for rect, l in zip(rects, label):
        # Draw the rectangles
        cv2.rectangle(threshold, (rect[0], rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (0, 255, 0), 1) 

        # Make the rectangular region around the digit
        roi = threshold[rect[1]:rect[1]+rect[3], rect[0]:rect[0]+rect[2]]
        roi = cv2.resize(roi, (28, 28), interpolation=cv2.INTER_AREA)
        roi = morphology.closing(roi, square(4))
        
        features.append(roi.ravel())
        labels.append([l])

features = np.array(features, 'int16')
labels = np.array(labels, 'int').ravel()

# Print the shapes of features and labels
print features.shape
print labels.shape
print "elapsed time : ", round(time()-t0, 3), "s"

t0 = time()
list_hog_fd = []
for feature in features:
    fd = hog(feature.reshape((28, 28)), orientations=9, pixels_per_cell=(14, 14), cells_per_block=(1, 1), visualise=False)
    list_hog_fd.append(fd)
hog_features = np.array(list_hog_fd, 'float64')
print "escape time : ", round(time()-t0, 3), "s"

clf = joblib.load("./pkl/hog/mnist/digits_KNeighborsClassifier.pkl")

# Compute confusion matrix
cm = confusion_matrix(labels, clf.predict(hog_features))
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm, labels=labels)


the number of files is 200
(1000, 784)
(1000,)
elapsed time :  13.174 s
elapsed time :  0.874 s
Confusion matrix, without normalization
[[ 96   0   0   0   0   0   0   4   3]
 [  0  80   0   0   2   0   0  48   0]
 [  0   0 103  17   1   1   0   4   6]
 [  0   0   0  76   1  12   0   3   2]
 [  0   4   2   0 124   0   1   2   1]
 [  8   0   2   5   1  83   0   2   8]
 [ 25   0   2   0   2   4   1   3  20]
 [  1   1   0   0   0   2   0 143   1]
 [ 44   0   0   0   0  17  10   3  19]]