In this notebook, we build a machine learning model for the digit recognizer, using the precomputed features stored in char_features.csv: image moments (nu03, nu11, nu12), on-pixel statistics, and DCT coefficients.


In [1]:
%pylab inline
pylab.style.use('ggplot')

import numpy as np
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

In [2]:
import os

feature_file = os.path.join(os.getcwd(), 'char_features.csv')
data_df = pd.read_csv(feature_file, index_col=0)

In [3]:
data_df.describe().T


Out[3]:
                        count         mean          std          min          25%          50%          75%          max
nu03                   1496.0     0.000046     0.000287    -0.001102    -0.000118     0.000024     0.000194     0.001627
nu11                   1496.0     0.000031     0.000209    -0.001182    -0.000059     0.000018     0.000119     0.000932
nu12                   1496.0     0.000249     0.000523    -0.001331    -0.000106     0.000192     0.000563     0.002311
on_pixel_frac          1496.0     0.040004     0.019563     0.001957     0.027081     0.038540     0.051335     0.104639
on_pixel_x_var         1496.0     8.071443     2.986408     0.187500     6.664485     8.583488    10.064963    15.352538
on_pixel_y_var         1496.0     6.324439     3.035991     0.000000     4.716589     6.572320     8.047325    26.270145
dct_0                  1496.0  7857.911932    95.744775  7761.562500  7777.500000  7857.187500  7881.093750  8112.187500
dct_1                  1496.0    -9.349362    31.128139   -96.483588    -8.651894    -5.434853     2.311168    29.736777
dct_2                  1496.0    71.084518   166.104443  -206.684980   -52.205754    99.375435   220.018664   338.203586
dct_3                  1496.0   344.807495   115.873211    40.305293   301.514885   377.801365   431.492843   470.849634
dct_4                  1496.0     3.150628    12.267003   -12.147245    -3.031646    -1.904570     5.626301    32.509623
dct_5                  1496.0   226.313370   159.142899   -13.127919    69.061977   282.111186   373.779526   447.123852

In [4]:
# Drop the non-numeric font_name column so only the numeric features remain.
features = data_df.drop('font_name', axis=1)

# Confirm that every feature column contains only finite values.
np.isfinite(features).sum(axis=0)


Out[4]:
nu03              1496
nu11              1496
nu12              1496
on_pixel_frac     1496
on_pixel_x_var    1496
on_pixel_y_var    1496
dct_0             1496
dct_1             1496
dct_2             1496
dct_3             1496
dct_4             1496
dct_5             1496
dtype: int64
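
Every feature column reports 1496 finite values, so the matrix is clean. Had any infinities or NaNs shown up, one simple remedy would be to map infinities to NaN and drop the affected rows; a minimal sketch, not needed for this dataset:

# Hypothetical cleanup: convert +/-inf to NaN, then drop rows with missing values.
features = features.replace([np.inf, -np.inf], np.nan).dropna()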

Instance Counts Per Class


In [5]:
# The DataFrame index holds the class label for each sample.
data_df.index.value_counts().plot(kind='barh')


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x24da346d4e0>
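
Each of the eleven classes (the digits zero through nine plus the comma) occurs 136 times, so the dataset is perfectly balanced; printing the counts confirms what the bar chart shows:

print(data_df.index.value_counts())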

In [6]:
# Count the negative values in each feature column.
(features < 0).sum(axis=0)


Out[6]:
nu03              677
nu11              647
nu12              555
on_pixel_frac       0
on_pixel_x_var      0
on_pixel_y_var      0
dct_0               0
dct_1             952
dct_2             680
dct_3               0
dct_4             952
dct_5             136
dtype: int64

In [7]:
from sklearn.feature_selection import mutual_info_classif

# Estimate each feature's mutual information with the class labels,
# which are stored in the DataFrame index.
f_importances = mutual_info_classif(features, features.index)
f_importances = pd.Series(f_importances, index=features.columns)

f_importances.plot(kind='bar')
pylab.xticks(rotation=30)


Out[7]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 <a list of 12 Text xticklabel objects>)
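
Based on this ranking, the next cell keeps only the dct_* columns. Equivalently, scikit-learn can select the strongest features directly; a minimal sketch using SelectKBest, where k=6 is an assumption chosen to match the six DCT columns:

from sklearn.feature_selection import SelectKBest

# Hypothetical alternative to the manual column filter below: keep the six
# features with the highest estimated mutual information.
selector = SelectKBest(mutual_info_classif, k=6)
top_features = selector.fit_transform(features, features.index)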

In [8]:
# Keep only the DCT coefficients, based on the mutual-information ranking above.
dct_cols = features.columns.str.startswith('dct')
features = features.loc[:, dct_cols]

In [9]:
import seaborn as sns

f_corrs = features.corr()

_, ax = pylab.subplots(1, 1, figsize=(10, 10))
sns.heatmap(f_corrs, annot=True, ax=ax)


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x24da51075f8>
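
Beyond eyeballing the heatmap, the most strongly correlated feature pairs can be listed directly; a short sketch using the f_corrs matrix computed above:

# Keep the upper triangle (excluding the diagonal) so each pair appears once,
# then rank pairs by the absolute value of their correlation.
upper = f_corrs.where(np.triu(np.ones(f_corrs.shape, dtype=bool), k=1))
print(upper.stack().abs().sort_values(ascending=False).head())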

In [10]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline

In [11]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

normalizer = StandardScaler()
estimator = SVC(C=5000)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

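# Keeping the scaler inside the pipeline means it is re-fit on the training
# portion of each CV fold, so no scaling statistics leak from the held-out data.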
pipeline = Pipeline([
    ('normalizer', normalizer),
    ('estimator', estimator),
])

scores = cross_val_score(pipeline, features, features.index, cv=cv, scoring='accuracy')

scores = pd.Series(scores)
scores.plot(kind='bar')


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x24da5598a58>

In [12]:
scores.mean()


Out[12]:
1.0

In [13]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

normalizer = StandardScaler()
estimator = SVC(C=5000)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

pipeline = Pipeline([
    ('normalizer', normalizer),
    ('estimator', estimator),
])

predictions = cross_val_predict(pipeline, features, features.index, cv=cv)

report = classification_report(y_true=data_df.index.values, y_pred=predictions)
print(report)


             precision    recall  f1-score   support

      comma       1.00      1.00      1.00       136
      eight       1.00      1.00      1.00       136
       five       1.00      1.00      1.00       136
       four       1.00      1.00      1.00       136
       nine       1.00      1.00      1.00       136
        one       1.00      1.00      1.00       136
      seven       1.00      1.00      1.00       136
        six       1.00      1.00      1.00       136
      three       1.00      1.00      1.00       136
        two       1.00      1.00      1.00       136
       zero       1.00      1.00      1.00       136

avg / total       1.00      1.00      1.00      1496


In [14]:
from sklearn.metrics import confusion_matrix

chars = pd.unique(data_df.index)

# Passing labels=chars ensures that confusion_matrix orders the rows and columns
# in the same order as chars, so the tick labels align with the matrix entries.
cm = confusion_matrix(data_df.index, predictions, labels=chars)
sns.heatmap(cm, annot=True, xticklabels=chars, yticklabels=chars)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x24da56168d0>
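
As a possible final step (not part of the original notebook), the cross-validated pipeline could be refit on the full dataset and serialized for later use; a minimal sketch, assuming joblib is available and using a hypothetical file name:

from joblib import dump

# Refit on all 1496 samples, then persist the fitted pipeline to disk.
pipeline.fit(features, features.index)
dump(pipeline, 'digit_svc_pipeline.joblib')  # file name is an assumption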