In this notebook, we build a machine learning model for the digit recognizer, based on the eigenvalue features.
In [1]:
%pylab inline
pylab.style.use('ggplot')
import numpy as np
import pandas as pd
In [2]:
import os

# The feature table is expected next to wherever the notebook is run from;
# the first CSV column becomes the row index of the frame.
working_dir = os.getcwd()
feature_file = os.path.join(working_dir, 'char_features.csv')
data_df = pd.read_csv(feature_file, index_col=0)
In [3]:
# Summary statistics, transposed so that each described column occupies one row.
data_df.describe().transpose()
Out[3]:
In [4]:
# Every column except `font_name` is kept as a feature.
features = data_df.drop(columns='font_name')
# Per-column count of finite entries (i.e. neither NaN nor +/-inf).
np.isfinite(features).sum(axis='index')
Out[4]:
In [5]:
# Horizontal bar chart of how many rows carry each index label.
label_counts = data_df.index.value_counts()
label_counts.plot.barh()
Out[5]:
In [6]:
# Negative Values Check: number of entries below zero in each feature column.
features.lt(0).sum(axis='index')
Out[6]:
In [7]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between every feature column and the class label, which is
# stored in the frame's index. The estimator adds a small amount of random noise
# to continuous features, so the seed is pinned to keep the importances (and the
# plot below) identical across re-runs.
f_importances = mutual_info_classif(features, features.index, random_state=12345)
f_importances = pd.Series(f_importances, index=features.columns)
f_importances.plot(kind='bar')
# Tilt the feature names so neighbouring tick labels do not overlap.
pylab.xticks(rotation=30)
Out[7]:
In [8]:
# Boolean mask over the feature columns: True where the column name starts with 'dct'.
dct_cols = features.columns.str.startswith('dct')
# Keep only the masked columns. NOTE: this rebinds `features`, so cells above that
# read `features` see a different frame if they are re-run after this one.
features = features.loc[:, dct_cols]
In [9]:
import seaborn as sns

# Pairwise correlations between the retained feature columns, drawn as an
# annotated heatmap on a single 10x10 inch axes.
f_corrs = features.corr()
_, ax = pylab.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.heatmap(f_corrs, annot=True, ax=ax)
Out[9]:
In [10]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Seeded, shuffled 10-fold stratified splitter so the folds are reproducible.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

# Standardise the features, then classify with an SVM; wrapping both steps in a
# pipeline means the scaler is fitted on the training part of each fold only.
normalizer = StandardScaler()
estimator = SVC(C=5000)
pipeline = Pipeline(steps=[('normalizer', normalizer), ('estimator', estimator)])

# One accuracy value per fold; the class label is the frame's index.
fold_accuracy = cross_val_score(pipeline, features, features.index, cv=cv, scoring='accuracy')
scores = pd.Series(fold_accuracy)
scores.plot(kind='bar')
Out[11]:
In [12]:
# Average accuracy over the cross-validation folds held in `scores`.
scores.mean()
Out[12]:
In [13]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Build a scaler + SVM pipeline and a seeded 10-fold stratified splitter, then
# collect a cross-validated prediction for every sample.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)
normalizer = StandardScaler()
estimator = SVC(C=5000)
pipeline = Pipeline(steps=[('normalizer', normalizer), ('estimator', estimator)])
predictions = cross_val_predict(pipeline, features, features.index, cv=cv)

# Per-class metrics of the predictions against the labels held in the index.
report = classification_report(y_true=data_df.index.values, y_pred=predictions)
print(report)
In [14]:
from sklearn.metrics import confusion_matrix

chars = pd.unique(data_df.index)
# Passing labels=chars makes confusion_matrix order its rows and columns in the same order as chars,
# so the tick labels and the matrix cells are correctly aligned.
cm = confusion_matrix(data_df.index, predictions, labels=chars)
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format would
# render any count of 100 or more in two-significant-digit scientific notation.
cm_ax = sns.heatmap(cm, annot=True, fmt='d', xticklabels=chars, yticklabels=chars)
# confusion_matrix puts the true labels on the rows and the predictions on the columns.
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
Out[14]: