In [1]:
import pandas as pd
import numpy as np
%pylab inline
pylab.style.use('ggplot')
In [2]:
# Letter-recognition dataset from the UCI Machine Learning repository.
# The CSV has no header row (header=None), so columns are attached later.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
letter_df = pd.read_csv(url, header=None)
In [4]:
# Quick preview of the raw frame (columns are still unnamed integers).
letter_df.head()
Out[4]:
Next we attach the column names.
In [6]:
s = """ 1. lettr capital letter (26 values from A to Z)
2. x-box horizontal position of box (integer)
3. y-box vertical position of box (integer)
4. width width of box (integer)
5. high height of box (integer)
6. onpix total # on pixels (integer)
7. x-bar mean x of on pixels in box (integer)
8. y-bar mean y of on pixels in box (integer)
9. x2bar mean x variance (integer)
10. y2bar mean y variance (integer)
11. xybar mean x y correlation (integer)
12. x2ybr mean of x * x * y (integer)
13. xy2br mean of x * y * y (integer)
14. x-ege mean edge count left to right (integer)
15. xegvy correlation of x-ege with y (integer)
16. y-ege mean edge count bottom to top (integer)
17. yegvx correlation of y-ege with x (integer)"""
lines = [l.strip() for l in s.split('\n')]
feature_names = [l.split()[1] for l in lines]
feature_names = [f.replace('-', '_') for f in feature_names]
In [7]:
# Attach the parsed attribute names; the first column ('lettr') is the label.
letter_df.columns = feature_names
In [8]:
# Re-check the frame now that the columns are named.
letter_df.head()
Out[8]:
In [11]:
# Class-balance check: per-letter frequencies as horizontal bars,
# sorted in reverse alphabetical order before plotting.
letter_counts = letter_df['lettr'].value_counts()
reverse_alpha = letter_counts.sort_index(ascending=False)
reverse_alpha.plot(kind='barh')
Out[11]:
All the classes are represented in a fairly balanced manner, so in this instance we do not need to take extra steps to address class imbalance.
In [18]:
# Split predictors from the target label.
features_df = letter_df.drop('lettr', axis=1)
letters = letter_df['lettr']
import seaborn as sns  # NOTE(review): better placed in the top import cell
# Pairwise feature correlations, rendered as an annotated heatmap.
f_corrs = features_df.corr()
fig, ax = pylab.subplots(figsize=(12, 12))
sns.heatmap(f_corrs, annot=True, ax=ax)
Out[18]:
The first 5 features, x_box, y_box, width, high, onpix are highly correlated with each other.
Note: it is the chi-squared (`chi2`) feature-selection score, not the ANOVA F-test (`f_classif`), that requires all feature values to be non-negative; still, let us verify that none of the features take negative values.
In [28]:
# Count negative entries per feature column. A boolean reduction states the
# intent directly; the original mask-and-sum (features_df[features_df < 0]
# .sum(axis=0)) relied on NaN-skipping sums, where an all-NaN column
# silently prints 0.0 and obscures what is actually being checked.
(features_df < 0).sum(axis=0)
Out[28]:
The above condition holds in our case - none of the features have negative values.
In [19]:
from sklearn.feature_selection import f_classif
In [26]:
# ANOVA F-test of each feature against the class labels. f_classif returns
# F statistics and p-values, so name the first array accordingly — the
# original `t_stats` name was misleading (these are not t statistics).
f_stats, p_vals = f_classif(features_df, letters)
f_test_results = pd.DataFrame(np.column_stack([f_stats, p_vals]),
                              index=features_df.columns.copy(),
                              columns=['test_statistic', 'p_value'])
In [27]:
# One bar subplot per column: F statistic and p-value for each feature.
f_test_results.plot(kind='bar', subplots=True)
Out[27]:
In [44]:
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
In [45]:
# 10-fold stratified CV of Gaussian Naive Bayes on the 5 features with the
# highest ANOVA F scores, scored by macro-averaged F1.
estimator = GaussianNB()
selector = SelectKBest(f_classif, k=5)
pipeline = Pipeline([
    ('selector', selector),
    ('model', estimator)
])
# random_state pins the shuffled fold assignment so the scores are
# reproducible on a fresh run and comparable across cells.
cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, features_df, letters,
                         cv=cross_validator, scoring='f1_macro')
score_1 = pd.Series(scores)
In [46]:
# Per-fold macro-F1 of the top-5-feature Naive Bayes model.
score_1.plot(kind='bar')
Out[46]:
In [47]:
# Same CV protocol as above, now keeping the top 10 features.
estimator = GaussianNB()
selector = SelectKBest(f_classif, k=10)
pipeline = Pipeline([
    ('selector', selector),
    ('model', estimator)
])
# random_state pins the shuffled fold assignment for reproducibility.
cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, features_df, letters,
                         cv=cross_validator, scoring='f1_macro')
score_2 = pd.Series(scores)
In [48]:
# Side-by-side per-fold scores: top-5 vs top-10 features.
combined_scores = pd.concat([score_1, score_2], axis=1, keys=['cv_top5', 'cv_top10'])
combined_scores.plot(kind='bar')
Out[48]:
In [49]:
# Same CV protocol again, keeping the top 15 of the 16 features.
estimator = GaussianNB()
selector = SelectKBest(f_classif, k=15)
pipeline = Pipeline([
    ('selector', selector),
    ('model', estimator)
])
# random_state pins the shuffled fold assignment for reproducibility.
cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, features_df, letters,
                         cv=cross_validator, scoring='f1_macro')
score_3 = pd.Series(scores)
In [50]:
# Side-by-side per-fold scores: top-10 vs top-15 features.
combined_scores2 = pd.concat([score_2, score_3], axis=1, keys=['cv_top10', 'cv_top15'])
combined_scores2.plot(kind='bar')
Out[50]:
We get a significant (in layman's terms, not in the statistical-testing sense) boost in accuracy by going from the top 5 to the top 10 features, but the further improvement from the top 15 features over the top 10 is marginal.
In [54]:
# Pairwise scatter plots of the five features with the largest F statistics,
# colored by letter class, to eyeball separability.
top_5_feature_names = f_test_results.nlargest(5, columns='test_statistic').index
pairplot_df = features_df.loc[:, top_5_feature_names].copy()
pairplot_df['letter'] = letters
sns.pairplot(pairplot_df, hue='letter')
Out[54]:
In [58]:
from sklearn.svm import SVC
In [59]:
# Same selection + 10-fold CV protocol with an RBF-kernel SVM (C=100)
# instead of Naive Bayes, restricted to the top 5 features.
estimator = SVC(C=100.0, kernel='rbf')
selector = SelectKBest(f_classif, k=5)
pipeline = Pipeline([
    ('selector', selector),
    ('model', estimator)
])
# random_state pins the shuffled fold assignment for reproducibility.
cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, features_df, letters,
                         cv=cross_validator, scoring='f1_macro')
svm_5 = pd.Series(scores)
In [60]:
# Per-fold macro-F1 of the SVM restricted to the top 5 features.
svm_5.plot(kind='bar', title='10 Fold CV with SVM (top 5 features)')
Out[60]:
In [62]:
# Naive Bayes with 15 features vs SVM with only 5, per fold.
combined_3 = pd.concat([score_3, svm_5], axis=1, keys=['Gaussian_15', 'svm_5'])
combined_3.plot(kind='bar')
Out[62]:
In [68]:
# RBF SVM again, now with the top 10 features.
estimator = SVC(C=100.0, kernel='rbf')
selector = SelectKBest(f_classif, k=10)
pipeline = Pipeline([
    ('selector', selector),
    ('model', estimator)
])
# random_state pins the shuffled fold assignment for reproducibility.
cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, features_df, letters,
                         cv=cross_validator, scoring='f1_macro')
svm_10 = pd.Series(scores)
In [72]:
# SVM per-fold scores: top-5 vs top-10 features.
combined_4 = pd.concat([svm_5, svm_10], axis=1, keys=['svm_5', 'svm_10'])
combined_4.plot(kind='bar')
Out[72]:
In [70]:
# RBF SVM with the top 15 features.
estimator = SVC(C=100.0, kernel='rbf')
selector = SelectKBest(f_classif, k=15)
pipeline = Pipeline([
    ('selector', selector),
    ('model', estimator)
])
# random_state pins the shuffled fold assignment for reproducibility.
cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, features_df, letters,
                         cv=cross_validator, scoring='f1_macro')
svm_15 = pd.Series(scores)
In [71]:
# SVM per-fold scores: top-10 vs top-15 features.
combined_5 = pd.concat([svm_10, svm_15], axis=1, keys=['svm_10', 'svm_15'])
combined_5.plot(kind='bar')
Out[71]: