In [4]:
import pandas as pd
# Location of the wine dataset, kept in one place so it is easy to swap
# for a local copy.
WINE_DATA_URL = 'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv'

# Load through the public pd.read_csv entry point rather than the internal
# pd.io.parsers module path.
df = pd.read_csv(
    filepath_or_buffer=WINE_DATA_URL,
    header=None,  # treat the first line as data; columns get integer labels
    sep=',',
)

# Show the last rows as a quick sanity check of the parsed table.
df.tail()
Out[4]:
In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [39]:
from matplotlib import pyplot as plt
import numpy as np
import math
# Split the raw table: column 0 is used as the class label, every other
# column as a feature.
X = df.values[:, 1:]  # feature vectors
y = df.values[:, 0]   # class labels

fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(12, 10))

# zip() stops after 13 items, so only the first 13 of the 16 axes receive a
# histogram; the remaining axes are switched off further down.
for ax, cnt in zip(axes.ravel(), range(13)):

    # set bin sizes: 25 evenly spaced edges over the feature's range, with the
    # range rounded outwards to whole numbers
    min_b = math.floor(np.min(X[:, cnt]))
    max_b = math.ceil(np.max(X[:, cnt]))
    bins = np.linspace(min_b, max_b, 25)

    # plotting the histograms: one semi-transparent layer per class, all
    # sharing the same bin edges so they are directly comparable
    for lab, col in zip(range(1, 4), ('blue', 'red', 'green')):
        ax.hist(X[y == lab, cnt],
                color=col,
                label='class %s' % lab,  # label by class id, not by colour name
                bins=bins,
                alpha=0.5,)
    ylims = ax.get_ylim()

    # plot annotation (leave a little headroom above the tallest bar)
    ax.set_ylim([0, max(ylims) + 2])
    ax.set_xlabel('feature column %s' % cnt)
    ax.set_title('Wine histogram #%s' % str(cnt + 1))

    # hide axis ticks; real booleans are used because current Matplotlib no
    # longer interprets the legacy "on"/"off" strings (any non-empty string
    # is simply truthy)
    ax.tick_params(axis="both", which="both", bottom=False, top=False,
                   labelbottom=True, left=False, right=False, labelleft=True)

    # remove axis spines
    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)

# set y-axis labels on the first subplot of every row
for row in axes:
    row[0].set_ylabel('count')

# put the legend on the last histogram and hide subplots that are not being used
for i, ax in enumerate(axes.ravel()):
    if i == 12:
        # Keep the legend handle so its frame can be made semi-transparent;
        # previously `leg` was referenced without ever being assigned.
        leg = ax.legend(loc='upper right', fancybox=True, fontsize=8)
        leg.get_frame().set_alpha(0.5)
    if i > 12:
        ax.axis('off')

fig.tight_layout()
plt.show()
In [40]:
# train_test_split lives in sklearn.model_selection (scikit-learn >= 0.18);
# the old sklearn.cross_validation module has been removed.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.values[:, 1:],  # features: every column except the first
    df.values[:, 0],   # labels: first column
    test_size=0.30,
    random_state=123,
)
Accuracy is calculated as the number of correct predictions divided by the total number of predictions.
In [66]:
# Slicing with a range (instead of a bare index) keeps the column axis,
# so the result stays two-dimensional.
X_train[:, :1].shape
Out[66]:
In [63]:
# Plain integer indexing drops the column axis, so the column has to be
# reshaped back to (n_samples, 1) before it can be joined side by side.
n_rows = X_train.shape[0]
first_col = X_train[:, 0].reshape(n_rows, 1)
np.concatenate((first_col, first_col), axis=1).shape
Out[63]:
In [82]:
class ColumnExtractor(object):
    """Transformer that keeps only a chosen subset of feature columns.

    It exposes ``fit`` and ``transform`` so it can be used as a step of a
    scikit-learn ``Pipeline``.

    Parameters
    ----------
    cols : iterable of int
        Column indices to keep, in output order. Negative indices count
        from the last column.
    """

    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        """Return the selected columns of ``X`` as a 2-D array.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples, len(cols))

        Raises
        ------
        ValueError
            If ``cols`` is empty.
        IndexError
            If an index in ``cols`` is out of range for ``X``.
        """
        indices = list(self.cols)
        if not indices:
            raise ValueError("ColumnExtractor needs at least one column index")
        # Index with the list of columns instead of building c:c+1 slices:
        # a slice silently yields an empty column for c == -1 (it becomes
        # -1:0) and for out-of-range c, whereas list indexing handles
        # negative indices correctly and raises on invalid ones.
        return np.asarray(X)[:, indices]

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so calls can be chained."""
        return self
#ColumnExtractor(cols=(1,6)).transform(X_train)
In [79]:
# scikit-learn >= 0.18 layout: the cross-validation helpers live in
# sklearn.model_selection and LDA lives in sklearn.discriminant_analysis.
# The former sklearn.cross_validation and sklearn.lda modules were removed.
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Keep the short `LDA` alias so other cells that call LDA(...) keep working.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA


def _gnb_pipeline(reduce_dim):
    """Build a scale -> reduce_dim -> Gaussian naive Bayes pipeline.

    The three models compared below differ only in the `reduce_dim` step.
    """
    return Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('reduce_dim', reduce_dim),
        ('classification', GaussianNB())
    ])


# NOTE(review): despite its name, clf_all keeps only feature columns 0 and 5
# (see the ColumnExtractor arguments) -- confirm this is the intended baseline.
clf_all = _gnb_pipeline(ColumnExtractor(cols=(0,5)))
clf_pca = _gnb_pipeline(PCA(n_components=2))
clf_lda = _gnb_pipeline(LDA(n_components=2))

# Constructing the k-fold cross validation iterator (k=10).
# The current KFold takes only the number of splits; the sample count is
# taken from the data passed to cross_val_score.
cv = KFold(n_splits=10,      # number of folds the dataset is divided into
           shuffle=True,
           random_state=123)

# One array of per-fold accuracies per pipeline, in the order all / PCA / LDA.
scores = [
    cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
    for clf in [clf_all, clf_pca, clf_lda]
]
In [80]:
# Per-fold accuracies of the first pipeline, followed by their mean and
# standard deviation.
first_scores = scores[0]
print('Scores (all samples):', first_scores)
print("Accuracy: {:.2%} (+/- {:.2%})".format(first_scores.mean(), first_scores.std()))
In [81]:
# Labels are listed in the same order as the entries of `scores`.
score_labels = (
    'all samples',
    'PCA dim. red. (n=2)',
    'LDA dim. red. (n=2)',
)
summary_line = "Accuracy: {:.2%} (+/- {:.2%}), {:}"

# One line per pipeline: mean accuracy, standard deviation, description.
for fold_scores, name in zip(scores, score_labels):
    print(summary_line.format(fold_scores.mean(), fold_scores.std(), name))
In [18]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
# Note: X_train and X_test are overwritten with their scaled versions.
std_scale = StandardScaler()
std_scale.fit(X_train)
X_train, X_test = std_scale.transform(X_train), std_scale.transform(X_test)
In [19]:
# Fit a 2-component LDA on the training split (it uses the class labels),
# then project both splits with the fitted model.
# Note: X_train and X_test are overwritten with their projections.
sklearn_lda = LDA(n_components=2)
sklearn_lda.fit(X_train, y_train)
X_train, X_test = sklearn_lda.transform(X_train), sklearn_lda.transform(X_test)
In [20]:
# Train Gaussian naive Bayes on the training split and predict the
# held-out test split.
gnb_clf = GaussianNB().fit(X_train, y_train)
pred_test = gnb_clf.predict(X_test)
In [21]:
from sklearn import metrics

# Predict once and reuse the result for both metrics below, instead of
# calling predict() again for the confusion matrix.
pred_test = gnb_clf.predict(X_test)

print('Prediction accuracy for the test dataset')
print('{:.2%}'.format(metrics.accuracy_score(y_test, pred_test)))

print('Confusion Matrix of the GNB-classifier')
print(metrics.confusion_matrix(y_test, pred_test))