In [161]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import itertools
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
# sklearn.cross_validation, sklearn.learning_curve and sklearn.grid_search were
# removed in scikit-learn 0.20; everything now lives in sklearn.model_selection
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
%matplotlib inline
In [2]:
wine = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header = None)
wine.head()
Out[2]:
In [3]:
names = """Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,
Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,
Color intensity,Hue,OD280/OD315 of diluted wines,Proline""".replace("\n", "").split(",")
In [4]:
wine.columns = names
wine.head()
Out[4]:
In [5]:
wine.info()
In [6]:
X, y = wine.iloc[:, 1:].values, wine.iloc[:, 0].values
In [145]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
print("Train X dimension: %s, Test X dimension: %s" % (X_train.shape, X_test.shape))
In [16]:
le = LabelEncoder()
y = le.fit_transform(y)  # the wine classes are already the integers 1-3; this remaps them to 0-2
# note: y_train and y_test were split from y before this cell ran, so they keep the 1-3 labels
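To see what the encoder did, the fitted classes_ attribute holds the original labels in mapping order; a quick sanity check:
In [ ]:
# Inspect the fitted encoder: original labels and the 0-based codes they map to
print(le.classes_)                # original labels: [1 2 3]
print(le.transform(le.classes_))  # encoded values:  [0 1 2]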
In [93]:
pipeline = Pipeline(
[("scl", StandardScaler()),
("pca", PCA(n_components = 2)),
("clf", LogisticRegression(random_state = 0, penalty = "l2"))])
In [94]:
pipeline.fit(X_train, y_train)
Out[94]:
In [95]:
pipeline.score(X_test, y_test)
Out[95]:
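For intuition, the pipeline is equivalent to applying each step by hand: fit the scaler and PCA on the training data only, transform both sets with the fitted objects, then fit the classifier. A sketch of that unrolled version, which should reproduce the score above:
In [ ]:
# Unrolled equivalent of the pipeline: transformers are fit on the training data only
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)      # reuse the training-set mean and variance
pca = PCA(n_components = 2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
lr = LogisticRegression(random_state = 0, penalty = "l2")
lr.fit(X_train_pca, y_train)
lr.score(X_test_pca, y_test)           # same number as pipeline.score(X_test, y_test)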
In [96]:
kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 0)
# the old StratifiedKFold(y = ..., n_folds = ...) signature was removed along with
# sklearn.cross_validation; the splitter now takes the data in its split() method
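The kfold object is not actually passed to cross_val_score below (cv = 10 builds its own stratified splitter for a classifier), but it can drive an explicit fold-by-fold loop; a minimal sketch:
In [ ]:
# Manual stratified cross-validation with the splitter defined above
fold_scores = []
for k, (train_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
    pipeline.fit(X_train[train_idx], y_train[train_idx])
    score = pipeline.score(X_train[val_idx], y_train[val_idx])
    fold_scores.append(score)
    # labels are 1-3, so drop the empty 0 bin from the class distribution
    print("Fold %2d, class dist.: %s, acc: %.3f" % (k + 1, np.bincount(y_train[train_idx])[1:], score))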
In [97]:
scores = cross_val_score(estimator = pipeline, X = X_train, y = y_train, cv = 10, n_jobs = -1)
# n_jobs = -1: use all CPU cores for parallel computing
print("CV accuracy scores: ", scores)
In [98]:
print("Mean CV accuracy: %.3f, std: %.3f" % (np.mean(scores), np.std(scores)))
In [99]:
plt.plot(scores)
plt.ylim(0.9, 1.01)
plt.xlabel("Iteration")
plt.ylabel("Accuracy Score")
Out[99]:
In [100]:
train_sizes, train_scores, test_scores = learning_curve(estimator = pipeline, X = X_train, y = y_train,
train_sizes = np.linspace(0.1, 1.0, 10),
cv = 10,
n_jobs = -1)
# with an integer cv and a classifier, learning_curve uses StratifiedKFold internally
In [101]:
train_mean = np.mean(train_scores, axis = 1)
test_mean = np.mean(test_scores, axis = 1)
train_std = np.std(train_scores, axis = 1)
test_std = np.std(test_scores, axis = 1)
In [105]:
plt.figure(figsize = (15, 10))
plt.plot(train_sizes, train_mean, color = "b", marker = "o", markersize = 5, label = "training accuracy scores")
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha = 0.4)
plt.plot(train_sizes, test_mean, color = "g", ls = "--", marker = "s", markersize = 5, label = "validation accuracy")
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha = 0.15, color = "g")
plt.legend(loc = "lower right")
plt.xlabel("Number of training samples")
plt.ylabel("Accuracy scores")
Out[105]:
In [130]:
np.set_printoptions(precision = 4, suppress = True)
param_range = 10.0 ** np.arange(-3, 4)
print("param_range", param_range)
In [127]:
train_scores, test_scores = validation_curve(estimator = pipeline,
X = X_train,
y = y_train,
param_name = "clf__C",
param_range = param_range,
cv = 10)
# like learning_curve, validation_curve uses StratifiedKFold internally for classifiers
In [126]:
train_mean = np.mean(train_scores, axis = 1)
test_mean = np.mean(test_scores, axis = 1)
train_std = np.std(train_scores, axis = 1)
test_std = np.std(test_scores, axis = 1)
plt.figure(figsize = (15, 5))
plt.plot(param_range, train_mean, color = "b", marker = "o", markersize = 5, label = "training accuracy scores")
plt.fill_between(param_range, train_mean + train_std, train_mean - train_std, alpha = 0.4)
plt.plot(param_range, test_mean, color = "g", ls = "--", marker = "s", markersize = 5, label = "validation accuracy")
plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, alpha = 0.15, color = "g")
plt.legend(loc = "lower right")
plt.xlabel("Complexity Parameter - C")
plt.ylabel("Accuracy scores")
plt.xscale("log")
plt.ylim(0.8, 1.02)
Out[126]:
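The curve peaks where the mean validation accuracy is highest; reading that value off programmatically, using test_mean and param_range from the cells above:
In [ ]:
# Pick the C with the highest mean validation accuracy on the curve above
best_C = param_range[np.argmax(test_mean)]
print("Best C from validation curve:", best_C)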
In [168]:
pipe_svc = Pipeline([
("scl", StandardScaler()),
("clf", SVC(random_state = 0))
])
In [169]:
param_range = 10.0 ** np.arange(-4, 4)
param_range
Out[169]:
In [170]:
param_grid = [
{"clf__C": param_range, "clf__kernel": ["linear"]},
{"clf__C": param_range, "clf__gamma": param_range, "clf__kernel": ["rbf"]}
]
In [171]:
gs = GridSearchCV(estimator = pipe_svc,
param_grid = param_grid,
scoring = "accuracy",
cv = 10,
n_jobs = -1)
In [172]:
gs.fit(X_train, y_train)
Out[172]:
In [173]:
print("Best params: %s Best score: %.4f" % (gs.best_params_, gs.best_score_))
In [174]:
# Applying Grid Search on Decision Tree
tree = DecisionTreeClassifier(random_state = 0)
param_grid = [
{"max_depth": [1, 2, 3, 4, 5, 6, 7, None]}
]
gs_tree = GridSearchCV(estimator = tree, param_grid = param_grid, scoring = "accuracy", cv = 5)
gs_tree.fit(X_train, y_train)
print("Best param: %s, best score: %s" % (gs_tree.best_params_, gs_tree.best_score_))