In [1]:
# imports
from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import preprocessing, pipeline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
In [2]:
# Loading the Iris dataset
iris_data = load_iris()
X = iris_data['data']
y = iris_data['target']
print(iris_data['feature_names'])
print(iris_data['target_names'])
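Before splitting, a quick look at the data (an added sanity-check cell, not part of the original notebook) confirms we have 150 samples with 4 features each and a balanced 50/50/50 class split:
In [ ]:
# sanity check: data shape and class balance
print("X shape:", X.shape)
print("samples per class:", np.bincount(y))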
In [3]:
# Splitting and preprocessing the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print(X_train[:2])
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)
# Standardize the features (zero mean, unit variance), fitting the scaler on the training set only
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print(X_train[:2])
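As a quick check (an added cell, not in the original), the standardized training features should now have mean approximately 0 and standard deviation approximately 1 in every column:
In [ ]:
# each standardized training column should have mean ~0 and std ~1
print("means:", X_train.mean(axis=0).round(6))
print("stds: ", X_train.std(axis=0).round(6))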
SGDClassifier
SGD stands for Stochastic Gradient Descent, a popular numerical procedure for
finding a local minimum of a function (here, the loss function, which measures
how far each instance is from the decision boundary). The algorithm learns the
coefficients of the separating hyperplane by minimizing this loss, updating the
coefficients one training example at a time.
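To make this concrete, here is a minimal sketch of a single SGD update for a binary linear classifier with hinge loss (SGDClassifier's default loss), omitting the regularization term that SGDClassifier also applies. The function sgd_step, the learning rate eta, and the {-1, +1} label encoding are illustrative assumptions, not scikit-learn internals:
In [ ]:
# illustrative sketch only: one hinge-loss SGD update, no regularization
def sgd_step(w, b, x_i, y_i, eta=0.01):
    # hinge loss max(0, 1 - y*(w.x + b)); take a subgradient step when the margin is violated
    if y_i * (np.dot(w, x_i) + b) < 1:
        w = w + eta * y_i * x_i
        b = b + eta * y_i
    return w, b

# one pass over the training data on a toy binary problem: setosa vs. the rest
w, b = np.zeros(X_train.shape[1]), 0.0
for x_i, y_i in zip(X_train, np.where(y_train == 0, 1, -1)):
    w, b = sgd_step(w, b, x_i, y_i)
print("learned w:", w.round(3), " b:", round(b, 3))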
In [4]:
# instantiate (fixing random_state so the results are reproducible)
sgd = SGDClassifier(random_state=33)
# fitting
sgd.fit(X_train, y_train)
# coefficients (one row per class: SGDClassifier fits one one-vs-all hyperplane per class)
print("coefficients: ", sgd.coef_)
# intercept
print("intercept: ", sgd.intercept_)
# predicting for one
y_pred = sgd.predict(scaler.transform([[4.9,3.1,1.5,0.1]]))
print(y_pred)
# predicting for X_test
y_pred = sgd.predict(X_test)
# checking accuracy score
print("Model Accuracy on Train data: ", accuracy_score(y_train, sgd.predict(X_train)))
print("Model Accuracy on Test data: ", accuracy_score(y_test, y_pred))
In [5]:
# let's plot the data
plt.figure(figsize=(8,6))
plt.scatter(X_train[:,0][y_train==0], X_train[:,1][y_train==0], color='red', label='setosa')
plt.scatter(X_train[:,0][y_train==1], X_train[:,1][y_train==1], color='blue', label='versicolor')
plt.scatter(X_train[:,0][y_train==2], X_train[:,1][y_train==2], color='green', label='virginica')
plt.xlabel('sepal length (standardized)')
plt.ylabel('sepal width (standardized)')
plt.legend(loc='best')
Out[5]:
Classification Report
Accuracy = (TP + TN) / m   (m = total number of samples)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
F1-score = 2 * Precision * Recall / (Precision + Recall)
For the multiclass report below, precision, recall, and F1-score are computed per class, treating each class in turn as the positive one.
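To connect these formulas to scikit-learn's output, here is a small hand computation (an illustrative addition) of per-class precision and recall from the confusion matrix: for class k, TP is the diagonal entry, the column sum is TP + FP, and the row sum is TP + FN:
In [ ]:
# hand-computed per-class precision and recall from the confusion matrix
cm = confusion_matrix(y_pred=y_pred, y_true=y_test)
tp = np.diag(cm)
precision = tp / cm.sum(axis=0)   # column sums = TP + FP
recall = tp / cm.sum(axis=1)      # row sums = TP + FN
print("precision per class:", precision.round(2))
print("recall per class:   ", recall.round(2))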
In [6]:
# predicting
print(classification_report(y_pred=y_pred, y_true=y_test))
In [7]:
confusion_matrix(y_pred=y_pred, y_true=y_test)
Out[7]:
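Since seaborn is already imported, a heatmap (an optional addition, not in the original notebook) makes the confusion matrix easier to read; rows are true classes, columns are predicted classes:
In [ ]:
# visualize the confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_pred=y_pred, y_true=y_test)
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=iris_data['target_names'],
            yticklabels=iris_data['target_names'])
plt.xlabel('predicted')
plt.ylabel('true')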
Using a pipeline mechanism to build and test our model
In [8]:
# create a composite estimator: a pipeline of the standardization and the linear model
clf = pipeline.Pipeline([
('scaler', preprocessing.StandardScaler()),
('linear_model', SGDClassifier())
])
# create a k-fold cross-validation iterator with k=5 folds
cv = KFold(n_splits=5, shuffle=True, random_state=33)
# by default the score used is the one returned by the estimator's score method (accuracy)
scores = cross_val_score(clf, X, y, cv=cv)
print(scores)
In [9]:
# mean accuracy across folds, plus its standard error
print(np.mean(scores), sp.stats.sem(scores))
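The standard error can be turned into a rough confidence interval. The 1.96 normal quantile below is an assumption for illustration (a t-quantile would be slightly wider with only 5 folds):
In [ ]:
# approximate 95% confidence interval for the mean cross-validated accuracy
mean, sem = np.mean(scores), sp.stats.sem(scores)
print("mean accuracy: {:.3f} (95% CI: {:.3f} - {:.3f})".format(mean, mean - 1.96*sem, mean + 1.96*sem))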