In [1]:
# importing all required modules
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
Scikit-learn contains a collection of small, pre-loaded datasets that can be accessed in the following way:
In [2]:
# importing datasets
from sklearn import datasets
iris = datasets.load_iris()
However, the returned object is not a typical pandas DataFrame or NumPy array; it is a scikit-learn Bunch, a dictionary-like container.
In [3]:
type(iris)
Out[3]:
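As a side note, the Bunch object behaves like a dictionary with attribute access, so iris['data'] and iris.data refer to the same array (a minimal sketch to illustrate this):
In [ ]:
# dictionary-style and attribute-style access return the same array
iris['data'] is iris.data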
The content of the data set can be accessed in the following way:
In [4]:
iris.keys()
Out[4]:
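Among these keys, DESCR holds a plain-text description of the dataset; a quick way to skim it is to print its first few hundred characters (a minimal sketch):
In [ ]:
# printing the beginning of the built-in dataset description
print(iris.DESCR[:300])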
In [5]:
# displaying the first ten rows of the dataset
iris.data[:10]
Out[5]:
In [6]:
# assigning the data and target to the X and y variables that will be used for machine learning
X = iris.data
y = iris.target
y
Out[6]:
0 = iris-setosa
1 = iris-versicolor
2 = iris-virginica
In [7]:
iris.target_names
Out[7]:
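Because the targets are just the integers 0 to 2, the species name of each sample can be recovered by indexing target_names with y (a minimal sketch; the variable name species is illustrative):
In [ ]:
# map each numeric label back to its species name by fancy indexing
species = iris.target_names[y]
species[:5]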
In order to facilitate the display of the data, a DataFrame can be created:
In [8]:
df = pd.DataFrame(X, columns=iris.feature_names)
df.head()
Out[8]:
In [9]:
df.info()
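For exploration it can also help to attach the labels as a readable column; the sketch below works on a copy so the purely numeric frame used later stays untouched (df_labeled is an illustrative name):
In [ ]:
# attach human-readable species labels to a copy of the frame
df_labeled = df.copy()
df_labeled['species'] = iris.target_names[y]
df_labeled.head()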
In [10]:
# Import necessary modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
In [11]:
# splitting the dataset into training and test data (using 40% for the test set because of the small size of the dataset)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
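Because stratify=y was passed, both splits keep the same class proportions; this can be verified with a quick count (a minimal sketch; with 150 samples and a 40% test split, 30 training and 20 test samples per class are expected):
In [ ]:
# with stratify=y each class appears in equal proportion in both splits
print(np.bincount(y_train), np.bincount(y_test))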
In [12]:
# Creating the knn classifier with 6 neighbors
knn = KNeighborsClassifier(n_neighbors=6)
In [13]:
# fitting the data
knn.fit(X_train, y_train)
Out[13]:
In [14]:
# predicting the outcomes
y_pred = knn.predict(X_test)
In [15]:
y_pred
Out[15]:
In [16]:
# model accuracy
knn.score(X_test, y_test)
Out[16]:
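The score method reports plain accuracy, and the same number can be computed directly from the predictions (a minimal sketch):
In [ ]:
# accuracy is the fraction of correctly predicted labels
np.mean(y_pred == y_test)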
In [17]:
# Setup arrays to store train and test accuracies
neighbors = np.arange(1, 15)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
# Loop over different values of k
for i, k in enumerate(neighbors):
    # Setup a k-NN classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit the classifier to the training data
    knn.fit(X_train, y_train)
    # Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)
    # Compute accuracy on the test set
    test_accuracy[i] = knn.score(X_test, y_test)
# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy');
It can be concluded that the best test accuracies are obtained with 3 neighbors or with 7 to 10 neighbors.
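Rather than reading the plot, the best-scoring k can also be extracted from the test_accuracy array directly (a minimal sketch; in case of ties argmax returns the smallest such k):
In [ ]:
# map the index of the highest test accuracy back to a k value
best_k = neighbors[np.argmax(test_accuracy)]
best_k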
In [19]:
# using sklearn to obtain further validation metrics for the model
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
Out[19]:
The matrix above shows that iris-setosa (0) and iris-versicolor (1) are classified well, but in the case of iris-virginica (2) only 16 samples are classified correctly and 4 are misclassified as versicolor.
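To make the matrix easier to read it can be wrapped in a DataFrame labelled with the species names (a minimal sketch; rows are the true classes and columns the predicted ones):
In [ ]:
# rows = true species, columns = predicted species
pd.DataFrame(confusion_matrix(y_test, y_pred),
             index=iris.target_names, columns=iris.target_names)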
In [21]:
print(classification_report(y_test, y_pred))
precision = TP / (TP + FP), recall = TP / (TP + FN), f1-score = 2 · precision · recall / (precision + recall)
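These per-class figures can be reproduced from the confusion matrix itself, since its diagonal holds the true positives while column and row sums give TP + FP and TP + FN respectively (a minimal sketch):
In [ ]:
# per-class precision, recall and f1 derived from the confusion matrix
cm = confusion_matrix(y_test, y_pred)
tp = np.diag(cm)                 # true positives per class
precision = tp / cm.sum(axis=0)  # TP / (TP + FP), column-wise
recall = tp / cm.sum(axis=1)     # TP / (TP + FN), row-wise
f1 = 2 * precision * recall / (precision + recall)
print(precision, recall, f1)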
In [24]:
from sklearn.model_selection import GridSearchCV
# parameter grid (in our case just one parameter, n_neighbors)
param_grid = {'n_neighbors': np.arange(1, 50)}
knn2 = KNeighborsClassifier()
knn_cv = GridSearchCV(knn2, param_grid, cv=5)
knn_cv.fit(X, y)
Out[24]:
In [25]:
knn_cv.best_params_
Out[25]:
In [26]:
knn_cv.best_score_
Out[26]:
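Note that the grid search above was fitted on the full X and y, so best_score_ is a cross-validated estimate over all the data rather than a held-out test score. To keep the test set genuinely unseen, the search can be fitted on the training split only (a minimal sketch; knn_cv2 is an illustrative name):
In [ ]:
# fit the grid search on the training split only and evaluate
# the tuned model on the untouched test set
knn_cv2 = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
knn_cv2.fit(X_train, y_train)
print(knn_cv2.best_params_, knn_cv2.score(X_test, y_test))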
In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# setting up the pipeline steps
steps = [('scaler', StandardScaler()), ('knn3', KNeighborsClassifier())]
pipeline = Pipeline(steps)
# the parameter grid must be redefined: inside a pipeline, parameter
# names are prefixed with the step name ('knn3__')
parameters = {'knn3__n_neighbors': np.arange(1, 50)}
# using Grid search to build the model
cv = GridSearchCV(pipeline, param_grid=parameters, cv=5)
# Fit to the training set
cv.fit(X_train, y_train)
# Predict the labels of the test set: y_pred_cv
y_pred_cv = cv.predict(X_test)
# Compute and print metrics
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred_cv))
print("Tuned Model Parameters: {}".format(cv.best_params_))
In [18]:
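# heatmap of the pairwise correlations between the four iris features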
sns.heatmap(df.corr(), square=True, cmap='RdYlGn');
In [28]:
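# quick look at the values used in the parameter grid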
np.arange(1,50)
Out[28]: