In [1]:
# Import the necessary ML Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
In [2]:
# Load the Iris data set that ships with scikit-learn
iris = datasets.load_iris()
# Show which fields the returned dataset object exposes
iris.keys()
Out[2]:
In [3]:
# Show the name of every feature column in the data set
print(list(iris.feature_names))
In [4]:
# Peek at the first 5 samples (rows), all feature columns
iris.data[:5, :]
Out[4]:
In [5]:
# X: feature matrix (all samples, all feature columns)
# y: target vector (the class label of each sample)
X, y = iris.data[:], iris.target[:]
# Number of samples
X.shape[0]
Out[5]:
In [6]:
# Scatter plot of sepal length against sepal width (columns 0 and 1 of X),
# one marker per sample, coloured by its class label in y
sepal_fig = plt.figure(1, figsize=(8, 6))
sepal_fig.clf()  # start from an empty figure 1
sepal_ax = sepal_fig.gca()
sepal_ax.scatter(X[:, 0], X[:, 1], c=y, s=60, cmap=plt.cm.RdYlGn, edgecolor='k')
sepal_ax.set(
    xlabel='Sepal length',
    ylabel='Sepal width',
    title='Sepal length (cm) vs Sepal Width (cm)',
)
plt.show()
In [7]:
# Scatter plot of petal length against petal width (columns 2 and 3 of X),
# one marker per sample, coloured by its class label in y
petal_fig = plt.figure(1, figsize=(8, 6))
petal_ax = petal_fig.gca()
petal_ax.scatter(X[:, 2], X[:, 3], c=y, s=60, cmap=plt.cm.cool, edgecolor='k')
petal_ax.set(
    xlabel='Petal length',
    ylabel='Petal width',
    title='Petal length (cm) vs Petal Width (cm)',
)
plt.show()
Observation:
Petal length and width appear less distorted and separate the three classes more clearly than the sepal length and width plot.
In [8]:
# Split the Iris data into a training set (80%) and a held-out test set (20%).
# random_state pins the shuffle so that the split -- and every score computed
# from it further down -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Sanity check: every sample ended up in exactly one of the two sets
assert len(X_train) + len(X_test) == len(X)
print('The length of Training data set',len(X_train))
print('The length of Test data set',len(X_test))
In [21]:
# Fit a k-nearest-neighbours classifier on the training split
from sklearn.neighbors import KNeighborsClassifier

n_neighbors = 10  # k, the number of neighbours consulted per prediction
# weights='distance' is passed through to the estimator unchanged
knn_clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
knn_clf.fit(X_train, y_train)
Out[21]:
In [22]:
# Score the KNN model on the held-out test split and report it scaled by 100
knn_test_score = knn_clf.score(X_test, y_test)
print('Algorithm Score (KNN): {:.2f}'.format(knn_test_score * 100))
In [25]:
# Decision regions of a KNN model that sees only two features of the Iris
# data set: sepal length and sepal width (columns 0 and 1 of X)
from matplotlib.colors import ListedColormap

cmap_light = ListedColormap(['#e74c3c', '#f1c40f','#bdc3c7'])  # fills the predicted regions
cmap_bold = ListedColormap(['#ecf0f1', '#2c3e50','#2ecc71'])   # colours the sample markers
h = .05  # step size in the mesh

# A separate classifier is fitted on the two sepal columns only, so that it
# can be evaluated on a 2D grid; knn_clf (all features) is left untouched
knn_clf2 = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
knn_clf2.fit(X_train[:, :2], y_train)

# Grid bounds: the observed range of each feature padded by 1 on both sides
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict a class for every grid point, then fold the flat result back into
# the grid's shape so it can be drawn as a colour mesh
Z = knn_clf2.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1, figsize=(10, 8))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Overlay every sample in X (training and test alike), coloured by true class
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Axis labels so the figure can be read on its own
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.title("3-Class classification (k = %i)" % (n_neighbors))
plt.show()
In [26]:
# Predict the species for one hand-entered sample. The four values are given
# in the same column order as X, which is what knn_clf was trained on.
manual_dataentry = knn_clf.predict([[1.5, 1.0, 0.7, 1.0]])

# Class label -> display name; any label other than 0 or 1 is reported as
# Virginica, as the original else-branch did
species_by_label = {0: 'Iris Setosa', 1: 'Iris Versicolour'}
predicted_species = species_by_label.get(int(manual_dataentry[0]), 'Iris Virginica')

# Emit label and species with a single print call. The previous
# `print('Sample Prediction :'),` relied on the Python 2 trailing-comma idiom;
# under Python 3 that comma only builds a discarded tuple and the species
# landed on a separate line.
print('Sample Prediction : ' + predicted_species)
In [27]:
# Print every test sample together with its predicted label and species name
variety_by_label = {0: 'Setosa', 1: 'Versicolour'}  # any other label -> 'Virginica'

# Classify the whole test set in one call instead of once per row
test_predictions = knn_clf.predict(X_test)

len_test_data = len(y_test)
for i in range(0, len_test_data):
    # [[i]] keeps a one-element array so the printed form stays "[label]"
    test_predict = test_predictions[[i]]
    variety = variety_by_label.get(int(test_predict[0]), 'Virginica')
    print(X_test[[i]],test_predict,variety)
In [28]:
# Fit a logistic-regression classifier (library defaults) on the same
# training split that the KNN model used
from sklearn.linear_model import LogisticRegression

# fit() hands back the estimator itself, so construction and training chain
logistic_reg = LogisticRegression().fit(X_train, y_train)
logistic_reg
Out[28]:
In [29]:
# Score the logistic-regression model on the held-out test split and report it scaled by 100
logistic_test_score = logistic_reg.score(X_test, y_test)
print('Algorithm Score (Logistic Regression): {:.2f}'.format(logistic_test_score * 100))
In [ ]: