In [1]:
# import
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
%matplotlib inline
In [2]:
# the iris dataset ships with sklearn
# now, loading the data
iris = load_iris()
type(iris)  # iris is a sklearn Bunch object, which behaves like a dictionary
Out[2]:
In [3]:
iris.keys()  # it holds the data, target, feature names, description, etc.
Out[3]:
In [4]:
# let's see them one by one
# the dataset description
print(iris['DESCR'])
In [5]:
print(iris['feature_names'])
print(iris['target_names'])
In [6]:
# now, extracting the data into variables
X = iris['data']
y = iris['target']
print(len(X), len(y))
In [7]:
for t in range(3):
    if t == 0:
        c = 'r'
        marker = '>'
    elif t == 1:
        c = 'g'
        marker = 'o'
    elif t == 2:
        c = 'b'
        marker = 'x'
    #plt.figure(figsize=(12,9))
    plt.scatter(X[y == t, 0], X[y == t, 1], marker=marker, color=c)
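The scatter plot above uses only the first two columns (sepal length vs. sepal width). As an optional tweak, the same cell could end with two extra lines that label the axes using the names from iris['feature_names']:
plt.xlabel(iris['feature_names'][0])
plt.ylabel(iris['feature_names'][1])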
Let's build a model that can identify the type of iris flower using simple mathematics
In [9]:
# creating boolean filters, one mask per class
setosa = y==0
versi = y==1
virgi = y==2
#setosa
Out[9]:
In [11]:
X_setosa, y_setosa = X[setosa], y[setosa]
X_versi, y_versi = X[versi], y[versi]
X_virgi, y_virgi = X[virgi], y[virgi]
In [14]:
# checking the petal length (column 2) range for setosa
print(X_setosa[:,2].max())
print(X_setosa[:,2].min())
In [16]:
# now the same for all samples that are not setosa
print(X[~setosa,2].max())
print(X[~setosa,2].min())
In [17]:
# and the same for versicolor and virginica individually
print(X_versi[:,2].max())
print(X_versi[:,2].min())
print(X_virgi[:,2].max())
print(X_virgi[:,2].min())
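Based on the printed ranges, setosa's petal length does not overlap with the other two species, so a single cutoff separates it. A minimal sketch (the cutoff derived as the midpoint between setosa's maximum and the others' minimum) to verify that rule:
In [ ]:
# simple threshold rule: petal length below the cutoff -> setosa
cutoff = (X_setosa[:, 2].max() + X[~setosa, 2].min()) / 2
is_setosa_pred = X[:, 2] < cutoff
print(cutoff)
print((is_setosa_pred == setosa).mean())  # fraction of samples this rule classifies correctly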
But this manual approach only works for small datasets with few features. We need more advanced classification techniques to handle large, complex data
In [62]:
# import
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn import metrics
KNN classifier with 1 neighbor
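With k=1 the classifier simply returns the label of the single closest training point (Euclidean distance by default). A tiny hand-rolled sketch of that rule on made-up toy arrays:
In [ ]:
# toy example of the 1-nearest-neighbour rule (arrays are made up for illustration)
pts = np.array([[0., 0.], [1., 1.], [5., 5.]])
labels = np.array([0, 0, 1])
query = np.array([0.8, 0.9])
nearest = np.argmin(((pts - query) ** 2).sum(axis=1))  # index of the closest training point
print(labels[nearest])  # -> 0, the label of the closest point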
In [35]:
# reading SEED dataset
header = ["area", "perimeter", "compactness", "kernel_length", "kernel_width", "asymmetry coefficient",
"length of kernel groove", "variety"]
seed = sp.genfromtxt("data/seeds_dataset.txt", delimiter="|")
seed
Out[35]:
In [50]:
sX = seed[:,0:7]
sy = seed[:,7]
print(sX[:2], sy[:2])
In [51]:
# instantiate the Classifier with neighbors = 1
knn1 = KNeighborsClassifier(n_neighbors=1)
# fitting the data
knn1.fit(sX, sy)
# predicting the type
knn1.predict([[15.26,14.84,0.871,5.763,3.312,2.221,5.22]])
Out[51]:
As we have used all the data to train the model, the training accuracy will be close to 100%, but that is not what we want. Before proceeding further, let's check that accuracy
In [60]:
sy_pred = knn1.predict(sX)
metrics.accuracy_score(sy, sy_pred)  # with k=1, every training point is its own nearest neighbour, so this is ~1.0
Out[60]:
KNN with k=1, splitting the data into train and test sets
In [56]:
sX_train, sX_test, sy_train, sy_test = train_test_split(sX, sy, test_size=0.4, random_state=4)
In [61]:
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(sX_train, sy_train)
sy_test_pred = knn1.predict(sX_test)
metrics.accuracy_score(sy_test, sy_test_pred) # 92% accuracy
Out[61]:
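A single split's score depends on which rows happen to land in the test set. A quick sketch that refits on a few different splits (random_state values chosen arbitrarily) shows the variation and motivates cross-validation:
In [ ]:
for rs in (0, 1, 2, 3):
    tr_X, te_X, tr_y, te_y = train_test_split(sX, sy, test_size=0.4, random_state=rs)
    knn_tmp = KNeighborsClassifier(n_neighbors=1).fit(tr_X, tr_y)
    print(rs, metrics.accuracy_score(te_y, knn_tmp.predict(te_X)))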
Repeating the same with K-fold cross-validation
In [64]:
kf = KFold(n_splits=5, shuffle=True, random_state=7)
kf
Out[64]:
In [71]:
scores = []
for train, test in kf.split(sX):
    knn1.fit(sX[train], sy[train])
    sy_pred = knn1.predict(sX[test])
    scores.append(metrics.accuracy_score(sy[test], sy_pred))
print(scores)
print(np.mean(scores))
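The same per-fold scores and their mean can also be obtained in one call with cross_val_score (imported above), passing the KFold object as cv:
In [ ]:
scores = cross_val_score(knn1, sX, sy, cv=kf)
print(scores, scores.mean())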
In [ ]: