In [2]:
# imports
from sklearn import datasets

In [3]:
iris = datasets.load_iris()
digits = datasets.load_digits()

In [4]:
print(digits)


{'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), 'DESCR': "Optical Recognition of Handwritten Digits Data Set\n===================================================\n\nNotes\n-----\nData Set Characteristics:\n    :Number of Instances: 5620\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\nReferences\n----------\n  - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n    Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n    Graduate Studies in Science and Engineering, Bogazici University.\n  - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n  - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n    Linear dimensionalityreduction using relevance weighted LDA. School of\n    Electrical and Electronic Engineering Nanyang Technological University.\n    2005.\n  - Claudio Gentile. A New Approximate Maximal Margin Classification\n    Algorithm. NIPS. 2000.\n", 'target': array([0, 1, 2, ..., 8, 9, 8]), 'data': array([[  0.,   0.,   5., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,  10.,   0.,   0.],
       [  0.,   0.,   0., ...,  16.,   9.,   0.],
       ..., 
       [  0.,   0.,   1., ...,   6.,   0.,   0.],
       [  0.,   0.,   2., ...,  12.,   0.,   0.],
       [  0.,   0.,  10., ...,  12.,   1.,   0.]]), 'images': array([[[  0.,   0.,   5., ...,   1.,   0.,   0.],
        [  0.,   0.,  13., ...,  15.,   5.,   0.],
        [  0.,   3.,  15., ...,  11.,   8.,   0.],
        ..., 
        [  0.,   4.,  11., ...,  12.,   7.,   0.],
        [  0.,   2.,  14., ...,  12.,   0.,   0.],
        [  0.,   0.,   6., ...,   0.,   0.,   0.]],

       [[  0.,   0.,   0., ...,   5.,   0.,   0.],
        [  0.,   0.,   0., ...,   9.,   0.,   0.],
        [  0.,   0.,   3., ...,   6.,   0.,   0.],
        ..., 
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   1., ...,   6.,   0.,   0.],
        [  0.,   0.,   0., ...,  10.,   0.,   0.]],

       [[  0.,   0.,   0., ...,  12.,   0.,   0.],
        [  0.,   0.,   3., ...,  14.,   0.,   0.],
        [  0.,   0.,   8., ...,  16.,   0.,   0.],
        ..., 
        [  0.,   9.,  16., ...,   0.,   0.,   0.],
        [  0.,   3.,  13., ...,  11.,   5.,   0.],
        [  0.,   0.,   0., ...,  16.,   9.,   0.]],

       ..., 
       [[  0.,   0.,   1., ...,   1.,   0.,   0.],
        [  0.,   0.,  13., ...,   2.,   1.,   0.],
        [  0.,   0.,  16., ...,  16.,   5.,   0.],
        ..., 
        [  0.,   0.,  16., ...,  15.,   0.,   0.],
        [  0.,   0.,  15., ...,  16.,   0.,   0.],
        [  0.,   0.,   2., ...,   6.,   0.,   0.]],

       [[  0.,   0.,   2., ...,   0.,   0.,   0.],
        [  0.,   0.,  14., ...,  15.,   1.,   0.],
        [  0.,   4.,  16., ...,  16.,   7.,   0.],
        ..., 
        [  0.,   0.,   0., ...,  16.,   2.,   0.],
        [  0.,   0.,   4., ...,  16.,   2.,   0.],
        [  0.,   0.,   5., ...,  12.,   0.,   0.]],

       [[  0.,   0.,  10., ...,   1.,   0.,   0.],
        [  0.,   2.,  16., ...,   1.,   0.,   0.],
        [  0.,   0.,  15., ...,  15.,   0.,   0.],
        ..., 
        [  0.,   4.,  16., ...,  16.,   6.,   0.],
        [  0.,   8.,  16., ...,  16.,   8.,   0.],
        [  0.,   1.,   8., ...,  12.,   1.,   0.]]])}
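
Printing the whole Bunch is hard to read. A more selective way to get oriented (a small sketch, not an original cell of this notebook; the shapes in the comments are what load_digits typically returns) is to look at the keys and array shapes instead:

# inspect the Bunch selectively instead of dumping it all
print(digits.keys())          # data, target, target_names, images, DESCR
print(digits.data.shape)      # (1797, 64): one flattened 8x8 image per row
print(digits.images.shape)    # (1797, 8, 8): the same pixels kept as 2-D images
print(digits.DESCR[:300])     # the beginning of the dataset description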

In [4]:
print(type(iris))
print(type(digits))


<class 'sklearn.datasets.base.Bunch'>
<class 'sklearn.datasets.base.Bunch'>

In [5]:
print(iris.data[:6])   # the feature data is stored in the data attribute of the dataset


[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]]

In [6]:
print(iris.target)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

In [7]:
iris.data.shape


Out[7]:
(150, 4)

In [8]:
iris.target.shape


Out[8]:
(150,)

In [9]:
# print the feature names
print(iris.feature_names)


['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

In [10]:
print(iris.target_names)


['setosa' 'versicolor' 'virginica']
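
The feature names and target names describe the columns of X and the integer codes in y. A quick sketch (not an original cell) pairing the first sample's measurements with the feature names:

# pair each feature name with its value for the first flower
for name, value in zip(iris.feature_names, iris.data[0]):
    print(name, value)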

In [11]:
print(type(iris.data), type(iris.target))


<class 'numpy.ndarray'> <class 'numpy.ndarray'>

In [12]:
# load the features into X and the response (target) into y
X = iris.data
y = iris.target

print(X.shape)
print(y.shape)


(150, 4)
(150,)

Choosing the KNN classifier algorithm to predict the iris data


In [13]:
from sklearn.neighbors import KNeighborsClassifier

In [14]:
# instantiate the KNN estimator with K = 2
knn = KNeighborsClassifier(n_neighbors=2)

# training the model
knn.fit(X, y)


Out[14]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform')

In [15]:
# predict the class for the sample [5, 4, 3, 2]
# (passing a single sample as a 1-D list triggers the deprecation warning below;
#  newer scikit-learn versions require a 2-D array, as shown in the next cell)
knn.predict([5,4,3,2])


C:\Anaconda3\lib\site-packages\sklearn\utils\validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[15]:
array([1])

In [16]:
knn.predict([[5,4,3,2], [1,2,3,5]])


Out[16]:
array([1, 1])
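
The predictions come back as integer class codes; they can be mapped to species names by indexing iris.target_names. A small sketch (not an original cell):

# translate predicted class codes into species names
pred = knn.predict([[5, 4, 3, 2], [1, 2, 3, 5]])
print(iris.target_names[pred])   # e.g. ['versicolor' 'versicolor'] for the output above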

Using another model - Logistic Regression


In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
lrm = LogisticRegression()

In [19]:
lrm.fit(X, y)


Out[19]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
lrm.predict([[5,4,3,2], [1,2,3,5]])


Out[20]:
array([0, 2])

Testing model accuracy when the model is trained and evaluated on the same (full) dataset


In [21]:
from sklearn import metrics

In [22]:
# test LogisticRegression

# training my model
lrm = LogisticRegression()
lrm.fit(X, y)
y_pred = lrm.predict(X)

# testing accuracy
metrics.accuracy_score(y, y_pred)


Out[22]:
0.95999999999999996

In [23]:
# test KNN when K = 1
# (with K = 1, each training point is its own nearest neighbor, so accuracy on the training data is trivially perfect)

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
y_pred = knn.predict(X)

# testing accuracy
metrics.accuracy_score(y, y_pred)


Out[23]:
1.0

In [24]:
# test KNN when K = 5

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
y_pred = knn.predict(X)

# testing accuracy
metrics.accuracy_score(y, y_pred)


Out[24]:
0.96666666666666667

Testing the model accuracy with a train/test split


In [25]:
# splitting the data into training and testing sets
# (in scikit-learn 0.18+ train_test_split lives in sklearn.model_selection;
#  older versions imported it from sklearn.cross_validation)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
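
As a quick sanity check on the split (a sketch; with test_size=0.4 on 150 samples this should give 90 training rows and 60 testing rows):

print(X_train.shape, y_train.shape)   # expected: (90, 4) (90,)
print(X_test.shape, y_test.shape)     # expected: (60, 4) (60,)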

In [26]:
# test LogisticRegression

# training my model
lrm = LogisticRegression()
lrm.fit(X_train, y_train)
y_pred = lrm.predict(X_test)

# testing accuracy
metrics.accuracy_score(y_test, y_pred)


Out[26]:
0.94999999999999996

In [27]:
# test KNN when K=1

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# testing accuracy
metrics.accuracy_score(y_test, y_pred)


Out[27]:
0.94999999999999996

In [28]:
# test KNN when K=5

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# testing accuracy
metrics.accuracy_score(y_test, y_pred)


Out[28]:
0.96666666666666667

By splitting into training and testing data, we can say KNN performs better with K = 5 than with K = 1 on this split


In [29]:
# loop over a range of K values and record the testing accuracy for each
accuracy = []
K = range(1,26)

for k in K:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    # testing accuracy
    ac = metrics.accuracy_score(y_test, y_pred)
    accuracy.append(ac)
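
To read the best K off the list programmatically instead of from the plot, something like the following works (a sketch; ties go to the smallest K because argmax returns the first maximum):

import numpy as np

best_idx = int(np.argmax(accuracy))   # index of the highest testing accuracy
print('best K:', list(K)[best_idx], 'testing accuracy:', accuracy[best_idx])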

In [30]:
# now plot testing accuracy against K
import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(K, accuracy)


Out[30]:
[<matplotlib.lines.Line2D at 0x901b7b8>]
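
A labelled version of the same plot makes the relationship easier to read (a sketch, not an original cell):

plt.plot(K, accuracy)
plt.xlabel('value of K for KNN')
plt.ylabel('testing accuracy')
plt.show()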

In [31]:
# the plot suggests the model performs best when K is roughly between 6 and 16
# let's retrain KNN with K = 6, this time on the full dataset

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X, y)
y_pred = knn.predict(X)

# checking accuracy (note: this is accuracy on the same data the model was fit on)
metrics.accuracy_score(y, y_pred)


Out[31]:
0.97333333333333338
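
Once the final model is fit on all of the data, it can be used on genuinely new measurements. A sketch with a made-up observation (the values are illustrative, not taken from the dataset):

new_sample = [[6.3, 2.7, 4.9, 1.8]]   # sepal length, sepal width, petal length, petal width (cm)
pred = knn.predict(new_sample)
print(iris.target_names[pred])        # map the predicted class code to a species name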

In [ ]: