Let's go through the intro machine learning tutorial: http://scikit-learn.org/stable/tutorial/basic/tutorial.html

scikit-learn should already be installed; if not, install it with: pip install -U scikit-learn

Or follow the instructions at: http://scikit-learn.org/stable/install.html


In [1]:
from sklearn import datasets
#load the toy datasets that come with sklearn
iris = datasets.load_iris()
digits = datasets.load_digits()

In [2]:
print(digits.data)


[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]
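
As a quick sanity check on the data layout: digits.data is the flattened (n_samples, 64) feature matrix that estimators consume, while digits.images keeps the original 8x8 pixel layout. A minimal sketch (not part of the original tutorial):

In [ ]:
# sanity check: shapes of the bundled digits arrays
print(digits.data.shape)    # (n_samples, 64) -- each 8x8 image flattened
print(digits.target.shape)  # (n_samples,)    -- one digit label per image
print(digits.images.shape)  # (n_samples, 8, 8)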

In [3]:
digits.target


Out[3]:
array([0, 1, 2, ..., 8, 9, 8])

In [4]:
digits.images[0]


Out[4]:
array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])
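
Each row of digits.data is just the corresponding 8x8 image flattened to 64 values, so the array above should match digits.data[0] once reshaped. A quick check (this evaluates to True):

In [ ]:
import numpy as np
# each image is the matching row of digits.data reshaped to 8x8
np.array_equal(digits.images[0].reshape(-1), digits.data[0])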

In [5]:
#Predict which digit is being represented by an image
from sklearn import svm
clf = svm.SVC(gamma=0.001, C=100.)
#gamma was set manually here; good values can be found automatically with tools such as grid search and cross-validation
clf.fit(digits.data[:-1], digits.target[:-1])


Out[5]:
SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)
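
As noted in the comment above, gamma (and C) do not have to be guessed by hand. A minimal sketch of tuning them automatically with a grid search, assuming a recent scikit-learn where GridSearchCV lives in sklearn.model_selection (older releases had it in sklearn.grid_search):

In [ ]:
# sketch: search a small grid of gamma and C values with cross-validation
from sklearn.model_selection import GridSearchCV

param_grid = {'gamma': [1e-4, 1e-3, 1e-2], 'C': [1., 10., 100.]}
search = GridSearchCV(svm.SVC(), param_grid, cv=5)
search.fit(digits.data[:-1], digits.target[:-1])
print(search.best_params_)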

In [9]:
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
#Load the digits dataset
digits = datasets.load_digits()

#Display the last digit in the dataset (the one held out of training)
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()

# predict which digit this is
clf.predict(digits.data[-1:])


Out[9]:
array([8])

Do you agree that this is an image of an 8? Did something go wrong? What could be done better?
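
One concrete improvement is to hold out more than a single image and measure accuracy on a proper test set. A sketch, assuming a recent scikit-learn where train_test_split lives in sklearn.model_selection (older releases had it in sklearn.cross_validation):

In [ ]:
# sketch: hold out 25% of the digits and report accuracy on them
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=0)

clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # mean accuracy on the held-out images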

Model Persistence using pickle


In [11]:
from sklearn import svm
from sklearn import datasets
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)


Out[11]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
import pickle
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
clf2.predict(X[0:1])


Out[13]:
array([0])
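
pickle.dumps and pickle.loads keep the serialized model as an in-memory bytes object. To write it to disk with plain pickle, a sketch like the following works (the filename is only illustrative):

In [ ]:
# sketch: persist the fitted model to disk with the standard pickle module
with open('iris_svc.pkl', 'wb') as f:
    pickle.dump(clf, f)

with open('iris_svc.pkl', 'rb') as f:
    clf_from_disk = pickle.load(f)

print(clf_from_disk.predict(X[0:1]))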

In [18]:
from sklearn.externals import joblib
joblib.dump(clf, 'filename.pkl') 
#joblib.dump returns the list of files it wrote to disk


Out[18]:
['filename.pkl',
 'filename.pkl_01.npy',
 'filename.pkl_02.npy',
 'filename.pkl_03.npy',
 'filename.pkl_04.npy',
 'filename.pkl_05.npy',
 'filename.pkl_06.npy',
 'filename.pkl_07.npy',
 'filename.pkl_08.npy',
 'filename.pkl_09.npy',
 'filename.pkl_10.npy',
 'filename.pkl_11.npy']

In [20]:
clf = joblib.load('filename.pkl')
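
Note that sklearn.externals.joblib has since been removed from scikit-learn; in recent releases the standalone joblib package is used directly instead. An equivalent sketch, assuming joblib is installed:

In [ ]:
# sketch: the same persistence with the standalone joblib package
import joblib

joblib.dump(clf, 'filename.pkl')
clf = joblib.load('filename.pkl')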

Conventions


In [23]:
import numpy as np
from sklearn import random_projection



In [27]:
# unless otherwise specified, scikit-learn casts input to float64
rng = np.random.RandomState(0)
X = rng.rand(10, 2000)
X = np.array(X, dtype='float32')

transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(X)

print ( "Data types for X = " + str(X.dtype) + " and X_new = "+ str(X_new.dtype))


Data types for X = float32 and X_new = float64
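
If float32 is needed downstream (for example, to save memory), the float64 result can be cast back explicitly; a trivial sketch:

In [ ]:
# cast the transformed array back to float32 if that precision is wanted
X_new = X_new.astype('float32')
print(X_new.dtype)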

In [28]:
from sklearn import datasets
from sklearn.svm import SVC
iris = datasets.load_iris()
clf = SVC()
clf.fit(iris.data, iris.target)


Out[28]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [29]:
list(clf.predict(iris.data[:3]))


Out[29]:
[0, 0, 0]

In [30]:
clf.fit(iris.data, iris.target_names[iris.target])


Out[30]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [31]:
list(clf.predict(iris.data[:3]))


Out[31]:
['setosa', 'setosa', 'setosa']
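
Classification targets keep their type, so the predictions come back as the same strings that were passed in. A quick check that these match the true labels for the first three samples:

In [ ]:
# the true string labels for the first three samples, for comparison
list(iris.target_names[iris.target[:3]])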

In [ ]:
# refitting and updating parameters

In [35]:
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X = rng.rand(100, 10)
y = rng.binomial(1, 0.5, 100)
X_test = rng.rand(5, 10)

clf = SVC()
clf.set_params(kernel='linear').fit(X, y)
print("Predict x_test: " + str(clf.predict(X_test)))
clf.set_params(kernel='rbf').fit(X, y)
print("Predict x_test after refitting: " + str(clf.predict(X_test)))


Predict x_test: [1 0 1 1 0]
Predict x_test after refitting: [0 0 0 1 0]
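
set_params updates the estimator in place and overrides what was set in the constructor. A quick check with get_params (not part of the tutorial):

In [ ]:
# inspect the current hyperparameters after the second set_params call
clf.get_params()['kernel']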

In [ ]: