A sample ML classification project


In [62]:
# preprocessing: numbers-->mathematical operations
# Define neighbourhood, classification

Assumes data vizualization and pre-possing is already achieved

Step 1: Load the Data


In [7]:
from sklearn import datasets

In [8]:
iris_data = datasets.load_iris()

In [9]:
# Define target value, feature value

In [10]:
iris_data.target


Out[10]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [11]:
iris_data.data[:5]


Out[11]:
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [12]:
iris_data.feature_names


Out[12]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [13]:
iris_data.target_names


Out[13]:
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

Step 2: Create train, test, validate


In [14]:
# Why we need train test?

In [15]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=0.3)

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [20]:
# Explain train-test ratio, random_seed

Step 3: Initialise the classification model


In [21]:
# Explain basic working of knn, rnn, weighted/non-weghted, Brute force, order of execution

In [22]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
knn = KNeighborsClassifier(n_neighbors=5)

In [24]:
knn.fit(X_train, y_train)


Out[24]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

Step 4-5: Evaluation & Prediction


In [25]:
# Perform model evaluation on validate/test set, explain confusion matrix, accuracy
# Explain prediction step
# Confusion matrix
#    Tp, TN, FN(type1 error), FP(type2), disease prediction
# accuracy=tp+tn/total
# error rate=1-accuracy
# recall=tp/tp+fn(when true, how many predicted yes)
# precision=tp/tp+fp(when true, how many were true)

In [26]:
from sklearn import metrics

In [27]:
y_val_pred = knn.predict(X_val)

In [28]:
print("Accuracy:",metrics.accuracy_score(y_val, y_val_pred))


Accuracy: 0.9523809523809523

In [29]:
y_test_pred = knn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_test_pred))


Accuracy: 0.9555555555555556

Step 6: HyperParameter Tuining


In [30]:
# Define learned parameter, hyperparameter, curse of dim, over/under fitting
# Explain k-fold-cross-validation, GridSearch

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
parameters = {'n_neighbors':[1,2,3,4], 'p':[1,3]}

In [33]:
# Explain n_neighbour, and P for knn

In [63]:
model = GridSearchCV(knn, param_grid=parameters)

In [64]:
model.fit(X_train,y_train)


c:\users\cs\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\model_selection\_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
c:\users\cs\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\model_selection\_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Out[64]:
GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 2, 3, 4], 'p': [1, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

Step 7: Finding the best model


In [65]:
model.best_estimator_


Out[65]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=3,
           weights='uniform')

In [66]:
model.best_score_


Out[66]:
0.9404761904761905

In [67]:
knn_best=model.best_estimator_

In [68]:
knn_best.score(X_val,y_val)


Out[68]:
0.9523809523809523

In [69]:
knn_best.score(X_test,y_test)


Out[69]:
0.9777777777777777

In [70]:
y_test_pred = knn_best.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_test_pred))


Accuracy: 0.9777777777777777

In [71]:
# Try a different classifier

In [72]:
from sklearn.tree import DecisionTreeClassifier
df=DecisionTreeClassifier()
df.fit(X_train,y_train)
y_test_pred = df.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_test_pred))


Accuracy: 0.9333333333333333

In [73]:
# Step 8: Save, load, serve model
# Mention flask API, pickel, model_persistance

In [74]:
import pickle
_save = pickle.dumps(knn_best)
_load = pickle.loads(_save)
_load.predict(X_test)

"""
similarly there is numpy fucntions to load/save arrays
from joblib import dump, load
dump(model, 'path/model_name.joblib')
_loaded_model = load('path/model_filename.joblib')
"""


Out[74]:
"\nsimilarly there is numpy fucntions to load/save arrays\nfrom joblib import dump, load\ndump(model, 'path/model_name.joblib')\n_loaded_model = load('path/model_filename.joblib')\n"

In [75]:
# Not the actual syntax, just an intution
"""
class_model:
    def _init__(self):
    _loaded_model = pickle.loads("path/model_name")
"""

"""
import flask and its components
import class_model
app.model=class_model() # load it beforehand
@path=[ap1/v1/predict]
def predict(user_input):
    result=app.model.predict(user_input)
    return(jsonify(result))

if __name__ == "__main__":
    app.run()    
"""


Out[75]:
'\nimport flask and its components\nimport class_model\napp.model=class_model() # load it beforehand\n@path=[ap1/v1/predict]\ndef predict(user_input):\n    result=app.model.predict(user_input)\n    return(jsonify(result))\n\nif __name__ == "__main__":\n    app.run()    \n'