A sample ML classification project



In [62]:

    
# preprocessing: numbers-->mathematical operations
# Define neighbourhood, classification

Assumes data vizualization and pre-possing is already achieved

Step 1: Load the Data



In [7]:

    
from sklearn import datasets



In [8]:

    
iris_data = datasets.load_iris()



In [9]:

    
# Define target value, feature value



In [10]:

    
iris_data.target









    Out[10]:





array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])



In [11]:

    
iris_data.data[:5]









    Out[11]:





array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])



In [12]:

    
iris_data.feature_names









    Out[12]:





['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']



In [13]:

    
iris_data.target_names









    Out[13]:





array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

Step 2: Create train, test, validate



In [14]:

    
# Why we need train test?



In [15]:

    
from sklearn.model_selection import train_test_split



In [18]:

    
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=0.3)



In [19]:

    
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)



In [20]:

    
# Explain train-test ratio, random_seed

Step 3: Initialise the classification model



In [21]:

    
# Explain basic working of knn, rnn, weighted/non-weghted, Brute force, order of execution



In [22]:

    
from sklearn.neighbors import KNeighborsClassifier



In [23]:

    
knn = KNeighborsClassifier(n_neighbors=5)



In [24]:

    
knn.fit(X_train, y_train)









    Out[24]:





KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

Step 4-5: Evaluation & Prediction



In [25]:

    
# Perform model evaluation on validate/test set, explain confusion matrix, accuracy
# Explain prediction step
# Confusion matrix
#    Tp, TN, FN(type1 error), FP(type2), disease prediction
# accuracy=tp+tn/total
# error rate=1-accuracy
# recall=tp/tp+fn(when true, how many predicted yes)
# precision=tp/tp+fp(when true, how many were true)



In [26]:

    
from sklearn import metrics



In [27]:

    
y_val_pred = knn.predict(X_val)



In [28]:

    
print("Accuracy:",metrics.accuracy_score(y_val, y_val_pred))









    



Accuracy: 0.9523809523809523



In [29]:

    
y_test_pred = knn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_test_pred))









    



Accuracy: 0.9555555555555556

Step 6: HyperParameter Tuining



In [30]:

    
# Define learned parameter, hyperparameter, curse of dim, over/under fitting
# Explain k-fold-cross-validation, GridSearch



In [31]:

    
from sklearn.model_selection import GridSearchCV



In [32]:

    
parameters = {'n_neighbors':[1,2,3,4], 'p':[1,3]}



In [33]:

    
# Explain n_neighbour, and P for knn



In [63]:

    
model = GridSearchCV(knn, param_grid=parameters)



In [64]:

    
model.fit(X_train,y_train)









    



c:\users\cs\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\model_selection\_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
c:\users\cs\appdata\local\programs\python\python37-32\lib\site-packages\sklearn\model_selection\_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)






    Out[64]:





GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [1, 2, 3, 4], 'p': [1, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

Step 7: Finding the best model



In [65]:

    
model.best_estimator_









    Out[65]:





KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=3,
           weights='uniform')



In [66]:

    
model.best_score_









    Out[66]:





0.9404761904761905



In [67]:

    
knn_best=model.best_estimator_



In [68]:

    
knn_best.score(X_val,y_val)









    Out[68]:





0.9523809523809523



In [69]:

    
knn_best.score(X_test,y_test)









    Out[69]:





0.9777777777777777



In [70]:

    
y_test_pred = knn_best.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_test_pred))









    



Accuracy: 0.9777777777777777



In [71]:

    
# Try a different classifier



In [72]:

    
from sklearn.tree import DecisionTreeClassifier
df=DecisionTreeClassifier()
df.fit(X_train,y_train)
y_test_pred = df.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_test_pred))









    



Accuracy: 0.9333333333333333



In [73]:

    
# Step 8: Save, load, serve model
# Mention flask API, pickel, model_persistance



In [74]:

    
import pickle
_save = pickle.dumps(knn_best)
_load = pickle.loads(_save)
_load.predict(X_test)

"""
similarly there is numpy fucntions to load/save arrays
from joblib import dump, load
dump(model, 'path/model_name.joblib')
_loaded_model = load('path/model_filename.joblib')
"""









    Out[74]:





"\nsimilarly there is numpy fucntions to load/save arrays\nfrom joblib import dump, load\ndump(model, 'path/model_name.joblib')\n_loaded_model = load('path/model_filename.joblib')\n"



In [75]:

    
# Not the actual syntax, just an intution
"""
class_model:
    def _init__(self):
    _loaded_model = pickle.loads("path/model_name")
"""

"""
import flask and its components
import class_model
app.model=class_model() # load it beforehand
@path=[ap1/v1/predict]
def predict(user_input):
    result=app.model.predict(user_input)
    return(jsonify(result))

if __name__ == "__main__":
    app.run()    
"""









    Out[75]:





'\nimport flask and its components\nimport class_model\napp.model=class_model() # load it beforehand\n@path=[ap1/v1/predict]\ndef predict(user_input):\n    result=app.model.predict(user_input)\n    return(jsonify(result))\n\nif __name__ == "__main__":\n    app.run()    \n'