This notebook follows on from the previous notebook, "Train vendor matching algorithm".
The training proceeds in a similar manner:
In [1]:
# Initialize
import sys
import pandas as pd
import numpy as np

try:
    df_label_software = pd.read_csv(
        "/home/jovyan/work/shared/data/csv/label_software.csv",
        error_bad_lines=False,
        warn_bad_lines=True,
        quotechar='"',
        encoding='utf-8')
except IOError as e:
    print('\n\n***I/O error({0}): {1}\n\n'.format(e.errno, e.strerror))
except Exception:
    print('\n\n***Unexpected error: {0}\n\n'.format(sys.exc_info()[0]))
    raise
# Number of records / columns
df_label_software.shape
Out[1]:
In [2]:
# Print out some sample values
df_label_software.sample(5)
Out[2]:
In [3]:
# Check that all rows are labelled
# (Should return "False")
df_label_software['match'].isnull().any()
Out[3]:
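Had the check returned True, the unlabelled rows would need to be inspected and removed before training. A minimal sketch of that clean-up (not a step in the original notebook, reusing the same df_label_software frame):

# Show any rows with a missing 'match' label ...
df_label_software[df_label_software['match'].isnull()]
# ... and drop them so only labelled examples reach the classifier
df_label_software = df_label_software.dropna(subset=['match'])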
In [4]:
# Format training data as "X" == features, "y" == target.
# The target value ('match') is the first column.
df_match_train1 = df_label_software[['match', 'fz_ratio', 'fz_ptl_ratio', 'fz_tok_set_ratio',
                                     'fz_ptl_tok_sort_ratio', 'fz_uwratio', 'fz_rel_ratio',
                                     'fz_rel_ptl_ratio', 'titlX_len', 'DsplyNm0_len']]
# Convert into 2 numpy arrays for the scikit-learn ML classification algorithms.
np_match_train1 = np.asarray(df_match_train1)
X, y = np_match_train1[:, 1:], np_match_train1[:, 0]
print(X.shape, y.shape)
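Note that the search below tunes and fits on all of the labelled data. If an unbiased accuracy estimate on held-out rows is also wanted, a split could be made first; a sketch (the names X_train, X_test, y_train, y_test are illustrative and not used elsewhere in this notebook):

from sklearn.model_selection import train_test_split

# Hold back 25% of the labelled rows for a final evaluation,
# stratified so both splits keep the same match/no-match balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)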
As before, the classification algorithm needs to be tuned for optimal performance with the data.
This is done using a randomized grid search. This code was modified from the scikit-learn sample code.
In [5]:
# Now find optimum parameters for model using Grid Search
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
# build a classifier
clf = RandomForestClassifier()
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# specify parameters and distributions to sample from
# (note: 'auto' is no longer a valid class_weight for RandomForestClassifier;
# 'balanced' is its replacement)
param_dist = {"n_estimators": sp_randint(20, 100),
              "max_depth": [3, None],
              "max_features": sp_randint(1, 7),
              "min_samples_split": sp_randint(2, 7),
              "min_samples_leaf": sp_randint(1, 7),
              "bootstrap": [True, False],
              "class_weight": ['balanced', None],
              "criterion": ["gini", "entropy"]}
# run randomized search
n_iter_search = 40
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)
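The fitted search object also exposes the winning configuration directly, which is less error-prone than reading it off the report:

# Best cross-validation score and the parameter set that produced it
print(random_search.best_score_)
print(random_search.best_params_)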
Based on the above, and ignoring default values, the optimum set of parameters is something like the following:
'bootstrap': True, 'min_samples_leaf': 3, 'n_estimators': 55, 'min_samples_split': 5, 'criterion': 'gini', 'max_features': 4, 'max_depth': 3, 'class_weight': None
The RandomForest classifier is now trained on the full labelled training data to produce the model.
In [9]:
clf = RandomForestClassifier(
    bootstrap=True,
    min_samples_leaf=3,
    n_estimators=55,
    min_samples_split=5,
    criterion='gini',
    max_features=4,
    max_depth=3,
    class_weight=None
)
# Train model on original training data
clf.fit(X, y)
# save model for future use
from sklearn.externals import joblib
joblib.dump(clf, '/home/jovyan/work/shared/data/models/software_classif_trained_Rdm_Forest.pkl.z')
Out[9]:
In [10]:
# Test loading
clf = joblib.load('/home/jovyan/work/shared/data/models/software_classif_trained_Rdm_Forest.pkl.z')
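As a quick sanity check on the save/load round trip (assuming X and y are still in memory from the cells above), the reloaded model can be scored against the training data:

# Mean accuracy of the reloaded model on the training set;
# this should match the score of the model before it was saved
print(clf.score(X, y))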