From the scikit-learn documentation concerning k-fold cross-validation:
To avoid it ["overfitting"], it is common practice when performing a (supervised) machine learning experiment to hold out part of the available data as a test set X_test, y_test.
In the basic approach, called k-fold CV, the training set is split into k smaller sets... The following procedure is followed for each of the k “folds”:
- A model is trained using k-1 of the folds as training data;
- the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).
The following code uses this technique to evaluate the relative performance of various ML classification algorithms on the training data.
In this comparison, the RandomForest classifier consistently ranks among the best-performing choices.
In [10]:
# Initialize
import pandas as pd
import numpy as np

# Show versions of all installed packages to help debug incompatibilities.
# NOTE: pip.get_installed_distributions() was removed in pip 10; the
# supported replacement is the stdlib importlib.metadata (Python 3.8+).
from importlib.metadata import distributions

for dist in sorted(distributions(), key=lambda d: d.metadata["Name"].lower()):
    print(dist.metadata["Name"], dist.version)
In [11]:
# Load the labeled vendor-match training data.
# Bad rows are skipped with a warning rather than aborting the load.
try:
    df_label_vendors = pd.read_csv(
        "/home/jovyan/work/shared/data/csv/label_vendors.csv",
        # `error_bad_lines=False, warn_bad_lines=True` was removed in
        # pandas 2.0; `on_bad_lines='warn'` is the equivalent behavior.
        on_bad_lines='warn',
        quotechar='"',
        encoding='utf-8')
except IOError as e:
    print('\n\n***I/O error({0}): {1}\n\n'.format(
        e.errno, e.strerror))
except Exception:
    # `sys` was referenced here without being imported in the original;
    # import locally so the handler itself cannot raise NameError.
    import sys
    print(
        '\n\n***Unexpected error: {0}\n\n'.format(
            sys.exc_info()[0]))
    raise
# Number of records / columns
df_label_vendors.shape
Out[11]:
In [12]:
# Assemble the training matrix: column 0 is the target ("match"),
# the remaining columns are the fuzzy-matching features.
target_col = ['match']
feature_cols = ['fz_ptl_ratio', 'fz_ptl_tok_sort_ratio', 'fz_ratio',
                'fz_tok_set_ratio', 'fz_uwratio', 'ven_len', 'pu0_len']
df_match_train1 = df_label_vendors[target_col + feature_cols]

# Convert to numpy for the scikit-learn classifiers:
# y <- first column (target), X <- everything after it (features).
np_match_train1 = np.asarray(df_match_train1)
y = np_match_train1[:, 0]
X = np_match_train1[:, 1:]
print(X.shape, y.shape)
In [13]:
# Set up k-fold cross-validation to choose the best model.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Candidate classifiers, evaluated with 5-fold CV on (X, y).
# API notes vs. older scikit-learn:
# - `n_iter` was renamed `max_iter` and the old name removed in 0.21.
# - `class_weight="auto"` was removed; `"balanced"` is the replacement.
for clf, clf_name in (
        (RidgeClassifier(alpha=1.0), "Ridge Classifier"),
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier #2"),
        (Perceptron(max_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (NearestCentroid(), "Nearest Centroid"),
        (RandomForestClassifier(n_estimators=100, class_weight="balanced"),
         "Random forest"),
        (SGDClassifier(alpha=.0001, max_iter=50, penalty="l2"), "SGD / SVM"),
        (MultinomialNB(alpha=.01), "Naive Bayes")):
    # Report mean accuracy with a +/- 2-sigma spread across the 5 folds.
    scores = cross_val_score(clf, X, y, cv=5)
    print("%s, Accuracy: %0.2f (+/- %0.2f)" % (clf_name, scores.mean(), scores.std() * 2))