In [1]:
from collections import defaultdict, Counter
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import numpy as np
from copy import deepcopy
class CrowdMachineClassifier(BaseEstimator):
    """Crowd Machine for ensembling the predictions of supervised machine learning methods.

    Every estimator in the crowd is fit on the training data. Each distinct
    combination of crowd predictions seen on the training data is then mapped
    to the true class label that occurred most often alongside it. At
    prediction time that mapping is used as a lookup table; combinations that
    were never seen during training fall back to ``default_label``.
    """

    def __init__(self, estimators, default_label):
        """Sets up the Crowd Machine

        Parameters
        ----------
        estimators: list
            A list of machine learning method instances to be included in the Crowd Machine.
            These instances must be scikit-learn compatible estimators.
        default_label: integer
            The default value that the Crowd Machine will assign to a data instance type that
            it has not encountered during the training phase.

        Returns
        -------
        None

        """
        # Constructor arguments are stored untouched: scikit-learn's clone()
        # (used by cross_val_score, GridSearchCV, ...) rejects estimators whose
        # __init__ copies or otherwise modifies its parameters. The caller's
        # instances are still never fitted, because VotingClassifier fits
        # clones of the estimators it is given.
        self.estimators = estimators
        self.default_label = default_label

        # Fitted state; stays None until fit() has been called
        self.crowd_ensemble = None
        self.crowd_matrix = None
        self.prediction_matrix = None

    def fit(self, features, class_labels):
        """Constructs the Crowd Machine prediction matrix from the training data

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        class_labels: array-like {n_samples}
            List of true class labels

        Returns
        -------
        self: CrowdMachineClassifier
            The fitted instance itself (not a copy), to allow for function chaining

        """
        # VotingClassifier validates estimator names as strings (it checks for
        # '__' inside each name), so integer names are not usable here.
        named_estimators = [
            ('estimator_{}'.format(index), estimator)
            for index, estimator in enumerate(self.estimators)
        ]
        self.crowd_ensemble = VotingClassifier(estimators=named_estimators, voting='hard')

        # With hard voting, each row of the transform output holds the
        # predictions of every machine in the crowd for one sample
        crowd_predictions = self.crowd_ensemble.fit_transform(features, class_labels)

        # Group the true class labels by the predicted labels from the machines in the crowd
        self.crowd_matrix = defaultdict(list)
        for crowd_prediction, class_label in zip(crowd_predictions, class_labels):
            self.crowd_matrix[tuple(crowd_prediction)].append(class_label)

        # The final prediction for each cell in the Crowd Machine prediction matrix is the most
        # frequent true label in that cell
        self.prediction_matrix = {
            prediction_tuple: Counter(observed_labels).most_common(1)[0][0]
            for prediction_tuple, observed_labels in self.crowd_matrix.items()
        }

        return self

    def predict(self, features):
        """Uses the Crowd Machine prediction matrix to create predictions for the provided features

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix to create predictions for

        Returns
        -------
        array-like: {n_samples}
            Class predictions from the provided feature matrix

        Raises
        ------
        ValueError
            If the Crowd Machine has not been fit yet

        """
        if self.prediction_matrix is None:
            raise ValueError('The Crowd Machine must be fit before predict can be called')

        crowd_predictions = self.crowd_ensemble.transform(features)

        # Prediction combinations never seen during training get the default label
        return np.array([
            self.prediction_matrix.get(tuple(crowd_prediction), self.default_label)
            for crowd_prediction in crowd_predictions
        ])

    def fit_predict(self, features, class_labels):
        """Convenience function that fits the provided features then returns the predictions

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        class_labels: array-like {n_samples}
            List of true class labels

        Returns
        -------
        array-like: {n_samples}
            Class predictions from the provided feature matrix

        """
        self.fit(features, class_labels)
        return self.predict(features)

    def score(self, features, class_labels, scoring_function=None, **scoring_function_kwargs):
        """Scores the Crowd Machine's predictions against the true class labels

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix to predict from
        class_labels: array-like {n_samples}
            List of true class labels
        scoring_function: callable (default: None)
            Called as ``scoring_function(class_labels, predictions, **scoring_function_kwargs)``.
            When None, ``sklearn.metrics.accuracy_score`` is used.
        **scoring_function_kwargs:
            Extra keyword arguments forwarded to ``scoring_function``. They are
            ignored when ``scoring_function`` is None.

        Returns
        -------
        score: float
            The accuracy of the Crowd Machine, or whatever ``scoring_function``
            returns when one is provided

        Raises
        ------
        ValueError
            If the Crowd Machine has not been fit yet

        """
        if self.prediction_matrix is None:
            raise ValueError('The Crowd Machine must be fit before score can be called')

        predictions = self.predict(features)

        if scoring_function is None:
            return accuracy_score(class_labels, predictions)
        return scoring_function(class_labels, predictions, **scoring_function_kwargs)
In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Assemble a crowd of three random forests that differ only in their tree count
forest_sizes = (10, 50, 100)
crowd_members = [
    RandomForestClassifier(n_estimators=n_trees, n_jobs=-1, random_state=2)
    for n_trees in forest_sizes
]
clf = CrowdMachineClassifier(estimators=crowd_members, default_label=0)

# Synthetic classification data set, split into training and testing partitions
X, y = make_classification(n_samples=1000, n_features=100, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Fit the crowd on the training partition, then score it on the held-out partition
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[2]:
In [3]:
# Baseline for comparison: a single 10-tree random forest fit and scored on
# the X_train / X_test partitions
clf = RandomForestClassifier(
    n_estimators=10,
    n_jobs=-1,
    random_state=2,
).fit(X_train, y_train)
clf.score(X_test, y_test)
Out[3]:
In [4]:
# Baseline for comparison: a single 50-tree random forest fit and scored on
# the X_train / X_test partitions
clf = RandomForestClassifier(
    n_estimators=50,
    n_jobs=-1,
    random_state=2,
).fit(X_train, y_train)
clf.score(X_test, y_test)
Out[4]:
In [5]:
# Baseline for comparison: a single 100-tree random forest fit and scored on
# the X_train / X_test partitions
clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=2,
).fit(X_train, y_train)
clf.score(X_test, y_test)
Out[5]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: