In [1]:
from collections import defaultdict, Counter
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
import numpy as np
from copy import deepcopy

class CrowdMachineClassifier(BaseEstimator):

    """Crowd Machine for ensembling the predictions of supervised machine learning methods

    Every estimator in the crowd is fit on the training data. The tuple formed by the
    crowd's individual predictions for a sample is used as a key into a lookup table
    (the "prediction matrix") whose value is the most frequent true class label that
    was observed for that tuple during training.
    """

    def __init__(self, estimators, default_label):
        """Sets up the Crowd Machine

        Parameters
        ----------
        estimators: list
            A list of machine learning method instances to be included in the Crowd Machine.
            These instances must be scikit-learn compatible estimators.
        default_label: integer
            The default value that the Crowd Machine will assign to a data instance type that
            it has not encountered during the training phase.

        Returns
        -------
        None

        """
        # Constructor parameters are stored unmodified: scikit-learn's clone() (and therefore
        # cross_val_score, GridSearchCV, ...) requires __init__ to keep the exact objects it was
        # given. No defensive copy is needed here because the VotingClassifier built in fit()
        # fits clones of the estimators rather than the instances themselves.
        self.estimators = estimators
        self.default_label = default_label

        # Fitted state; remains None until fit() has been called
        self.crowd_ensemble = None
        self.crowd_matrix = None
        self.prediction_matrix = None

    def fit(self, features, class_labels):
        """Constructs the Crowd Machine prediction matrix from the training data

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix

        class_labels: array-like {n_samples}
            List of true class labels

        Returns
        -------
        self: CrowdMachineClassifier
            The fitted instance itself (not a copy), to allow for function chaining

        """
        # VotingClassifier expects (name, estimator) tuples whose names are strings;
        # integer names fail its name validation on current scikit-learn releases
        named_estimators = [('estimator_{}'.format(index), estimator)
                            for index, estimator in enumerate(self.estimators)]
        self.crowd_ensemble = VotingClassifier(estimators=named_estimators, voting='hard')

        # With hard voting, the transform output holds one row per sample and one column
        # per estimator, i.e. each row is the crowd's joint prediction for that sample
        crowd_predictions = self.crowd_ensemble.fit_transform(features, class_labels)

        # Group the true class labels by the predicted labels from the machines in the crowd
        self.crowd_matrix = defaultdict(list)
        for crowd_prediction, class_label in zip(crowd_predictions, class_labels):
            self.crowd_matrix[tuple(crowd_prediction)].append(class_label)

        # The final prediction for each cell in the Crowd Machine prediction matrix is the most
        # frequent true label in that cell
        self.prediction_matrix = {
            prediction_tuple: Counter(observed_labels).most_common(1)[0][0]
            for prediction_tuple, observed_labels in self.crowd_matrix.items()
        }
        return self

    def predict(self, features):
        """Uses the Crowd Machine prediction matrix to create predictions for the provided features

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix to create predictions for

        Returns
        ----------
        array-like: {n_samples}
            Class predictions from the provided feature matrix

        Raises
        ------
        ValueError
            If the Crowd Machine has not been fit yet

        """
        if self.prediction_matrix is None:
            raise ValueError('The Crowd Machine must be fit before predict can be called')

        crowd_predictions = self.crowd_ensemble.transform(features)

        # Combinations of crowd predictions never seen during training fall back to default_label
        return np.array([self.prediction_matrix.get(tuple(crowd_prediction), self.default_label)
                         for crowd_prediction in crowd_predictions])

    def fit_predict(self, features, class_labels):
        """Convenience function that fits the provided features then returns the predictions

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix
        class_labels: array-like {n_samples}
            List of true class labels

        Returns
        ----------
        array-like: {n_samples}
            Class predictions from the provided feature matrix

        """
        self.fit(features, class_labels)
        return self.predict(features)

    def score(self, features, class_labels, scoring_function=None, **scoring_function_kwargs):
        """Scores the Crowd Machine's predictions for the provided features

        Parameters
        ----------
        features: array-like {n_samples, n_features}
            Feature matrix to predict from
        class_labels: array-like {n_samples}
            List of true class labels
        scoring_function: callable, optional (default: None)
            Called as scoring_function(class_labels, predictions, **scoring_function_kwargs).
            When None, accuracy_score is used.
        **scoring_function_kwargs:
            Extra keyword arguments forwarded to scoring_function

        Returns
        -------
        score: float
            The accuracy of the Crowd Machine's predictions, or whatever scoring_function
            returns when one is provided

        Raises
        ------
        ValueError
            If the Crowd Machine has not been fit yet

        """
        if self.prediction_matrix is None:
            raise ValueError('The Crowd Machine must be fit before score can be called')

        predictions = self.predict(features)

        if scoring_function is None:
            return accuracy_score(class_labels, predictions)
        return scoring_function(class_labels, predictions, **scoring_function_kwargs)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic classification data set (1000 samples, 100 features), split into train and test parts
X, y = make_classification(n_samples=1000, n_features=100, random_state=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# The crowd consists of three random forests that differ only in their number of trees;
# feature combinations never seen during training are labelled 0
clf = CrowdMachineClassifier(
    estimators=[RandomForestClassifier(n_estimators=n_trees, n_jobs=-1, random_state=2)
                for n_trees in (10, 50, 100)],
    default_label=0,
).fit(X_train, y_train)

# Score of the crowd on the held-out split
clf.score(X_test, y_test)


Out[2]:
0.89600000000000002

In [3]:
# Baseline: the 10-tree forest on its own, for comparison with the crowd's score
clf = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=2).fit(X_train, y_train)
clf.score(X_test, y_test)


Out[3]:
0.89600000000000002

In [4]:
# Baseline: the 50-tree forest on its own, for comparison with the crowd's score
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=2).fit(X_train, y_train)
clf.score(X_test, y_test)


Out[4]:
0.90400000000000003

In [5]:
# Baseline: the 100-tree forest on its own, for comparison with the crowd's score
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=2).fit(X_train, y_train)
clf.score(X_test, y_test)


Out[5]:
0.89600000000000002

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: