In [1]:
# Render matplotlib figures inline in the notebook output.
# (Optional in modern Jupyter, which inlines by default; harmless here.)
%matplotlib inline

In [2]:
# Create some fake estimators.

In [3]:
from sklearn.datasets import make_regression, make_classification

In [4]:
# Synthetic regression data. Seed the generator so the notebook is
# reproducible under Restart Kernel -> Run All (the original call was
# unseeded, so every run produced different data and outputs).
X, y = make_regression(random_state=0)

In [5]:
from sklearn import dummy

In [6]:
# Baseline regressor: with the default strategy it predicts the mean of
# the training targets for every sample, ignoring the features.
dumdum = dummy.DummyRegressor()

In [7]:
# "Fitting" here only needs summary statistics of y; the repr in Out[7]
# shows the default strategy is 'mean'.
dumdum.fit(X, y)


Out[7]:
DummyRegressor(constant=None, quantile=None, strategy='mean')

In [8]:
# Every prediction is the same value (the mean of y), as Out[8] shows --
# slice to the first 5 rather than dumping the whole array.
dumdum.predict(X)[:5]


Out[8]:
array([ 8.80084806,  8.80084806,  8.80084806,  8.80084806,  8.80084806])

In [9]:
# Or we could predict a supplied constant. Or we could predict the
# median value.

In [10]:
# Supplying a constant will only be considered if strategy is
# "constant"

In [11]:
# (strategy, constant) pairs to sweep; the constant value is only
# honoured by the 'constant' strategy.
predictors = [('mean', None), ('median', None), ('constant', 10)]

In [14]:
# Fit one DummyRegressor per (strategy, constant) pair and show that each
# produces a single repeated prediction.
for strategy, constant in predictors:
    dumdum = dummy.DummyRegressor(strategy=strategy,
                                  constant=constant)
    dumdum.fit(X, y)
    # Python 3 print function (the original used a Python 2 print
    # statement, which is a syntax error on Python 3). print()'s default
    # space separator reproduces the original output layout.
    print('strategy: {}'.format(strategy),
          ', '.join(map(str, dumdum.predict(X)[:5])))


strategy: mean 8.80084805828, 8.80084805828, 8.80084805828, 8.80084805828, 8.80084805828
strategy: median 30.1205353996, 30.1205353996, 30.1205353996, 30.1205353996, 30.1205353996
strategy: constant 10.0, 10.0, 10.0, 10.0, 10.0

In [15]:
# There are actually 4 options for classifiers. These strategies
# are similar to the continuous case

In [16]:
# The four DummyClassifier strategies; only 'constant' uses the paired
# constant value.
predictors = [('constant', 0), ('stratified', None),
              ('uniform', None), ('most_frequent', None)]

In [17]:
# Synthetic classification data, seeded so the outputs below are
# reproducible under Restart Kernel -> Run All (the original call was
# unseeded).
X, y = make_classification(random_state=0)

In [20]:
# Fit one DummyClassifier per (strategy, constant) pair and show the
# first few predictions for each strategy.
for strategy, constant in predictors:
    dumdum = dummy.DummyClassifier(strategy=strategy,
                                   constant=constant)
    dumdum.fit(X, y)
    # Python 3 print function (the original used a Python 2 print
    # statement, which is a syntax error on Python 3).
    print('strategy: {} {}'.format(strategy,
                                   ','.join(map(str,
                                                dumdum.predict(X)[:5]))))


strategy: constant 0,0,0,0,0
strategy: stratified 1,0,0,1,1
strategy: uniform 0,0,0,0,0
strategy: most_frequent 0,0,0,0,0

In [21]:
# It is always good to test models against the simplest baseline
# models, and that's exactly what the dummy estimators do. Example:
# in a fraud dataset where only 5% of the cases are fraud, we can
# build a seemingly good model by predicting not-fraud every time.

In [22]:
# Imbalanced dataset: ~95% negative ("not fraud") vs ~5% positive.
# Seeded so the accuracy printed below is reproducible (the original
# call was unseeded).
X, y = make_classification(20000, weights=[.95, .05], random_state=0)

In [23]:
# Baseline: always predict the most frequent class (here, not-fraud).
dumdum = dummy.DummyClassifier(strategy='most_frequent')
dumdum.fit(X, y)


Out[23]:
DummyClassifier(constant=None, random_state=None, strategy='most_frequent')

In [24]:
from sklearn.metrics import accuracy_score
# Baseline accuracy is high purely because of the class imbalance.
# Python 3 print function (the original used a Python 2 print statement).
print(accuracy_score(y, dumdum.predict(X)))


0.94425

In [25]:
# It is actually correct very often but that's not the point. The
# point is that this is our baseline. If we cannot create a model
# for fraud that is more accurate than this, then it isn't worth
# the time.

In [ ]: