In [1]:
%matplotlib inline
In [2]:
# Create some fake data, then fit a few dummy ("fake") estimators to it.
In [3]:
from sklearn.datasets import make_regression, make_classification
In [4]:
X, y = make_regression()
In [5]:
from sklearn import dummy
In [6]:
dumdum = dummy.DummyRegressor()
In [7]:
dumdum.fit(X, y)
Out[7]:
In [8]:
dumdum.predict(X)[:5]
Out[8]:
In [9]:
# By default, DummyRegressor predicts the mean of y. We could also
# predict a supplied constant, or the median value.
In [10]:
# A supplied constant is only used when strategy is "constant".
In [11]:
predictors = [('mean', None),
              ('median', None),
              ('constant', 10)]
In [14]:
for strategy, constant in predictors:
    dumdum = dummy.DummyRegressor(strategy=strategy,
                                  constant=constant)
    dumdum.fit(X, y)
    print 'strategy: {}'.format(strategy), \
        ', '.join(map(str, dumdum.predict(X)[:5]))
In [15]:
# There are actually four strategy options for classifiers. These
# strategies are similar to the continuous case.
In [16]:
predictors = [('constant', 0),
              ('stratified', None),
              ('uniform', None),
              ('most_frequent', None)]
In [17]:
X, y = make_classification()
In [20]:
for strategy, constant in predictors:
    dumdum = dummy.DummyClassifier(strategy=strategy,
                                   constant=constant)
    dumdum.fit(X, y)
    print 'strategy: {} {}'.format(strategy,
                                   ','.join(map(str,
                                                dumdum.predict(X)[:5])))
In [21]:
# It is always good to test models against the simplest possible model,
# and that's exactly what the dummy estimators give us. For example, in
# a fraud data set where only 5% of the cases are fraudulent, we can
# build a fairly accurate-looking model by predicting not-fraud every time.
In [22]:
X, y = make_classification(20000, weights=[.95, .05])
In [23]:
dumdum = dummy.DummyClassifier(strategy='most_frequent')
dumdum.fit(X, y)
Out[23]:
In [24]:
from sklearn.metrics import accuracy_score
print accuracy_score(y, dumdum.predict(X))
In [25]:
# It is actually correct very often, but that's not the point. The
# point is that this is our baseline: if we cannot create a fraud model
# that is more accurate than this, then it isn't worth the time. A quick
# comparison against a real classifier is sketched below.
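In [ ]:
# (Illustrative sketch, not part of the original recipe.) To make the
# baseline comparison concrete, fit a simple real classifier on the same
# imbalanced data and compare its accuracy to the dummy's;
# LogisticRegression is just one plausible choice of model here.
In [ ]:
from sklearn.linear_model import LogisticRegression

# Fit an actual model on the same imbalanced data.
lr = LogisticRegression()
lr.fit(X, y)

# Compare against the most_frequent dummy baseline fit above.
print 'logistic regression accuracy:', accuracy_score(y, lr.predict(X))
print 'dummy baseline accuracy:     ', accuracy_score(y, dumdum.predict(X))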
In [ ]: