In [1]:
%matplotlib inline
In [2]:
# Create some fake data, then fit a few dummy ("fake") estimators to it.
In [3]:
from sklearn.datasets import make_regression, make_classification
In [4]:
X, y = make_regression()
In [5]:
from sklearn import dummy
In [6]:
dumdum = dummy.DummyRegressor()
In [7]:
dumdum.fit(X, y)
Out[7]:
In [8]:
dumdum.predict(X)[:5]
Out[8]:
In [9]:
# By default, DummyRegressor predicts the mean of y. We could also
# predict a supplied constant, or the median value.
In [10]:
# A supplied constant is only used when strategy is "constant".
In [11]:
predictors = [('mean', None),
              ('median', None),
              ('constant', 10)]
In [14]:
for strategy, constant in predictors:
    dumdum = dummy.DummyRegressor(strategy=strategy,
                                  constant=constant)
    dumdum.fit(X, y)
    print 'strategy: {}'.format(strategy), \
        ', '.join(map(str, dumdum.predict(X)[:5]))
In [15]:
# There are actually four strategy options for classifiers. These
# strategies are similar to the continuous case.
In [16]:
predictors = [('constant', 0),
              ('stratified', None),
              ('uniform', None),
              ('most_frequent', None)]
In [17]:
X, y = make_classification()
In [20]:
for strategy, constant in predictors:
    dumdum = dummy.DummyClassifier(strategy=strategy,
                                   constant=constant)
    dumdum.fit(X, y)
    print 'strategy: {} {}'.format(strategy,
                                   ','.join(map(str,
                                                dumdum.predict(X)[:5])))
In [21]:
# It is always good to test models against the simplest possible model,
# and that's exactly what the dummy estimators give us. For example, in
# a fraud data set where only 5% of the cases are fraudulent, we can
# build a fairly accurate-looking model by predicting not-fraud every time.
In [22]:
X, y = make_classification(20000, weights=[.95, .05])
In [23]:
dumdum = dummy.DummyClassifier(strategy='most_frequent')
dumdum.fit(X, y)
Out[23]:
In [24]:
from sklearn.metrics import accuracy_score
print accuracy_score(y, dumdum.predict(X))
In [25]:
# It is actually correct very often, but that's not the point. The
# point is that this is our baseline: if we cannot create a fraud model
# that is more accurate than this, then it isn't worth the time. A quick
# comparison against a real classifier is sketched below.
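In [ ]:
# (Illustrative sketch, not part of the original recipe.) To make the
# baseline comparison concrete, fit a simple real classifier on the same
# imbalanced data and compare its accuracy to the dummy's;
# LogisticRegression is just one plausible choice of model here.
In [ ]:
from sklearn.linear_model import LogisticRegression

# Fit an actual model on the same imbalanced data.
lr = LogisticRegression()
lr.fit(X, y)

# Compare against the most_frequent dummy baseline fit above.
print 'logistic regression accuracy:', accuracy_score(y, lr.predict(X))
print 'dummy baseline accuracy:     ', accuracy_score(y, dumdum.predict(X))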
In [ ]: