In [7]:
import sklearn
from sklearn import ensemble, datasets, cross_validation, metrics

In [8]:
boston = datasets.load_boston()
boston.data.shape


Out[8]:
(506, 13)

In [9]:
friedman1 = datasets.make_friedman1(n_samples=10000, n_features=20, noise=1.)
friedman1[0].shape


Out[9]:
(10000, 20)

In [10]:
X = friedman1[0]
y = friedman1[1]
X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(X, y, train_size=0.7)

In [11]:
X_train.shape


Out[11]:
(7000, 20)

In [12]:
def regression_report(y_true, y_pred):
    print '\t explained variance:', metrics.regression.explained_variance_score(y_true, y_pred)

In [7]:
model_gbc = ensemble.GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=8, 
             max_features=None, 
             #max_leaf_nodes=15,
             min_samples_leaf=20, min_samples_split=40,
             min_weight_fraction_leaf=0.0, n_estimators=70,
             random_state=None, subsample=0.8, tree_params_producer=None,
             verbose=1, warm_start=False)

model_gbc.fit(X_train, y_train)


print ' '
print 'Train:'
regression_report(y_train, model_gbc.predict(X_train))
print ' '
print 'Test:'
regression_report(y_valid, model_gbc.predict(X_valid))


      Iter       Train Loss      OOB Improve   Remaining Time 
         1          20.9019           3.7085            6.56s
         2          17.7793           3.0312            6.10s
         3          15.0250           2.4743            5.98s
         4          12.7851           2.0418            5.79s
         5          10.7326           1.8724            5.64s
         6           9.2756           1.4604            5.51s
         7           8.0677           1.1391            5.41s
         8           6.8960           1.0195            5.29s
         9           6.0294           0.8592            5.18s
        10           5.3834           0.6615            5.06s
        20           1.8812           0.1390            4.02s
        30           0.9802           0.0279            3.14s
        40           0.7031           0.0037            2.32s
        50           0.5863           0.0015            1.52s
        60           0.5097          -0.0003            0.76s
        70           0.4748          -0.0010            0.00s
 
Train:
	 explained variance: 0.980751373945
 
Test:
	 explained variance: 0.940881259745
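
As a side check, the per-stage progress shown in the verbose log can also be measured on the hold-out set with staged_predict, which yields the ensemble's prediction after each boosting stage. A minimal sketch, reusing the variables above:

# explained variance on the validation set after every 10th boosting stage
for stage, y_pred in enumerate(model_gbc.staged_predict(X_valid), start=1):
    if stage % 10 == 0:
        print stage, metrics.regression.explained_variance_score(y_valid, y_pred)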

In [13]:
def linear_variable(vfrom, vto, N):
    return lambda i: int(i * 1. / N * (vfrom - vto) + vfrom)


def tree_params_producer_variable_depth(depth_from, depth_to, n_estimators):
    variable_depth_foo = linear_variable(depth_from, depth_to, n_estimators)
    return lambda stage: {
            'max_depth': variable_depth_foo(stage),
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'min_weight_fraction_leaf': 0.0,                
            'max_features': None,
            'max_leaf_nodes': None}


model_gbc = ensemble.GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             #max_depth=8, 
             #max_features=None, 
             #max_leaf_nodes=15,
             #min_samples_leaf=20, min_samples_split=40,
             #min_weight_fraction_leaf=0.0, 
             n_estimators=70,
             random_state=None, 
             subsample=0.8, 
             tree_params_producer=tree_params_producer_variable_depth(1, 8, 70),
             verbose=1, warm_start=False)

model_gbc.fit(X_train, y_train)


print ' '
print 'Train:'
regression_report(y_train, model_gbc.predict(X_train))
print ' '
print 'Test:'
regression_report(y_valid, model_gbc.predict(X_valid))


      Iter       Train Loss      OOB Improve   Remaining Time 
         1          20.6301           3.9689            8.02s
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-b2c5db4be71d> in <module>()
     26              verbose=1, warm_start=False)
     27 
---> 28 model_gbc.fit(X_train, y_train)
     29 
     30 

/home/obus/.local/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in fit(self, X, y, sample_weight, monitor)
    989         # fit the boosting stages
    990         n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
--> 991                                     begin_at_stage, monitor)
    992         # change shape of arrays after fit (early-stopping or additional ests)
    993         if n_stages != self.estimators_.shape[0]:

/home/obus/.local/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in _fit_stages(self, X, y, y_pred, sample_weight, random_state, begin_at_stage, monitor)
   1049             y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
   1050                                      sample_mask, criterion, splitter,
-> 1051                                      random_state)
   1052 
   1053             # track deviance (= loss)

/home/obus/.local/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, criterion, splitter, random_state)
    775 
    776             tree.fit(X, residual, sample_weight=sample_weight,
--> 777                      check_input=False)
    778 
    779             # update tree leaves

/home/obus/.local/lib/python2.7/site-packages/sklearn/tree/tree.pyc in fit(self, X, y, sample_weight, check_input)
    229             raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
    230         if max_depth <= 0:
--> 231             raise ValueError("max_depth must be greater than zero. ")
    232         if not (0 < max_features <= self.n_features_):
    233             raise ValueError("max_features must be in (0, n_features]")

ValueError: max_depth must be greater than zero. 
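
The traceback is caused by the sign in linear_variable: with (vfrom - vto) the interpolation runs downwards, so for depth_from=1, depth_to=8, n_estimators=70 the produced depth is int(1 - 0.1 * i), which already truncates to 0 at the second stage and trips the max_depth check above. A corrected helper that grows from vfrom towards vto could look like this (a sketch, not the patched library code):

def linear_variable(vfrom, vto, N):
    # interpolate from vfrom at stage 0 towards vto at stage N, truncated to int
    return lambda i: int(vfrom + i * 1. / N * (vto - vfrom))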

In [9]:
help(metrics.regression)


Help on module sklearn.metrics.regression in sklearn.metrics:

NAME
    sklearn.metrics.regression - Metrics to assess performance on regression task

FILE
    /home/obus/.local/lib/python2.7/site-packages/sklearn/metrics/regression.py

DESCRIPTION
    Functions named as ``*_score`` return a scalar value to maximize: the higher
    the better
    
    Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
    the lower the better

FUNCTIONS
    explained_variance_score(y_true, y_pred, sample_weight=None, multioutput='uniform_average')
        Explained variance regression score function
        
        Best possible score is 1.0, lower values are worse.
        
        Read more in the :ref:`User Guide <explained_variance_score>`.
        
        Parameters
        ----------
        y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
            Ground truth (correct) target values.
        
        y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
            Estimated target values.
        
        sample_weight : array-like of shape = (n_samples), optional
            Sample weights.
        
        multioutput : string in ['raw_values', 'uniform_average', 'variance_weighted'] or array-like of shape (n_outputs)
            Defines aggregating of multiple output scores.
            Array-like value defines weights used to average scores.
        
            'raw_values' :
                Returns a full set of scores in case of multioutput input.
        
            'uniform_average' :
                Scores of all outputs are averaged with uniform weight.
        
            'variance_weighted' :
                Scores of all outputs are averaged, weighted by the variances
                of each individual output.
        
        Returns
        -------
        score : float or ndarray of floats
            The explained variance or ndarray if 'multioutput' is 'raw_values'.
        
        Notes
        -----
        This is not a symmetric function.
        
        Examples
        --------
        >>> from sklearn.metrics import explained_variance_score
        >>> y_true = [3, -0.5, 2, 7]
        >>> y_pred = [2.5, 0.0, 2, 8]
        >>> explained_variance_score(y_true, y_pred)  # doctest: +ELLIPSIS
        0.957...
        >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
        >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
        >>> explained_variance_score(y_true, y_pred, multioutput='uniform_average')
        ... # doctest: +ELLIPSIS
        0.983...
    
    mean_absolute_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average')
        Mean absolute error regression loss
        
        Read more in the :ref:`User Guide <mean_absolute_error>`.
        
        Parameters
        ----------
        y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
            Ground truth (correct) target values.
        
        y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
            Estimated target values.
        
        sample_weight : array-like of shape = (n_samples), optional
            Sample weights.
        
        multioutput : string in ['raw_values', 'uniform_average']
            or array-like of shape (n_outputs)
            Defines aggregating of multiple output values.
            Array-like value defines weights used to average errors.
        
            'raw_values' :
                Returns a full set of errors in case of multioutput input.
        
            'uniform_average' :
                Errors of all outputs are averaged with uniform weight.
        
        
        Returns
        -------
        loss : float or ndarray of floats
            If multioutput is 'raw_values', then mean absolute error is returned
            for each output separately.
            If multioutput is 'uniform_average' or an ndarray of weights, then the
            weighted average of all output errors is returned.
        
            MAE output is non-negative floating point. The best value is 0.0.
        
        Examples
        --------
        >>> from sklearn.metrics import mean_absolute_error
        >>> y_true = [3, -0.5, 2, 7]
        >>> y_pred = [2.5, 0.0, 2, 8]
        >>> mean_absolute_error(y_true, y_pred)
        0.5
        >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
        >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
        >>> mean_absolute_error(y_true, y_pred)
        0.75
        >>> mean_absolute_error(y_true, y_pred, multioutput='raw_values')
        array([ 0.5,  1. ])
        >>> mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])
        ... # doctest: +ELLIPSIS
        0.849...
    
    mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average')
        Mean squared error regression loss
        
        Read more in the :ref:`User Guide <mean_squared_error>`.
        
        Parameters
        ----------
        y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
            Ground truth (correct) target values.
        
        y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
            Estimated target values.
        
        sample_weight : array-like of shape = (n_samples), optional
            Sample weights.
        
        multioutput : string in ['raw_values', 'uniform_average']
            or array-like of shape (n_outputs)
            Defines aggregating of multiple output values.
            Array-like value defines weights used to average errors.
        
            'raw_values' :
                Returns a full set of errors in case of multioutput input.
        
            'uniform_average' :
                Errors of all outputs are averaged with uniform weight.
        
        Returns
        -------
        loss : float or ndarray of floats
            A non-negative floating point value (the best value is 0.0), or an
            array of floating point values, one for each individual target.
        
        Examples
        --------
        >>> from sklearn.metrics import mean_squared_error
        >>> y_true = [3, -0.5, 2, 7]
        >>> y_pred = [2.5, 0.0, 2, 8]
        >>> mean_squared_error(y_true, y_pred)
        0.375
        >>> y_true = [[0.5, 1],[-1, 1],[7, -6]]
        >>> y_pred = [[0, 2],[-1, 2],[8, -5]]
        >>> mean_squared_error(y_true, y_pred)  # doctest: +ELLIPSIS
        0.708...
        >>> mean_squared_error(y_true, y_pred, multioutput='raw_values')
        ... # doctest: +ELLIPSIS
        array([ 0.416...,  1.        ])
        >>> mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])
        ... # doctest: +ELLIPSIS
        0.824...
    
    median_absolute_error(y_true, y_pred)
        Median absolute error regression loss
        
        Read more in the :ref:`User Guide <median_absolute_error>`.
        
        Parameters
        ----------
        y_true : array-like of shape = (n_samples)
            Ground truth (correct) target values.
        
        y_pred : array-like of shape = (n_samples)
            Estimated target values.
        
        Returns
        -------
        loss : float
            A positive floating point value (the best value is 0.0).
        
        Examples
        --------
        >>> from sklearn.metrics import median_absolute_error
        >>> y_true = [3, -0.5, 2, 7]
        >>> y_pred = [2.5, 0.0, 2, 8]
        >>> median_absolute_error(y_true, y_pred)
        0.5
    
    r2_score(y_true, y_pred, sample_weight=None, multioutput=None)
        R^2 (coefficient of determination) regression score function.
        
        Best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always
        predicts the expected value of y, disregarding the input features,
        would get a R^2 score of 0.0.
        
        Read more in the :ref:`User Guide <r2_score>`.
        
        Parameters
        ----------
        y_true : array-like of shape = (n_samples) or (n_samples, n_outputs)
            Ground truth (correct) target values.
        
        y_pred : array-like of shape = (n_samples) or (n_samples, n_outputs)
            Estimated target values.
        
        sample_weight : array-like of shape = (n_samples), optional
            Sample weights.
        
        multioutput : string in ['raw_values', 'uniform_average',
                    'variance_weighted'] or None or array-like of shape (n_outputs)
            Defines aggregating of multiple output scores.
            Array-like value defines weights used to average scores.
            Default value corresponds to 'variance_weighted', but
            will be changed to 'uniform_average' in next versions.
        
            'raw_values' :
                Returns a full set of scores in case of multioutput input.
        
            'uniform_average' :
                Scores of all outputs are averaged with uniform weight.
        
            'variance_weighted' :
                Scores of all outputs are averaged, weighted by the variances
                of each individual output.
        
        Returns
        -------
        z : float or ndarray of floats
            The R^2 score or ndarray of scores if 'multioutput' is
            'raw_values'.
        
        Notes
        -----
        This is not a symmetric function.
        
        Unlike most other scores, R^2 score may be negative (it need not actually
        be the square of a quantity R).
        
        References
        ----------
        .. [1] `Wikipedia entry on the Coefficient of determination
                <http://en.wikipedia.org/wiki/Coefficient_of_determination>`_
        
        Examples
        --------
        >>> from sklearn.metrics import r2_score
        >>> y_true = [3, -0.5, 2, 7]
        >>> y_pred = [2.5, 0.0, 2, 8]
        >>> r2_score(y_true, y_pred)  # doctest: +ELLIPSIS
        0.948...
        >>> y_true = [[0.5, 1], [-1, 1], [7, -6]]
        >>> y_pred = [[0, 2], [-1, 2], [8, -5]]
        >>> r2_score(y_true, y_pred, multioutput='variance_weighted')  # doctest: +ELLIPSIS
        0.938...

DATA
    __ALL__ = ['mean_absolute_error', 'mean_squared_error', 'median_absolu...
    division = _Feature((2, 2, 0, 'alpha', 2), (3, 0, 0, 'alpha', 0), 8192...



In [ ]:
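
Based on the functions listed in the help output, regression_report could be extended to cover the other regression metrics as well. A possible sketch in the same style as the report used above:

def regression_report(y_true, y_pred):
    # print the main regression metrics documented in sklearn.metrics.regression
    print '\t explained variance:', metrics.regression.explained_variance_score(y_true, y_pred)
    print '\t mean absolute error:', metrics.regression.mean_absolute_error(y_true, y_pred)
    print '\t mean squared error:', metrics.regression.mean_squared_error(y_true, y_pred)
    print '\t median absolute error:', metrics.regression.median_absolute_error(y_true, y_pred)
    print '\t r2 score:', metrics.regression.r2_score(y_true, y_pred)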