In [3]:
import sklearn.datasets as datasets
import pandas as pd
iris=datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head(2)
Out[3]:
In [7]:
independent_vars = ['sepal length (cm)','sepal width (cm)', 'petal length (cm)']
dependent_var = 'petal width (cm)'
X = df[independent_vars]
y = df[dependent_var]
from sklearn import tree
model = tree.DecisionTreeRegressor()
model.fit(X,y)
Out[7]:
In [8]:
# get feature importances
importances = model.feature_importances_
pd.Series(importances, index=independent_vars)
Out[8]:
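As an optional aside (a small sketch reusing the names above): the impurity-based importances are normalized to sum to 1, and sorting them makes the ranking easier to read.
In [ ]:
# impurity-based importances sum to 1; sort descending to rank the predictors
pd.Series(importances, index=independent_vars).sort_values(ascending=False)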
Some evaluation metrics, like mean squared error, are naturally descending scores: the smallest value is best. Because scikit-learn's scoring API assumes that a higher score is always better, such scores are reported as negative even though the metric itself can by definition never be negative. To keep this clear, metrics that measure the distance between the model and the data, like metrics.mean_squared_error, are available as neg_mean_squared_error, which returns the negated value of the metric.
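As a minimal sketch of this sign convention (the values below are made up for illustration): metrics.mean_squared_error returns a positive error, and the corresponding neg_mean_squared_error scorer reports its negation, so that a higher score is still better.
In [ ]:
# a minimal sketch of the sign convention (made-up values)
from sklearn import metrics
y_true = [1.0, 2.0, 3.0]
y_pred = [1.0, 2.0, 5.0]
mse = metrics.mean_squared_error(y_true, y_pred)
print(mse)   # 1.333... (positive error, smaller is better)
# the 'neg_mean_squared_error' scorer reports -mse (higher is better)
print(-mse)  # -1.333...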
In [13]:
from sklearn import model_selection
results = model_selection.cross_val_score(tree.DecisionTreeRegressor(), X, y, cv=10, scoring='neg_mean_squared_error')
print("MSE: %.3f (%.3f)") % (results.mean(), results.std())
In [23]:
import matplotlib.pyplot as plt
from sklearn import model_selection
scores = []
depths = []
for depth in range(1, 25):
    # cross-validate two error metrics at each depth (mean over 10 folds)
    scores.append({
        'neg_mean_squared_error': model_selection.cross_val_score(
            tree.DecisionTreeRegressor(max_depth=depth), X, y,
            cv=10, scoring='neg_mean_squared_error').mean(),
        'neg_median_absolute_error': model_selection.cross_val_score(
            tree.DecisionTreeRegressor(max_depth=depth), X, y,
            cv=10, scoring='neg_median_absolute_error').mean(),
    })
    depths.append(depth)
_ = pd.DataFrame(data=scores, index=depths).plot()
In [24]:
# a max_depth around 5 looks like the best choice for this regression
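As a follow-up sketch (max_depth=5 is the value suggested by the plot above, not a tuned optimum), the model can be refit on the full data at the chosen depth:
In [ ]:
# refit on all data at the depth suggested by the cross-validation curve
final_model = tree.DecisionTreeRegressor(max_depth=5)
final_model.fit(X, y)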