In [1]:
import pydotplus
import pandas as pd
from IPython.display import Image
from sklearn import tree
from sklearn.model_selection import (GridSearchCV, train_test_split,
                                     StratifiedShuffleSplit, cross_val_score)
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np
import matplotlib.pyplot as plt
In [2]:
%matplotlib inline
In [3]:
german = pd.read_csv('german_credit.csv', header=0, sep=',')
In [4]:
german.head()
Out[4]: (first five rows of the table)
In [5]:
german_target = german.Creditability.values
german_data = german.iloc[:, 1:]
Proportion of class 1
In [6]:
sum(german_target) / float(len(german_target))
Out[6]:
In [7]:
german_data.columns
Out[7]:
Let's limit the maximum tree depth to 2.
In [8]:
model = tree.DecisionTreeClassifier(max_depth=2)
model.fit(german_data, german_target)
dot_data = tree.export_graphviz(model, out_file="tree3.out",
feature_names=german_data.columns,
class_names=['credit', 'no credit'],
filled=True, rounded=True,
special_characters=False)
graph = pydotplus.graphviz.graph_from_dot_file("tree3.out")
Image(graph.create_png())
Out[8]: (rendered depth-2 decision tree)
Note that the first features used for splits are Account Balance, Duration of Credits, and Length of current employment. With the resulting decision tree, credit is granted only when Account Balance <= 2.5 and the term is short (up to 22.5 months).
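To double-check which features drive the splits, we can inspect the fitted model's feature_importances_ (a quick sketch, assuming the depth-2 model from the cell above is still in scope):
In [ ]:
# Impurity-based importances; with max_depth=2 only features actually
# used in splits get non-zero weight
importances = pd.Series(model.feature_importances_, index=german_data.columns)
print(importances.sort_values(ascending=False).head())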
Now let's build a tree with no limit on its depth.
In [9]:
model = tree.DecisionTreeClassifier()
model.fit(german_data, german_target)
dot_data = tree.export_graphviz(model, out_file="big_tree.out",
feature_names=german_data.columns,
class_names=['0', '1'],
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graphviz.graph_from_dot_file("big_tree.out")
Image(graph.create_png())
Out[9]: (rendered unpruned decision tree)
In [10]:
train_data, test_data, train_target, test_target = train_test_split(
    german_data, german_target, test_size=0.2, random_state=0)  # fixed seed for reproducibility
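With imbalanced classes it is often safer to stratify the split so both parts keep the original class proportions; a possible variant (stratify is a standard train_test_split option, not used in the original run):
In [ ]:
# Keep the class ratio identical in train and test
train_data, test_data, train_target, test_target = train_test_split(
    german_data, german_target, test_size=0.2,
    stratify=german_target, random_state=0)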
In [11]:
model.fit(train_data, train_target)
test_predictions = model.predict(test_data)
print(roc_auc_score(test_target, test_predictions))
train_predictions = model.predict(train_data)
print(roc_auc_score(train_target, train_predictions))
Note that the score on the training data is inflated when the tree is evaluated on the same data it was trained on.
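Here roc_auc_score is computed on hard 0/1 predictions; scoring the predicted probability of class 1 gives the usual ROC AUC, and comparing train vs. test accuracy quantifies the same overfitting (a sketch reusing model and the split from above):
In [ ]:
# ROC AUC on class-1 probabilities rather than hard labels
print(roc_auc_score(test_target, model.predict_proba(test_data)[:, 1]))
print(roc_auc_score(train_target, model.predict_proba(train_data)[:, 1]))
# Accuracy on train vs. test shows the same gap
print(accuracy_score(train_target, model.predict(train_data)))
print(accuracy_score(test_target, model.predict(test_data)))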
In [12]:
print(classification_report(test_target, test_predictions))
In [13]:
np.mean(cross_val_score(model, german_data, german_target, cv=5))
Out[13]:
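StratifiedShuffleSplit was imported at the top but never used; it can be passed as the cv argument to get stratified random splits instead of plain k-fold (a sketch with illustrative parameters):
In [ ]:
# Five stratified random 80/20 splits as the CV strategy
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
np.mean(cross_val_score(model, german_data, german_target, cv=sss))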
Let's look at how the area under the ROC curve depends on the maximum tree depth.
In [14]:
def f_cv(i):
    model = tree.DecisionTreeClassifier(max_depth=i)
    return np.mean(cross_val_score(model, german_data, german_target, cv=5, scoring='roc_auc'))

def f_test(i):
    model = tree.DecisionTreeClassifier(max_depth=i)
    model.fit(train_data, train_target)
    test_predictions = model.predict(test_data)
    return roc_auc_score(test_target, test_predictions)

def f_train(i):
    # bug fix: score on the training data, not on stale test predictions
    model = tree.DecisionTreeClassifier(max_depth=i)
    model.fit(train_data, train_target)
    train_predictions = model.predict(train_data)
    return roc_auc_score(train_target, train_predictions)

depths = range(1, 31)
plt.figure(figsize=(10, 10))
plt.title('score(depth)')
plt.grid(True)
plt.ylim((0.0, 1.0))
plt.plot(list(depths), [f_cv(i) for i in depths], label='cross val score')
plt.plot(list(depths), [f_test(i) for i in depths], label='test')
plt.plot(list(depths), [f_train(i) for i in depths], label='train')
plt.xlabel('depth')
plt.ylabel('score')
plt.legend(loc='best')
plt.show()
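Instead of reading the best depth off the plot, a cross-validated grid search can select max_depth automatically; a minimal sketch using GridSearchCV from sklearn.model_selection (the grid is chosen for illustration):
In [ ]:
# Search tree depths 1..30, scoring each by 5-fold ROC AUC
gs = GridSearchCV(tree.DecisionTreeClassifier(),
                  param_grid={'max_depth': list(range(1, 31))},
                  scoring='roc_auc', cv=5)
gs.fit(german_data, german_target)
print(gs.best_params_, gs.best_score_)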