In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
Считываем данные
In [2]:
# Load the telecom churn table from the data directory next to this notebook.
data = pd.read_csv(filepath_or_buffer='../data/telecom_churn.csv')
По минимуму анализируем и предобрабатываем
In [3]:
# Peek at the first five rows to see which columns the raw table has.
data.head(n=5)
Out[3]:
In [4]:
# Drop the 'State' and 'Voice mail plan' columns before modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because the columns are already gone.
data = data.drop(columns=['State', 'Voice mail plan'], errors='ignore')
In [5]:
# Encode the 'International plan' flag as 1/0. The identity entries (1 -> 1,
# 0 -> 0) make the cell safe to re-run: without them a second execution would
# map the already-encoded values to NaN.
data['International plan'] = data['International plan'].map(
    {'Yes': 1, 'No': 0, 1: 1, 0: 0}
)
In [6]:
# Print the pandas summary of the frame after the preprocessing cells above,
# as a quick check of the remaining columns and their dtypes.
data.info()
In [7]:
# Target vector: the 'Churn' column cast to an integer dtype.
y = data['Churn'].astype(int)
In [8]:
# Feature matrix: every remaining column except the target.
X = data.drop(columns=['Churn'])
In [9]:
# Sanity check: features and target must have the same number of rows.
(X.shape, y.shape)
Out[9]:
Выделяем отложенную выборку (30% от исходной)
In [10]:
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
In [11]:
# Hold out 30% of the rows for final validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=17
)
In [12]:
# Show the sizes of the training part and the hold-out part.
(X_train.shape, X_valid.shape)
Out[12]:
Первое дерево решений
In [13]:
# Baseline decision tree: only the seed is set, every other hyperparameter is
# left at the library default. The same estimator is reused later as the base
# model for the grid search.
first_tree = DecisionTreeClassifier(random_state=17)
In [14]:
# Mean score of the baseline tree over 5 cross-validation folds of the
# training part.
cross_val_score(first_tree, X_train, y_train, cv=5).mean()
Out[14]:
Первая модель kNN
In [15]:
from sklearn.neighbors import KNeighborsClassifier
In [16]:
# Baseline k-nearest-neighbours classifier with all default hyperparameters.
# NOTE(review): the features are passed to kNN without any scaling step in
# this notebook; distance-based models are usually sensitive to feature
# scale — confirm whether that is intended.
first_knn = KNeighborsClassifier()
In [17]:
# Mean score of the baseline kNN over 5 cross-validation folds of the
# training part.
cross_val_score(first_knn, X_train, y_train, cv=5).mean()
Out[17]:
Настраиваем max_depth и max_features для дерева
In [18]:
from sklearn.model_selection import GridSearchCV
In [19]:
# Search grid for the tree: depths 1..10 combined with three values of
# max_features given as floats.
tree_params = {
    'max_depth': np.arange(1, 11),
    'max_features': [0.5, 0.7, 1.0],
}
In [20]:
# Exhaustive 5-fold grid search over tree_params, using all available cores.
tree_grid = GridSearchCV(
    estimator=first_tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
)
In [21]:
%%time
# Run the grid search for the tree on the training part. The %%time cell magic
# (which has to stay on the first line of the cell) reports how long it took;
# the trailing ';' suppresses the repr of the fitted object.
tree_grid.fit(X_train, y_train);
Out[21]:
In [22]:
# Best cross-validated score and the parameter combination that achieved it.
(tree_grid.best_score_, tree_grid.best_params_)
Out[22]:
Настраиваем n_neighbors для kNN
In [23]:
# Search grid for kNN: n_neighbors in 5, 10, 15, 20, 25.
# Larger neighbourhoods (50..90 in steps of 10) were tried in an earlier draft
# and are intentionally left out of the grid.
knn_params = dict(n_neighbors=range(5, 30, 5))
In [24]:
# Exhaustive 5-fold grid search over knn_params for the kNN baseline.
knn_grid = GridSearchCV(
    estimator=first_knn,
    param_grid=knn_params,
    cv=5,
)
In [25]:
%%time
# Run the grid search for kNN on the training part. The %%time cell magic
# (which has to stay on the first line of the cell) reports how long it took;
# the trailing ';' suppresses the repr of the fitted object.
knn_grid.fit(X_train, y_train);
Out[25]:
In [26]:
# Best cross-validated score and the n_neighbors value that achieved it.
(knn_grid.best_score_, knn_grid.best_params_)
Out[26]:
Прогноз для отложенной выборки
In [27]:
# Predict the hold-out rows with the fitted grid-search object for the tree.
tree_valid_pred = tree_grid.predict(X=X_valid)
In [28]:
from sklearn.metrics import accuracy_score
In [29]:
# Share of hold-out rows for which the tuned tree predicted the right class.
accuracy_score(y_true=y_valid, y_pred=tree_valid_pred)
Out[29]:
Отрисовка дерева
In [30]:
from sklearn.tree import export_graphviz
In [31]:
# Shallow tree (depth 3) that is small enough to draw. random_state is fixed
# to the same seed the baseline tree in this notebook uses, so the fitted
# tree, its hold-out score and the rendered picture are reproducible between
# runs (scikit-learn shuffles candidate features at each split, so ties can
# otherwise be broken differently).
second_tree = DecisionTreeClassifier(max_depth=3, random_state=17)
second_tree.fit(X_train, y_train)
# Last expression of the cell: score on the hold-out part.
second_tree.score(X_valid, y_valid)
Out[31]:
In [32]:
# Write the shallow tree as a Graphviz .dot description, labelling nodes with
# the feature column names and colouring them (filled=True).
export_graphviz(
    decision_tree=second_tree,
    out_file='../img/telecom_tree.dot',
    feature_names=X.columns,
    filled=True,
)
In [33]:
# Render the exported Graphviz description to a PNG and then delete the
# intermediate .dot file. Done in Python rather than via `!dot` / `!rm` so the
# cell also works where `rm` is unavailable (e.g. Windows) and stops with an
# error if Graphviz fails, instead of removing the .dot file anyway.
# Still requires the Graphviz `dot` executable on PATH.
import subprocess
from pathlib import Path

dot_path = Path('../img/telecom_tree.dot')
png_path = dot_path.with_suffix('.png')
# check=True raises CalledProcessError on a non-zero exit code, so the
# unlink below only runs after a successful render.
subprocess.run(['dot', '-Tpng', str(dot_path), '-o', str(png_path)], check=True)
dot_path.unlink()