In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the preprocessed modelling DataFrame and take a first look.
# (Pickle from a trusted local pipeline output — do not load untrusted pickles.)
df = pd.read_pickle("../data/df_model01.pkl")
print(df.shape)
df.head()
Out[2]:
Binning
In [3]:
# Discretize the raw distance into 5 ordinal buckets using the 4 edges below
# (roughly: SR, LNZ, SZG, VIE, >VIE).
distance_edges = [10, 40, 120, 180]
df["binned_distance"] = np.digitize(df["distance"].to_numpy(), bins=distance_edges)
Conversion for Scikit-learn
Feature selection based on expert knowledge. Model-based selection was hardly interpretable, but it at least confirmed "binned_distance" as a relevant feature.
In [4]:
# Expert-selected features; binary target: a rating above 5 (better than
# "all the same") counts as "worth attending".
feature_names = ["buzzwordy_title", "main_topic_Daten", "binned_distance"]
X = df[feature_names].to_numpy()
y = np.where(df["rating"] > 5, 1, 0)
print("X:", X.shape, "y:", y.shape)
In [5]:
from sklearn.model_selection import train_test_split

# 50/50 split: with such a small dataset, keep a sizeable hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=23,
)
Model
In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

# LinearRegression serves as a naive benchmark; the decision tree is the
# actual candidate model.
linreg = LinearRegression()
dec_tree = DecisionTreeClassifier()
In [7]:
# Fit the linear-regression benchmark and report R^2 plus the learned
# coefficients on the hold-out set.
linreg.fit(X_train, y_train)
print(f"Score (r^2): {linreg.score(X_test, y_test):.3f}")
print(f"Coef: {linreg.coef_}")
=> Really bad performance
Parameter Tuning
In [8]:
from sklearn.model_selection import GridSearchCV

# Exhaustive hyper-parameter search with 5-fold cross-validation.
# FIX: the original passed presort=True, a parameter deprecated in
# scikit-learn 0.22 and removed in 0.24 — it now raises an error, so it is
# dropped. random_state pins the tree's tie-breaking randomness so the
# search is reproducible.
parameter_grid = {"criterion": ["gini", "entropy"],
                  "max_depth": [None, 1, 2, 3, 4, 5, 6],
                  "min_samples_leaf": list(range(1, 14)),
                  "max_leaf_nodes": list(range(3, 25))}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=23),
                           parameter_grid, cv=5)  # 5-fold cross-val
grid_search.fit(X_train, y_train)
print("Score (Accuracy): {:.3f}".format(grid_search.score(X_test, y_test)))
print("Best Estimator: {}".format(grid_search.best_estimator_))
print("Best Parameters: {}".format(grid_search.best_params_))
=> Not that good accuracy, but at least better than random draw
Build Model
In [9]:
# Final model with the tuned hyper-parameters.
# FIX: presort=True removed (deprecated in scikit-learn 0.22, removed in
# 0.24); random_state added so the fitted tree is reproducible.
model = DecisionTreeClassifier(criterion="gini", max_depth=None,
                               min_samples_leaf=2, max_leaf_nodes=5,
                               random_state=23)
model.fit(X_train, y_train)
print("Score (Accuracy): {:.3f}".format(model.score(X_test, y_test)))
Print Decision Tree
In [10]:
from sklearn.tree import export_graphviz

# Dump the fitted tree as a Graphviz .dot file for later rendering.
export_graphviz(
    model,
    out_file="plots/dectree_Model_best.dot",
    feature_names=feature_names,
    class_names=True,
    label="root",
    impurity=False,
    proportion=True,
    rounded=True,
    filled=True,
)
Evaluation Scores
In [11]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the hold-out set.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
=> Not so good at predicting class 1 ("worth attending"), better at predicting class 0. Weighted average of precision and recall is 0.62 (F1 score).
ROC Curve
In [12]:
from sklearn.metrics import roc_curve, roc_auc_score

# FIX: build the ROC from predicted class-1 probabilities, not from hard
# 0/1 predictions — with hard labels the curve collapses to a single
# threshold point and the AUC is underestimated.
y_score = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label="ROC")
plt.plot([0, 1], [0, 1], c="r", label="Chance level")  # explicit diagonal
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("Recall")
plt.legend(loc=4)
plt.savefig("plots/ROC_Curve_Model.png", dpi=180)
plt.show()
print("AUC: {:.3f}".format(roc_auc_score(y_test, y_score)))
=> Indeed only slightly better than a random guess — AUC only ~0.05 above chance level (0.5)
In [13]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the tuned tree on the full dataset to gauge
# generalization. FIX: renamed from `scores_benchmark` — this evaluates
# `model` (the decision tree), not the linear-regression benchmark.
scores_model = cross_val_score(model, X, y, cv=5)  # 5 folds
print("Cross-Val Scores (Accuracy): {}".format(scores_model))
print("Cross-Val Mean (Accuracy): {}".format(scores_model.mean()))
=> Generalization acceptable, but not good (as expected from a Decision Tree)
In [14]:
# Persist the trained model for reuse.
# FIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23 — import joblib directly instead (it ships alongside
# scikit-learn).
import joblib

file_path = "../data/model_trained.pkl"
joblib.dump(model, file_path)
Out[14]:
In [ ]: