In [22]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
%matplotlib inline
In [80]:
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
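plt.savefig will fail if the images/classification folder does not exist yet. A small setup cell like the one below (assuming the directory layout implied by save_fig) creates it up front:
In [ ]:
os.makedirs(os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID), exist_ok=True)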
In [2]:
# load the data first
raw_data = pd.read_csv("/Users/weilu/Research/data/training_data/training_set.csv")
In [7]:
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data.csv")
In [90]:
raw_test_data.groupby("Name").describe().stack()
Out[90]:
In [13]:
FEATURES = ["Rw", "VTotal", "QGO"]
LABEL = ["Good"]
In [16]:
def normalize(x):
    return (x - x.mean()) / x.std()
In [19]:
X_train = raw_data[FEATURES].transform(normalize)
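The column-wise z-scoring above can also be done with scikit-learn's StandardScaler; the sketch below is equivalent up to the degrees-of-freedom convention (pandas' std() divides by n-1, StandardScaler by n) and has the advantage that the fitted scaler can be reused on new data:
In [ ]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()  # learns per-column mean and std from the training features
X_train_scaled = scaler.fit_transform(raw_data[FEATURES])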
In [93]:
X_test = raw_test_data[FEATURES+["Name"]].groupby("Name").transform(normalize)
In [98]:
Y_test = pd.Series(raw_test_data["Qw"] > 0.7)  # test labels: Qw > 0.7 counts as good
In [27]:
raw_data["Qw"].hist()
Out[27]:
In [51]:
Y = pd.Series(raw_data["Qw"] > 0.7)  # training labels: Qw > 0.7 counts as good
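Before training, it is worth checking how balanced the two classes are, since plain accuracy is misleading on a skewed label. A quick look, using only the Y just defined:
In [ ]:
Y.value_counts(normalize=True)  # fraction of good (True) vs. bad (False) samples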
In [52]:
import numpy as np
# For illustration only. Sklearn has train_test_split()
def split_train_test(data, y, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], y.iloc[train_indices], data.iloc[test_indices], y.iloc[test_indices]
In [53]:
train_set, train_y, test_set, test_y = split_train_test(X_train, Y, 0.2)
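As the comment above notes, scikit-learn ships an equivalent helper. A minimal sketch of the same 80/20 split using train_test_split, with a fixed random_state for reproducibility:
In [ ]:
from sklearn.model_selection import train_test_split

train_set, test_set, train_y, test_y = train_test_split(
    X_train, Y, test_size=0.2, random_state=42)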
In [59]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(train_set.values, train_y.values)
Out[59]:
In [60]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, train_set.values, train_y.values, cv=3, scoring="accuracy")
Out[60]:
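Cross-validated accuracy can look deceptively good when one class dominates. A useful baseline is a classifier that always predicts the majority class; a sketch of that sanity check using sklearn's DummyClassifier (an assumption about how one might frame the comparison):
In [ ]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")  # always predicts the majority class
cross_val_score(dummy_clf, train_set.values, train_y.values, cv=3, scoring="accuracy")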
In [62]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, train_set.values, train_y.values, cv=3)
In [64]:
from sklearn.metrics import confusion_matrix
confusion_matrix(train_y.values, y_train_pred)
Out[64]:
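In the confusion matrix, rows are the true classes and columns the predicted classes, so precision and recall can be read straight from its entries. A small sketch deriving them from the matrix returned above:
In [ ]:
cm = confusion_matrix(train_y.values, y_train_pred)
tn, fp, fn, tp = cm.ravel()     # binary layout: [[TN, FP], [FN, TP]]
precision = tp / (tp + fp)      # of everything predicted good, how much really is
recall = tp / (tp + fn)         # of everything really good, how much was caught
precision, recall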
In [99]:
y_scores = cross_val_predict(sgd_clf, X_test.values, Y_test.values, cv=3,
                             method="decision_function")
In [100]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(Y_test.values, y_scores)
In [103]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-10, 10])
save_fig("precision_recall_vs_threshold_plot")
plt.show()
In [65]:
y_scores = cross_val_predict(sgd_clf, train_set.values, train_y.values, cv=3,
                             method="decision_function")
In [67]:
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(train_y.values, y_scores)
In [73]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-30, 30])
plt.show()
In [74]:
y_train_pred_90 = (y_scores > 6)  # threshold picked from the precision/recall-vs-threshold plot above
In [76]:
from sklearn.metrics import precision_score, recall_score
precision_score(train_y.values, y_train_pred_90)
Out[76]:
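Raising the decision threshold trades recall for precision, so it is worth checking the recall at the same cutoff (recall_score is already imported above):
In [ ]:
recall_score(train_y.values, y_train_pred_90)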
In [77]:
def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
plt.show()
In [78]:
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(train_y.values, y_scores)
In [81]:
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
save_fig("roc_curve_plot")
plt.show()
In [82]:
from sklearn.metrics import roc_auc_score
roc_auc_score(train_y.values, y_scores)
Out[82]:
In [108]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42, class_weight={0:.1, 1:.9})
y_probas_forest = cross_val_predict(forest_clf, train_set.values, train_y.values, cv=3,
                                    method="predict_proba")
y_scores_forest = y_probas_forest[:, 1]  # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(train_y.values, y_scores_forest)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
save_fig("roc_curve_comparison_plot")
plt.show()
In [83]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, train_set.values, train_y.values, cv=3,
                                    method="predict_proba")
In [84]:
y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(train_y.values, y_scores_forest)
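For a single-number comparison with the SGD classifier, the forest's out-of-fold probabilities can be scored with the same ROC AUC metric used above:
In [ ]:
roc_auc_score(train_y.values, y_scores_forest)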
In [87]:
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
save_fig("roc_curve_comparison_plot")
plt.show()
In [86]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rnd_clf.fit(train_set.values, train_y.values)
for name, score in zip(["Rw", "VTotal", "QGO"], rnd_clf.feature_importances_):
    print(name, score)
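Since the per-target normalized test set (X_test, Y_test) was prepared above, a final check is to evaluate the fitted forest on it. A hedged sketch, assuming X_test keeps the same Rw/VTotal/QGO column order as the training features:
In [ ]:
from sklearn.metrics import roc_auc_score

test_probas = rnd_clf.predict_proba(X_test[FEATURES].values)[:, 1]  # proba of the positive class
roc_auc_score(Y_test.values, test_probas)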