In [1]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import seaborn as sns
%matplotlib inline
In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
iris = pd.read_csv(url, names=["SepalLength","SepalWidth","PetalLength","PetalWidth","Species"])
iris.sample(10)
Out[2]:
In [3]:
iris.Species.value_counts()
Out[3]:
The distribution of the three classes (Species) in this problem is equal, so accuracy is a good measure of model performance.
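As a quick sanity check (a minimal sketch), the majority-class baseline accuracy is only about one third here, so any model scoring well above that reflects real signal:
# Hedged check: accuracy of always predicting the most frequent class
baseline = iris.Species.value_counts(normalize=True).max()
print("Majority-class baseline accuracy:", round(baseline, 3))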
In [4]:
iris.info()
In [5]:
species = iris.Species.unique()
colors = sns.color_palette("hls", 3)
for i, v in enumerate(species):
    df = iris[iris.Species == v]
    plt.scatter(df["PetalLength"], df["SepalLength"], color=colors[i], label=v)
plt.legend(loc = "upper left")
plt.xlabel("Petal Length")
plt.ylabel("Sepal Length")
Out[5]:
In [6]:
y = np.where(iris.Species == "Iris-virginica", 1, 0)
In [7]:
p = np.linspace(-7, 7, 100)
def phi(p):
    return 1 / (1 + np.exp(-p))
plt.plot(p, phi(p))
plt.xlabel("Linear regression output")
plt.ylabel("Sigmoid")
plt.title("Representation of Probability of prediction of 1")
Out[7]:
In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from mlxtend.plotting import plot_decision_regions
In [9]:
X = iris.iloc[:, [2, 0]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 340)
X_train.shape
Out[9]:
In [10]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
outcome = pd.DataFrame({"actual": y_test,"pred": y_test_pred})
outcome["match"] = outcome.actual == outcome.pred
outcome.sample(10)
Out[10]:
In [11]:
accuracy_score(y_test, y_test_pred)
Out[11]:
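Tying this back to the sigmoid plot above: the fitted model's class-1 probabilities should equal the sigmoid of its linear decision function. A minimal check, reusing lr and X_test from the cells above:
# Hedged check: predict_proba for class 1 equals sigmoid(w.x + b)
probs = lr.predict_proba(X_test)[:, 1]
linear = lr.decision_function(X_test)
print(np.allclose(probs, 1 / (1 + np.exp(-linear))))  # expected: True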
In [12]:
plt.figure(figsize=(8, 6))
plot_decision_regions(X, y, lr, X_highlight = X_test)
plt.xlabel("Petal Length")
plt.ylabel("Sepal Length")
plt.legend(loc = "upper left")
Out[12]:
In [13]:
confusion_matrix(y_test, y_test_pred)
Out[13]:
In [14]:
accuracy_score(y_test, y_test_pred)
Out[14]:
By default, an observation with positive-class probability > 0.5 is predicted as 1, otherwise 0. What if we want to change the probability threshold?
In [15]:
y_test_prob = lr.predict_proba(X_test)[:, 1]
y_test_pred_new = np.where(y_test_prob > 0.8, 1, 0)
print("Accuracy: ", accuracy_score(y_test, y_test_pred_new))
confusion_matrix(y_test, y_test_pred_new)
Out[15]:
So we can observe that as we vary the threshold, the accuracy score varies too. But how do we decide on a threshold?
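To make this concrete, a small sketch (reusing y_test and y_test_prob from above) that evaluates accuracy at a few candidate thresholds:
# Hedged sketch: accuracy at a handful of candidate thresholds
for t in [0.2, 0.5, 0.8]:
    preds = np.where(y_test_prob > t, 1, 0)
    print("threshold =", t, " accuracy =", round(accuracy_score(y_test, preds), 3))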
In [16]:
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob)
plt.plot(fpr, tpr, linewidth = 2)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.plot([0,1], [0,1], ls = "--", color = "k")
plt.xlabel("False Postive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
Out[16]:
In [17]:
roc_auc_score(y_test, y_test_prob)
Out[17]:
In [18]:
from sklearn.model_selection import cross_val_score
In [19]:
scores = cross_val_score(cv=5, scoring="accuracy", estimator=lr, X=X_train, y= y_train)
scores.mean(), scores.std()
Out[19]:
Now, let's use all available features to predict the class.
In [20]:
X = iris.iloc[:, :-1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
print(X_train.shape)
lr = LogisticRegression(C = 10)
scores = cross_val_score(cv=5, scoring="accuracy", estimator=lr, X=X_train, y= y_train)
scores.mean(), scores.std()
Out[20]:
In [21]:
params = 10 ** np.linspace(-5, 5, 100)
means, stds = [], []
coefs = []
for p in params:
    lr = LogisticRegression(C=p)
    scores = cross_val_score(cv=5, scoring="accuracy", estimator=lr, X=X_train, y=y_train)
    means.append(scores.mean())
    stds.append(scores.std())
    lr.fit(X_train, y_train)
    coefs.append(lr.coef_[0])
means = np.array(means)
stds = np.array(stds)
In [22]:
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(params, means)
plt.fill_between(params, means + stds, means - stds, alpha = 0.2)
plt.xscale("log")
plt.xlabel("C")
plt.ylabel("accuracy")
plt.title("Impact of complexity parameter (C)\n on accuracy score")
plt.subplot(1, 2, 2)
plt.plot(params, coefs)
plt.xlabel("C")
plt.ylabel("Coefficient")
plt.xscale("log")
plt.title("Impact of complexity parameter (C)\n on feature coefficients")
plt.tight_layout()
In [23]:
lr = LogisticRegression(C = 10)
lr.fit(X_train, y_train)
lr.coef_[0]
Out[23]:
In [24]:
lr = LogisticRegression(C = 1e-4)
lr.fit(X_train, y_train)
lr.coef_[0]
Out[24]:
Find which parameters are available to tune.
In [25]:
LogisticRegression().get_params()
Out[25]:
In [26]:
from sklearn.model_selection import GridSearchCV
In [27]:
param_grid ={"C": 10 ** np.linspace(-5, 5, 100)}
gs = GridSearchCV(cv=5, estimator = lr, scoring="accuracy", param_grid= param_grid)
gs.fit(X_train, y_train)
Out[27]:
In [28]:
best = gs.best_estimator_
print("Best estimator score: ", best.score(X_test, y_test))
print(best.coef_[0])
In [29]:
gs.best_params_
Out[29]:
In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from mlxtend.plotting.decision_regions import plot_decision_regions
from sklearn.pipeline import Pipeline
y = np.where(iris.Species == "Iris-versicolor", 1, 0)
X = iris.loc[:, ["PetalLength","SepalLength"]].values
print(X.shape, y.shape)
lr = LogisticRegression()
poly = PolynomialFeatures(degree=2)
pipeline = Pipeline([("poly", poly), ("lr", lr)])
pipeline.fit(X, y)
plot_decision_regions(X, y, pipeline)
plt.xlabel("PetalLength")
plt.ylabel("SepalLength")
print("Acuracy score: ", lr.score(poly.transform(X), y))
In [31]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
In [32]:
X = iris.iloc[:, 0:4].values
y = iris.Species.values
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
pd.DataFrame(X_std).head()
Out[32]:
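As a quick check (a minimal sketch), each standardized column should now have mean ≈ 0 and standard deviation ≈ 1:
# Standardized features: mean ~0 and std ~1 per column
pd.DataFrame(X_std).describe().loc[["mean", "std"]]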
In [33]:
le = LabelEncoder()
y = le.fit_transform(y)
In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.3, random_state = 100)
In [35]:
lr = LogisticRegression(max_iter=100, random_state=100)
param_grid = [
    {"C": 10 ** np.linspace(-5, 5, 100)}
]
gs = GridSearchCV(cv=5, estimator = lr, scoring="accuracy", param_grid= param_grid)
gs.fit(X_train, y_train)
gs.best_params_
Out[35]:
In [36]:
lr = gs.best_estimator_
lr.intercept_, lr.coef_
Out[36]:
In [37]:
coeffs = pd.DataFrame(np.hstack([lr.intercept_.reshape(-1, 1), lr.coef_]))
coeffs.columns = ["intercept", *iris.columns[0:4]]
coeffs
Out[37]:
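Each row of coeffs corresponds to one class; a small sketch that labels the rows using the label encoder fitted above:
# Hedged sketch: label each coefficient row with the class it represents
coeffs.index = le.classes_
coeffs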
In [38]:
accuracy_score(y_test, lr.predict(X_test))
Out[38]:
In [39]:
from sklearn.tree import DecisionTreeClassifier
In [40]:
X = iris.iloc[:, [2,0]].values
y = iris.Species.values
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
pd.DataFrame(X_std).head()
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 100)
print(X_train.shape)
In [41]:
tree = DecisionTreeClassifier(max_depth=4)
tree.fit(X_train, y_train)
Out[41]:
In [42]:
plot_decision_regions(X, y, tree)
plt.xlabel("Petal Length")
plt.ylabel("Sepal Length")
plt.legend(loc = "upper left")
Out[42]:
In [43]:
tree.score(X_test, y_test)
Out[43]:
In [44]:
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file = "tree.dot", feature_names = ["PetalLength","SepalLength"])
Convert the .dot file into a PNG.
In [45]:
!dot -Tpng tree.dot -o tree.png
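If Graphviz is not installed, recent scikit-learn versions (0.21+) can also render the fitted tree directly with matplotlib; a minimal sketch:
# Hedged alternative: render the tree without Graphviz
from sklearn.tree import plot_tree
plt.figure(figsize=(12, 6))
plot_tree(tree, feature_names=["PetalLength", "SepalLength"], filled=True)
plt.show()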
We used only two features for model training because we wanted to plot the decision region. Let's retrain the model using all features and compare its performance with logistic regression.
In [46]:
X = iris.iloc[:, 0:4].values
y = iris.Species.values
X_std = scaler.fit_transform(X)
tree = DecisionTreeClassifier(max_depth=4)
mean_cv_accuracy = np.mean(cross_val_score(cv = 5, estimator=tree, X=X_std, y=y))
print("Mean accuracy using all features over full dataset: ", mean_cv_accuracy)
This is higher than the accuracy of the tuned logistic regression model (95.56%). We can tune max_depth and other tree parameters to improve the model further.
In [47]:
tuning_grid = {"max_depth": np.arange(1, 10)}
tree = DecisionTreeClassifier()
grid_search = GridSearchCV(cv=5, estimator=tree, param_grid=tuning_grid, scoring="accuracy")
grid_search.fit(X_std, y)
grid_search.best_score_, grid_search.best_params_
Out[47]:
After tuning we got a better result: 0.9733.
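To see how max_depth drove the cross-validated accuracy, the fitted grid search can be inspected; a minimal sketch using grid_search.cv_results_:
# Hedged sketch: mean CV accuracy for each candidate max_depth
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results[["param_max_depth", "mean_test_score"]]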
In [48]:
X = iris.iloc[:, [2,0]].values
y = iris.Species.values
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
pd.DataFrame(X_std).head()
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.30, random_state = 100)
print(X_train.shape)
In [49]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(max_depth=4, random_state=123)
forest.fit(X_train, y_train)
print("Accuracy:", forest.score(X_test, y_test))
plot_decision_regions(X_std, y, forest)
plt.xlabel("Petal Length (standarized)")
plt.ylabel("Sepal Length (standarized)")
plt.legend(loc = "upper left")
Out[49]:
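Random forests also expose per-feature importances; a quick sketch for the two features used here:
# Hedged sketch: feature importances from the fitted forest
for name, imp in zip(["PetalLength", "SepalLength"], forest.feature_importances_):
    print(name, round(imp, 3))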
In [50]:
from sklearn.svm import SVC
In [51]:
X = iris.iloc[:, [2,0]].values
y = iris.Species.values
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
pd.DataFrame(X_std).head()
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.30, random_state = 100)
print(X_train.shape)
In [52]:
svc = SVC(gamma=1, C =1, kernel="rbf", random_state=345)
svc.fit(X_train, y_train)
print("accuracy", svc.score(X_test, y_test))
plot_decision_regions(X_std, y, svc)
plt.xlabel("Petal Length")
plt.ylabel("Sepal Length")
plt.legend(loc = "upper left")
plt.title("SVM classifier using gamma = 1")
Out[52]:
In [53]:
svc = SVC(gamma=10, C = 1, kernel="rbf", random_state=345)
svc.fit(X_train, y_train)
print("accuracy", svc.score(X_test, y_test))
plot_decision_regions(X_std, y, svc)
plt.xlabel("Petal Length")
plt.ylabel("Sepal Length")
plt.legend(loc = "upper left")
plt.title("SVM classifier using gamma = 10, C = 1")
Out[53]:
In [54]:
svc = SVC(gamma=1, C = 10, kernel="rbf", random_state=345)
svc.fit(X_train, y_train)
print("accuracy", svc.score(X_test, y_test))
plot_decision_regions(X_std, y, svc)
plt.xlabel("Petal Length")
plt.ylabel("Sepal Length")
plt.legend(loc = "upper left")
plt.title("SVM classifier using gamma = 1, C = 10")
Out[54]:
Above we see the impact of different gamma and C values. The higher the gamma or C value, the smaller each observation's territory of influence becomes, which tends to produce a more overfit model. Still, the accuracy score already matches some of the best we have seen so far. Let's try a couple of other kernels as well, then tune the model over a parameter grid (a sketch using all features follows the grid search below).
In [55]:
svc = SVC(C=10, kernel="linear", random_state=345)
svc.fit(X_train, y_train)
print("accuracy", svc.score(X_test, y_test))
plot_decision_regions(X_std, y, svc)
plt.xlabel("Petal Length")
plt.ylabel("Sepal Length")
plt.legend(loc = "upper left")
plt.title("SVM classifier using gamma = 1")
Out[55]:
In [56]:
svc = SVC(C=10, kernel="poly", degree=2, random_state=345)
svc.fit(X_train, y_train)
print("accuracy", svc.score(X_test, y_test))
plot_decision_regions(X_std, y, svc)
plt.xlabel("Petal Length")
plt.ylabel("Sepal Length")
plt.legend(loc = "upper left")
plt.title("SVM classifier using gamma = 1")
Out[56]:
In [57]:
SVC().get_params()
Out[57]:
In [58]:
%%time
param_grid = {
    "C": 10 ** np.linspace(-2, 2, 10),
    "gamma": 10 ** np.linspace(-1, 2, 10),
    "kernel": ["linear", "rbf", "sigmoid"]
}
grid_search = GridSearchCV(cv=5, estimator=SVC(),
                           param_grid=param_grid, scoring="accuracy", verbose=True)
grid_search.fit(X_std, y)
print("Best score (CV): ", grid_search.best_score_,
      "\nBest parameters: ", grid_search.best_params_)
In [59]:
np.set_printoptions(suppress=True)
10 ** np.linspace(-5, 5, 10)
Out[59]:
In [60]:
credit = pd.read_csv("https://raw.githubusercontent.com/abulbasar/data/master/credit-default.csv")
credit.head()
Out[60]:
In [61]:
credit.info()
In [62]:
credit.default.value_counts().plot.bar()
Out[62]:
In [63]:
categorical_columns = credit.select_dtypes(["O"]).columns
categorical_columns
Out[63]:
In [64]:
default_status = credit.default
del credit["default"]
In [65]:
credit_dummied = pd.get_dummies(credit, columns=categorical_columns, drop_first=True)
credit_dummied.info()
In [66]:
scaler = StandardScaler()
X = scaler.fit_transform(credit_dummied.values.astype(np.float64))
y = np.where(default_status == 1.0, 1, 0)
In [67]:
lr = LogisticRegression(C = 1, random_state=100)
# Split the credit data before fitting (the earlier X_train/X_test came from the iris data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)
Out[67]:
In [68]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=123)
tree.get_params()
Out[68]:
Find the best parameters (consider max_depth) for the decision tree model to solve the credit default problem.
In [69]:
from sklearn.model_selection import GridSearchCV
tuning_grid = {
    "max_depth": range(1, 10),
    "max_leaf_nodes": range(2, 20)
}
gs = GridSearchCV(cv=5, estimator=tree,
                  param_grid=tuning_grid, scoring="accuracy", verbose=True)
gs.fit(X, y)
print("Best params: ", gs.best_params_, "Best score: ", gs.best_score_)
In [ ]: