In [1]:
%matplotlib inline
import numpy as np
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import patsy
import statsmodels.api as sm
from sklearn import tree, linear_model, metrics, dummy, naive_bayes, neighbors
from IPython.display import Image
import pydotplus
In [2]:
sns.set_context("paper")
sns.set_style("ticks")
def load_arff(filename):
    """Load an ARFF file into a DataFrame, mapping ARFF types to pandas dtypes."""
    data, meta = arff.loadarff(filename)
    df = pd.DataFrame(data, columns=meta.names())
    for c, k in zip(df.columns, meta.types()):
        if k == "nominal":
            df[c] = df[c].astype("category")
        elif k == "numeric":
            df[c] = df[c].astype("float")
    return df
def get_confusion_matrix(clf, X, y, verbose=True):
    """Return (and optionally print) the classification report and confusion matrix."""
    y_pred = clf.predict(X)
    cm = metrics.confusion_matrix(y_true=y, y_pred=y_pred)
    clf_report = metrics.classification_report(y, y_pred)
    df_cm = pd.DataFrame(cm, columns=clf.classes_, index=clf.classes_)
    if verbose:
        print(clf_report)
        print(df_cm)
    return clf_report, df_cm
def show_decision_tree(clf, X, y):
    """Render a fitted decision tree inline as a PNG via graphviz."""
    # class_names must follow the order of clf.classes_, which y.unique()
    # does not guarantee.
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=X.columns,
                                    class_names=[str(c) for c in clf.classes_],
                                    filled=True, rounded=True,
                                    special_characters=True, impurity=False)
    graph = pydotplus.graph_from_dot_data(dot_data)
    return Image(graph.create_png())
def plot_decision_regions(clf, X, y, col_x=0, col_y=1,
                          ax=None, plot_step=0.01, colors="bry"):
    """Plot a classifier's decision regions over two feature columns."""
    if ax is None:
        fig, ax = plt.subplots()
    x_min, x_max = X[col_x].min(), X[col_x].max()
    y_min, y_max = X[col_y].min(), X[col_y].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Map the (possibly string) class labels to integer codes for contourf.
    b, Z = np.unique(Z, return_inverse=True)
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    for i, l in enumerate(clf.classes_):
        # Boolean masking replaces the deprecated .ix indexer.
        idx = y == l
        ax.scatter(X.loc[idx, col_x], X.loc[idx, col_y], label=l, c=colors[i])
    ax.set_xlabel(col_x)
    ax.set_ylabel(col_y)
    ax.legend(bbox_to_anchor=(1.2, 0.5))
    ax.figure.tight_layout()  # fig is undefined when a pre-existing axis is passed in
    return ax
In [3]:
df = load_arff("../data/iris.arff")
print(df.shape)
df.head()
Out[3]:
In [4]:
df.dtypes
Out[4]:
In [5]:
df_t = df.copy() ## Since we are going to edit the data, we should always work on a copy
In [6]:
df_t.head()
Out[6]:
In [7]:
df_t["sepallength_sqr"] = df_t["sepallength"]**2 ## ** in python is used for exponent.
df_t.head()
Out[7]:
In [8]:
df_t["sepallength_log"] = np.log10(df_t["sepallength"])
df_t.head()
Out[8]:
In [9]:
df_t = df_t.rename(columns={"class": "label"})
df_t.head()
Out[9]:
In [13]:
y, X = patsy.dmatrices("label ~ petalwidth + petallength:petalwidth + I(sepallength**2)-1", data=df_t, return_type="dataframe")
print(y.shape, X.shape)
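In this formula, petallength:petalwidth is an interaction term (the elementwise product of the two columns), I(sepallength**2) wraps the expression in patsy's identity function so that ** is treated as Python arithmetic rather than formula syntax, and -1 suppresses the intercept column. A quick sanity check of the interaction term (a sketch; the column name assumes patsy's default naming for interactions):
In [ ]:
# The interaction column should equal the elementwise product of the
# two raw columns.
np.allclose(X["petallength:petalwidth"],
            df_t["petallength"] * df_t["petalwidth"])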
In [14]:
y.head()
Out[14]:
In [15]:
X.head()
Out[15]:
In [16]:
model = sm.MNLogit(y, X)
res = model.fit()
res.summary()
Out[16]:
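The fitted model also exposes per-observation class probabilities through res.predict; taking the argmax across columns recovers the predicted class index. A minimal sketch:
In [ ]:
# Fitted class probabilities, one column per class (same order as the
# columns of y); argmax along axis 1 gives the predicted class index.
probs = np.asarray(res.predict(X))
probs[:5].round(3)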
In [18]:
model_sk = linear_model.LogisticRegression(multi_class="multinomial", solver="lbfgs")
model_sk.fit(X, df_t["label"])
Out[18]:
In [19]:
y_pred = model_sk.predict(X)
In [20]:
y_pred[:10]
Out[20]:
In [21]:
print(metrics.classification_report(df_t["label"], y_pred))
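The per-class precision, recall, and F1 values in the report all derive from the same training-set predictions; the overall accuracy can be sanity-checked directly:
In [ ]:
# Overall training accuracy; should agree with the report above.
metrics.accuracy_score(df_t["label"], y_pred)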
In [22]:
model_sk_t = tree.DecisionTreeClassifier()
In [24]:
model_sk_t.fit(X, df_t["label"])
Out[24]:
In [25]:
show_decision_tree(model_sk_t, X, df_t["label"])
Out[25]:
In [26]:
model_0r = dummy.DummyClassifier(strategy="most_frequent")
model_0r.fit(X, df_t["label"])
y_pred = model_0r.predict(X)
print(metrics.classification_report(df_t["label"], y_pred))
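Since the 0R baseline always predicts the majority class, its accuracy equals that class's share of the data, roughly 1/3 on the balanced iris set:
In [ ]:
# Class shares; the 0R accuracy should match the largest value here.
df_t["label"].value_counts(normalize=True)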
In [27]:
cm = metrics.confusion_matrix(y_true=df_t["label"], y_pred=y_pred)
In [28]:
df_cm = pd.DataFrame(cm, columns=model_0r.classes_, index=model_0r.classes_)
In [29]:
df_cm
Out[29]:
In [30]:
_ = get_confusion_matrix(model_0r, X, df_t["label"])
In [31]:
_ = get_confusion_matrix(model_sk_t, X, df_t["label"])
In [32]:
_ = get_confusion_matrix(model_sk, X, df_t["label"])
In [33]:
y, X = patsy.dmatrices("label ~ petalwidth + petallength - 1", data=df_t, return_type="dataframe")
# The -1 term tells patsy not to generate an intercept column
In [34]:
X.columns
Out[34]:
In [35]:
y = df_t["label"]
In [36]:
clf = tree.DecisionTreeClassifier()
clf.fit(X, y)
_ = get_confusion_matrix(clf, X, y)
In [37]:
clf.feature_importances_
Out[37]:
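Pairing the importances with the column names makes them easier to read (a small convenience, not part of the original workflow):
In [ ]:
# Feature importances labelled by column name, largest first.
pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)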
In [38]:
show_decision_tree(clf, X, y)
Out[38]:
In [39]:
X.head()
Out[39]:
In [40]:
y.value_counts()
Out[40]:
In [42]:
plot_decision_regions(clf, X, y, col_x="petalwidth", col_y="petallength")
Out[42]:
In [43]:
clf = naive_bayes.GaussianNB()
clf.fit(X, y)
_ = get_confusion_matrix(clf, X, y)
The decision surface of the Naive Bayes classifier will not show overlapping colors here, because the simple plotting code above colors each grid point by its single most likely class. Better code could blend colors according to the predicted class probabilities; one possible sketch follows.
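A sketch of such a plot, shading each grid point by the classifier's predict_proba output; mapping the three class probabilities straight to RGB channels is a shortcut that only works because iris has exactly three classes. The helper name is hypothetical:
In [ ]:
def plot_probability_regions(clf, X, col_x, col_y, plot_step=0.01):
    # Hypothetical helper: shade the feature plane by predicted class
    # probabilities so that mixed regions appear as blended colors.
    xx, yy = np.meshgrid(
        np.arange(X[col_x].min(), X[col_x].max(), plot_step),
        np.arange(X[col_y].min(), X[col_y].max(), plot_step))
    probs = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    rgb = probs.reshape(xx.shape + (3,))  # three classes -> RGB channels
    fig, ax = plt.subplots()
    ax.imshow(rgb, origin="lower", aspect="auto",
              extent=(xx.min(), xx.max(), yy.min(), yy.max()))
    ax.set_xlabel(col_x)
    ax.set_ylabel(col_y)
    return ax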
In [44]:
plot_decision_regions(clf, X, y, col_x="petalwidth", col_y="petallength")
Out[44]:
In [45]:
clf = linear_model.LogisticRegression(multi_class="multinomial", solver="lbfgs")
clf.fit(X, y)
_ = get_confusion_matrix(clf, X, y)
In [46]:
plot_decision_regions(clf, X, y, col_x="petalwidth", col_y="petallength")
Out[46]:
In [47]:
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
_ = get_confusion_matrix(clf, X, y)
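With n_neighbors=1 every training point is its own nearest neighbor, so the perfect training-set report above is expected by construction. A held-out split gives a more honest estimate; a minimal sketch (train_test_split lives in sklearn.model_selection in scikit-learn >= 0.18):
In [ ]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)
clf_split = neighbors.KNeighborsClassifier(n_neighbors=1)
clf_split.fit(X_train, y_train)
print(metrics.classification_report(y_test, clf_split.predict(X_test)))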
In [48]:
plot_decision_regions(clf, X, y, col_x="petalwidth", col_y="petallength")
Out[48]:
In [ ]: