In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
# NOTE(review): hardcoded absolute path — assumes the iris CSV is mounted at
# /data; confirm (or make it a configurable DATA_DIR) before re-running elsewhere.
df = pd.read_csv("/data/iris.csv")
# Peek at the first rows to confirm the expected columns loaded
# (Id, sepal/petal measurements in cm, Species label).
df.head()


Out[2]:
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
4 5 5.0 3.6 1.4 0.2 Iris-setosa

In [3]:
# Restrict to two features so every model's decision boundary can be drawn in 2-D.
features = ["SepalLengthCm", "PetalLengthCm"]

In [4]:
# Class balance check: the dataset is evenly split, 50 rows per species.
df.Species.value_counts()


Out[4]:
Iris-versicolor    50
Iris-virginica     50
Iris-setosa        50
Name: Species, dtype: int64

In [5]:
# One scatter series per species, all on a shared axis, so the class
# structure in the chosen 2-D feature space is visible at a glance.
fig, ax = plt.subplots()
colors = ["red", "green", "blue"]

for species, color in zip(df.Species.unique(), colors):
    subset = df[df.Species == species]
    subset.plot.scatter(features[0], features[1], label=species,
                        ax=ax, color=color)



In [6]:
# Explicit submodule imports instead of `from sklearn import *`: the wildcard
# pollutes the namespace and hides where names like `pipeline`, `metrics` and
# `model_selection` come from.  Only the submodules used below are imported.
from sklearn import (ensemble, linear_model, metrics, model_selection,
                     neighbors, pipeline, preprocessing, tree)
from mlxtend.plotting import plot_decision_regions

In [7]:
# Binary task: setosa (1) vs. everything else (0).
y = np.where(df.Species == "Iris-setosa", 1, 0)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

# Standardize both features, then fit a plain (linear) logistic regression.
pipe = pipeline.Pipeline([
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LogisticRegression(random_state=1, solver="lbfgs")),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # P(setosa) per test row

# Held-out metrics; setosa is linearly separable here, so all come out 1.0.
for label, scorer in [("accuracy", metrics.accuracy_score),
                      ("precision", metrics.precision_score),
                      ("recall", metrics.recall_score),
                      ("f1_score", metrics.f1_score)]:
    print(label + ":", scorer(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 1.0
precision: 1.0
recall: 1.0
f1_score: 1.0
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1eb177f0>

In [8]:
# Binary task: virginica (1) vs. the other two species (0).
y = np.where(df.Species == "Iris-virginica", 1, 0)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

# Same linear pipeline as the setosa run: scale, then logistic regression.
pipe = pipeline.Pipeline([
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LogisticRegression(random_state=1, solver="lbfgs")),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # P(virginica) per test row

scorers = (("accuracy", metrics.accuracy_score),
           ("precision", metrics.precision_score),
           ("recall", metrics.recall_score),
           ("f1_score", metrics.f1_score))
for name, fn in scorers:
    print(name + ":", fn(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 0.9777777777777777
precision: 0.9285714285714286
recall: 1.0
f1_score: 0.962962962962963
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1eb175c0>

In [9]:
# Binary task: versicolor (1) vs. rest (0), still with a purely linear model.
# Versicolor lies *between* the other two species in this feature space, so a
# single linear boundary cannot carve it out — expect the model to predict no
# positives, which triggers sklearn's ill-defined precision/F1 warnings below.
y = np.where(df.Species == "Iris-versicolor", 1, 0)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

pipe = pipeline.Pipeline([
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LogisticRegression(random_state=1, solver="lbfgs")),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # P(versicolor) per test row

for label, scorer in [("accuracy", metrics.accuracy_score),
                      ("precision", metrics.precision_score),
                      ("recall", metrics.recall_score),
                      ("f1_score", metrics.f1_score)]:
    print(label + ":", scorer(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 0.6
precision: 0.0
recall: 0.0
f1_score: 0.0
/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aa0dd68>

In [10]:
# Versicolor again, but with degree-4 polynomial features so the logistic
# regression can learn a curved boundary around the "middle" class.
y = np.where(df.Species == "Iris-versicolor", 1, 0)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=4, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LogisticRegression(random_state=1, solver="lbfgs")),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # P(versicolor) per test row

for label, scorer in [("accuracy", metrics.accuracy_score),
                      ("precision", metrics.precision_score),
                      ("recall", metrics.recall_score),
                      ("f1_score", metrics.f1_score)]:
    print(label + ":", scorer(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 0.9333333333333333
precision: 0.9411764705882353
recall: 0.8888888888888888
f1_score: 0.9142857142857143
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aa0d550>

In [11]:
# Versicolor vs. rest with a shallow decision tree: axis-aligned splits can
# isolate the middle class without any feature scaling or polynomial terms.
y = np.where(df.Species == "Iris-versicolor", 1, 0)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

pipe = pipeline.Pipeline([
    ("est", tree.DecisionTreeClassifier(random_state=1, max_depth=3)),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # P(versicolor) per test row

for label, scorer in [("accuracy", metrics.accuracy_score),
                      ("precision", metrics.precision_score),
                      ("recall", metrics.recall_score),
                      ("f1_score", metrics.f1_score)]:
    print(label + ":", scorer(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 0.9777777777777777
precision: 1.0
recall: 0.9444444444444444
f1_score: 0.9714285714285714
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x10aa75588>

In [16]:
# Versicolor vs. rest with a small random forest (20 shallow trees) —
# an ensemble version of the single-tree cell above.
y = np.where(df.Species == "Iris-versicolor", 1, 0)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

pipe = pipeline.Pipeline([
    ("est", ensemble.RandomForestClassifier(random_state=1, max_depth=3,
                                            n_estimators=20)),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # P(versicolor) per test row

for label, scorer in [("accuracy", metrics.accuracy_score),
                      ("precision", metrics.precision_score),
                      ("recall", metrics.recall_score),
                      ("f1_score", metrics.f1_score)]:
    print(label + ":", scorer(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 0.9777777777777777
precision: 1.0
recall: 0.9444444444444444
f1_score: 0.9714285714285714
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a20006470>

In [13]:
# Versicolor vs. rest with 5-nearest-neighbors. NOTE(review): k-NN is
# distance-based, and no scaler is applied here — the two features happen to
# be on similar cm scales, but confirm that is intentional.
y = np.where(df.Species == "Iris-versicolor", 1, 0)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

pipe = pipeline.Pipeline([
    ("est", neighbors.KNeighborsClassifier(n_neighbors=5)),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # P(versicolor) per test row

for label, scorer in [("accuracy", metrics.accuracy_score),
                      ("precision", metrics.precision_score),
                      ("recall", metrics.recall_score),
                      ("f1_score", metrics.f1_score)]:
    print(label + ":", scorer(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 0.9555555555555556
precision: 1.0
recall: 0.8888888888888888
f1_score: 0.9411764705882353
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b5cc7f0>

In [14]:
# Full 3-class problem: encode the species labels as integers 0..2.
y = preprocessing.LabelEncoder().fit_transform(df.Species)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

# k-NN handles multiclass natively; no preprocessing steps in this pipeline.
pipe = pipeline.Pipeline([
    ("est", neighbors.KNeighborsClassifier(n_neighbors=5)),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # prob. of encoded class 1 only

# Precision/recall/F1 need an `average=` argument in the multiclass setting,
# so only accuracy is reported here.
print("accuracy:", metrics.accuracy_score(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 0.9555555555555556
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a235989e8>

In [15]:
# 3-class logistic regression: degree-3 polynomial features plus scaling,
# with one-vs-rest multiclass handling and the liblinear solver.
y = preprocessing.LabelEncoder().fit_transform(df.Species)
X = df[features].values
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=3, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LogisticRegression(random_state=1,
                                            multi_class="ovr",
                                            solver="liblinear")),
])
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)
y_test_prob = pipe.predict_proba(X_test)[:, 1]  # prob. of encoded class 1 only

# Per-class precision/recall/F1 would need `average=`; report accuracy only.
print("accuracy:", metrics.accuracy_score(y_test, y_test_pred))

plt.figure(figsize=(8, 8))
plot_decision_regions(X, y, pipe, X_highlight=X_test)


accuracy: 0.9333333333333333
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1ffbbf98>

In [ ]: