Classification problems are a broad category of machine learning problems that involve the prediction of values taken from a discrete, finite number of cases.
In this example, we'll build a classifier to predict which species a flower belongs to.
In [1]:
import pandas as pd
# Load the iris CSV file (path is relative to the notebook) into a DataFrame
iris = pd.read_csv(filepath_or_buffer='../datasets/iris.csv')
In [2]:
# Print a structural summary of the dataset: its columns, their non-null
# counts and dtypes. (Descriptive statistics come from describe(), not info().)
iris.info()
In [3]:
# List the distinct labels stored in the 'Class' column
iris['Class'].unique()
Out[3]:
In [4]:
# Show descriptive statistics (count, mean, std, min, quartiles, max)
# for the dataset's numeric columns
iris.describe()
Out[4]:
In [5]:
# Encode the classes to numeric values
class_encodings = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
# Only map while the column still holds the original string labels.
# Series.map() turns every value that is not a key of the dict into NaN, so
# running this cell a second time on the already-encoded column (0/1/2) would
# wipe it out. The guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(iris.Class):
    iris.Class = iris.Class.map(class_encodings)
In [6]:
# Check the distinct values of the 'Class' column after the encoding step
iris['Class'].unique()
Out[6]:
In [7]:
# Create a scatterplot for sepal length and sepal width
import matplotlib.pyplot as plt
%matplotlib inline
sl = iris.Sepal_length
sw = iris.Sepal_width
# Create a scatterplot of these two properties using plt.scatter()
# Assign different colors to each data point according to the class it belongs to
plt.scatter(sl[iris.Class == 0], sw[iris.Class == 0], color='red')
plt.scatter(sl[iris.Class == 1], sw[iris.Class == 1], color='green')
plt.scatter(sl[iris.Class == 2], sw[iris.Class == 2], color='blue')
# Specify labels for the X and Y axis
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
# Show graph
plt.show()
In [8]:
# Scatterplot of petal length (x) against petal width (y)
pl = iris.Petal_length
pw = iris.Petal_width
# Plot the classes one at a time so that each gets its own colour
for class_id, class_color in ((0, 'red'), (1, 'green'), (2, 'blue')):
    in_class = iris.Class == class_id  # boolean mask selecting this class
    plt.scatter(pl[in_class], pw[in_class], color=class_color)
# Name the two axes
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
# Render the figure
plt.show()
We'll use scikit-learn's LogisticRegression to build our classifier.
In [9]:
# Separate the feature columns (X) from the target labels (t)
X = iris.drop('Class', axis=1)
t = iris.Class.values
# Use sklearn's train_test_split() function to split our data into a training
# set and a test set.
# NOTE: train_test_split lives in sklearn.model_selection. The former
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split
# Fix the seed so that the split (and every score computed from it) is the
# same on each "Restart Kernel -> Run All".
Xtr, Xts, ytr, yts = train_test_split(X, t, random_state=42)
In [10]:
# Build a LogisticRegression classifier from the training set
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
# Keep the object returned by fit() under the same name, as the original did
lr = lr.fit(X=Xtr, y=ytr)
In [11]:
# Assess the model with the classifier's score() method.
# Note that the data passed here is the training set (Xtr, ytr).
lr.score(X=Xtr, y=ytr)
Out[11]:
In [12]:
from sklearn.metrics import confusion_matrix
# Compare the true training labels with the model's predictions for the same
# samples to see which classes get confused with one another.
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
confusion_matrix(y_true=ytr, y_pred=lr.predict(Xtr))
Out[12]: