From the video series: Introduction to machine learning with scikit-learn
In [2]:
# read in the iris data
from sklearn.datasets import load_iris
iris = load_iris()
# create X (features) and y (response)
X = iris.data
y = iris.target
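As a quick sanity check, the iris object also carries the feature and class names (a small optional sketch):

# inspect the feature names and the class names
print(iris.feature_names)
print(iris.target_names)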
In [3]:
# import the class
from sklearn.linear_model import LogisticRegression
# instantiate the model (using the default parameters)
logreg = LogisticRegression()
# fit the model with data
logreg.fit(X, y)
# predict the response values for the observations in X
logreg.predict(X)
In [4]:
# store the predicted response values
y_pred = logreg.predict(X)
# check how many predictions were generated
len(y_pred)
Out[4]:
150
Classification accuracy: the proportion of correct predictions.
In [5]:
# compute classification accuracy for the logistic regression model
from sklearn import metrics
print(metrics.accuracy_score(y, y_pred))
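accuracy_score is just the fraction of predictions that match the true labels; a minimal sketch of the same computation in NumPy:

# equivalent computation: the mean of the elementwise matches
import numpy as np
print(np.mean(y == y_pred))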
In [6]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
y_pred = knn.predict(X)
print(metrics.accuracy_score(y, y_pred))
In [7]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
y_pred = knn.predict(X)
print(metrics.accuracy_score(y, y_pred))
Image Credit: Overfitting by Chabacano. Licensed under GFDL via Wikimedia Commons.
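K=1 scores perfectly on the training data because each training observation is its own nearest neighbor: the model memorizes the data rather than learning a general pattern, which is exactly the overfitting pictured above. A minimal sketch to confirm this, using the K=1 model fit above:

# the nearest neighbor of the first training observation is itself (distance 0, index 0)
distances, indices = knn.kneighbors(X[:1])
print(distances, indices)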
In [8]:
# print the shapes of X and y
print(X.shape)
print(y.shape)
In [9]:
# STEP 1: split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
What did this accomplish?
In [10]:
# print the shapes of the new X objects
print(X_train.shape)
print(X_test.shape)
In [11]:
# print the shapes of the new y objects
print(y_train.shape)
print(y_test.shape)
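train_test_split shuffled the rows (reproducibly, thanks to random_state=4) and carved off 40% of them as a testing set the model never sees during training. As a side note, passing stratify=y would also preserve the class proportions in both sets; a sketch of that variation:

# optional variation: keep the class proportions identical in both sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.4, random_state=4, stratify=y)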
In [12]:
# STEP 2: train the model on the training set
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
In [13]:
# STEP 3: make predictions on the testing set
y_pred = logreg.predict(X_test)
# compare actual response values (y_test) with predicted response values (y_pred)
print(metrics.accuracy_score(y_test, y_pred))
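Accuracy compresses performance into a single number; to see which classes get confused with which, a confusion matrix is a useful companion (a minimal sketch using the same metrics module):

# rows are the true classes, columns are the predicted classes
print(metrics.confusion_matrix(y_test, y_pred))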
Repeat for KNN with K=5:
In [14]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
Repeat for KNN with K=1:
In [15]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
Can we find an even better value for K?
In [16]:
# try K=1 through K=25 and record testing accuracy
k_range = range(1, 26)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
In [17]:
# import Matplotlib (scientific plotting library)
import matplotlib.pyplot as plt
# allow plots to appear within the notebook
%matplotlib inline
# plot the relationship between K and testing accuracy
plt.plot(k_range, scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')
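To read the best K off programmatically rather than by eye, a minimal sketch (several values of K may tie for the top score; np.argmax returns the smallest of them):

# find the K with the highest testing accuracy
import numpy as np
best_k = k_range[int(np.argmax(scores))]
print(best_k, max(scores))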
In [18]:
# instantiate the model with the best known parameters
knn = KNeighborsClassifier(n_neighbors=11)
# train the model with X and y (not X_train and y_train)
knn.fit(X, y)
# make a prediction for an out-of-sample observation
# (note the double brackets: predict expects a 2D array of shape [n_samples, n_features])
knn.predict([[3, 5, 4, 2]])
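The prediction comes back as a class index; mapping it through iris.target_names turns it into a species name (a small optional sketch):

# translate the predicted class index into a species name
print(iris.target_names[knn.predict([[3, 5, 4, 2]])])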