In [2]:
# Load the iris dataset (feature matrix X, class labels y).
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data    # feature matrix
y = iris.target  # target class labels

# Train a logistic regression classifier on the full dataset and
# predict on that same data (training-set predictions).
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X, y)  # PEP 8: no space between callable and parentheses
logreg.predict(X)
Out[2]:
In [3]:
# Store the training-set predictions and show how many there are —
# one prediction per sample, so this should equal the number of rows in X.
y_pred = logreg.predict(X)
len(y_pred)
Out[3]:
Classification accuracy:
- Proportion of correct predictions.
- Common evaluation metric for classification problems.
In [4]:
# Compute accuracy: the fraction of predictions that match the true labels.
from sklearn import metrics
# `print x` is Python-2-only syntax; print(x) with a single argument
# behaves identically in both Python 2 and Python 3.
print(metrics.accuracy_score(y, y_pred))
Known as training accuracy when you train and test on the same dataset.
In [6]:
# Repeat with K-nearest neighbors (K=5), again training and scoring on
# the same data — this yields an over-optimistic training accuracy.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
y_pred = knn.predict(X)
# Python 3-compatible print (the original used the Python 2 statement form).
print(metrics.accuracy_score(y, y_pred))
Problems with training and testing on the same data
In [7]:
# Inspect the dimensions of the feature matrix and the target vector.
# print(...) replaces the Python-2-only `print x` statement.
print(X.shape)
print(y.shape)
In [9]:
# Step 1: split the data into training and testing sets (40% held out).
# NOTE: `sklearn.cross_validation` was deprecated in scikit-learn 0.18
# and removed in 0.20; train_test_split now lives in model_selection.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=4)  # fixed seed for reproducibility
In [10]:
# Verify the split sizes: 60% of the rows in train, 40% in test.
# print(...) replaces the Python-2-only `print x` statement.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [11]:
# Step 2: train the model on the training split only.
# The fitted estimator is the cell's last expression, so its repr is displayed.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Out[11]:
In [12]:
# Step 3: predict on the held-out test set and report testing accuracy —
# an honest estimate of out-of-sample performance, unlike training accuracy.
y_pred = logreg.predict(X_test)
# Python 3-compatible print (the original used the Python 2 statement form).
print(metrics.accuracy_score(y_test, y_pred))
In [13]:
# Sweep K from 1 to 25 and record the testing accuracy for each value,
# so we can pick a K that generalizes well.
k_range = range(1, 26)
scores = []
for k in k_range:
    model = KNeighborsClassifier(n_neighbors=k)
    preds = model.fit(X_train, y_train).predict(X_test)
    scores.append(metrics.accuracy_score(y_test, preds))
In [14]:
import matplotlib.pyplot as plt
#allow plots to appear within the notebook
%matplotlib inline
# plot the relationship between k and testing accuracy
plt.plot(k_range, scores)
plt.xlabel('value of k for KNN')
plt.ylabel('Testing accuracy')
Out[14]:
In [ ]:
In [ ]: