In [5]:
# Imports for the iris / KNN part of the notebook.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
In [6]:
# Load the bundled iris dataset and unpack it into the feature matrix (X)
# and the target vector (y) used by the classification cells below.
iris = load_iris()
X, y = iris.data, iris.target
In [7]:
# Hold out part of the data for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Build a 6-nearest-neighbours classifier and train it in one step
# (`fit` returns the fitted estimator itself).
scl = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# Predict the held-out samples; the cell's output is the fraction of them
# that were classified correctly.
y_pred = scl.predict(X_test)
accuracy_score(y_test, y_pred)
Out[7]:
In [10]:
# Simulate splitting a dataset of 25 observations into 5 folds.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the replacement
# `sklearn.model_selection.KFold` takes only the number of splits and produces
# the index arrays through `.split(data)` instead of being iterated directly.
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=False)

# Print the contents of each training and testing set.
print('{} {:^61} {}'.format('Iteration', 'Training set observations', 'Testing set observations'))
for iteration, (train_idx, test_idx) in enumerate(kf.split(range(25)), start=1):
    # A NumPy array rejects a non-empty format spec such as '^25' on Python 3
    # (TypeError), so the test indices are converted to a string before centring.
    print('{:^9} {} {:^25}'.format(iteration, train_idx, str(test_idx)))
In [11]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `cross_val_score` is provided by `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score
In [12]:
# Evaluate KNN with n_neighbors=5 using 10-fold cross-validation.
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(
    knn,
    X,
    y,
    cv=10,
    scoring='accuracy',
)
# Each entry of `scores` is the accuracy obtained on one fold.
print(scores)
In [13]:
# Mean of the per-fold accuracy values held in `scores`; being the last
# expression of the cell, the value is shown as the cell output.
scores.mean()
Out[13]:
In [16]:
# Sweep n_neighbors over 1..30 and collect, for every value, the mean
# accuracy of a 10-fold cross-validation.
K = range(1, 31)
accuracy = []
for i in K:
    knn = KNeighborsClassifier(n_neighbors=i)
    acc = cross_val_score(knn, X, y, scoring="accuracy", cv=10)
    accuracy.append(acc.mean())

# accuracy[j] is the mean accuracy obtained with n_neighbors = K[j].
print(accuracy)
In [17]:
import matplotlib.pyplot as plt
%matplotlib inline
In [22]:
# Draw mean cross-validated accuracy against the neighbour count on the
# current axes (the same axes the pyplot shortcuts would target).
ax = plt.gca()
ax.plot(K, accuracy)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validated Accuracy')
Out[22]:
In [23]:
# Use K = 20 (picked as the higher-accuracy setting) and report its mean
# 10-fold cross-validated accuracy.
knn = KNeighborsClassifier(n_neighbors=20)
acc = cross_val_score(knn, X, y, scoring="accuracy", cv=10)
acc.mean()
Out[23]:
In [24]:
from sklearn.linear_model import LogisticRegression

# Score a default logistic regression with the same 10-fold accuracy
# protocol that was used for KNN.
lr = LogisticRegression()
lr_scores = cross_val_score(lr, X, y, cv=10, scoring="accuracy")
print(lr_scores.mean())
In [30]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
In [27]:
# Read the advertising CSV, using its first column as the row index.
data = pd.read_csv("data/Advertising.csv", index_col=0)
# Preview the first five rows.
print(data.head(5))
In [28]:
# Column names used as predictors and as the target.
features = ['TV', 'Radio', 'Newspaper']
response = ['Sales']

# NOTE: X and y are rebound here; from this point on they refer to the
# advertising data, not to the iris arrays used earlier.
X = data.loc[:, features]
y = data['Sales']
In [32]:
# 10-fold cross-validation of a linear regression model.
# The scorer name "mean_squared_error" was removed in scikit-learn 0.20; its
# replacement "neg_mean_squared_error" yields the MSE of each fold with the
# sign flipped, so that a larger score is always better.
lreg = LinearRegression()
acc = cross_val_score(lreg, X, y, cv=10, scoring="neg_mean_squared_error")
# NOTE: despite its name, `acc` holds negated MSE values, not accuracies.
print(acc)
In [34]:
# Flip the sign of the per-fold scores, take the square root of each
# (giving one RMSE per fold) and print the average.
print(np.mean(np.sqrt(-acc)))
In [38]:
# Repeat the evaluation with Newspaper left out of the predictors.
X = data[['TV', 'Radio']]
lreg = LinearRegression()
# "mean_squared_error" is no longer a valid scorer name (removed in
# scikit-learn 0.20); "neg_mean_squared_error" returns the negated per-fold MSE.
neg_mse = cross_val_score(lreg, X, y, cv=10, scoring="neg_mean_squared_error")
# Negate and take the square root to get one RMSE per fold, then average.
print(np.sqrt(-neg_mse).mean())
In [39]:
# Conclusion: comparing the two mean RMSE values printed above, the model performs better (lower RMSE) when Newspaper is not included
In [ ]: