In [1]:
from sklearn.datasets import load_iris
In [2]:
iris = load_iris()
In [3]:
type(iris)
Out[3]:
In [5]:
print (iris.data)
In [6]:
print (iris.feature_names)
In [7]:
print (iris.target)
In [8]:
print (iris.target_names)
In [10]:
print (iris.data.shape)
In [11]:
x = iris.data
y = iris.target
In [15]:
# Step 1: import the class
from sklearn.neighbors import KNeighborsClassifier
In [16]:
# Step 2: instantiate the estimator
# 'estimator' is the scikit-learn term for a model
# instantiate = make an instance of
knn = KNeighborsClassifier(n_neighbors=1)
In [17]:
print(knn)
In [21]:
# Step 3: fit the model with data (aka "model training")
knn.fit(x,y)
Out[21]:
In [19]:
# make a prediction - apply the learned model to a new observation
# predict expects a 2D input, so the single observation is wrapped in a list
knn.predict([[3, 5, 4, 2]])
Out[19]:
In [20]:
x_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
knn.predict(x_new)
# predict returns a NumPy array
Out[20]:
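A small optional follow-up, not in the original notebook: since predict returns integer class labels as a NumPy array, they can be mapped back to the class names with iris.target_names (fancy indexing on a NumPy array).
In [ ]:
# optional: map the predicted integer labels back to the iris class names
print(iris.target_names[knn.predict(x_new)])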
In [21]:
knn = KNeighborsClassifier(n_neighbors=5)
In [22]:
knn.fit(x,y)
Out[22]:
In [23]:
knn.predict(x_new)
Out[23]:
In [25]:
# import the LogisticRegression class
from sklearn.linear_model import LogisticRegression
# instantiate the model
logreg = LogisticRegression()
# fit the model
logreg.fit(x, y)
# predict the response
logreg.predict(x_new)
Out[25]:
In [2]:
from sklearn.datasets import load_iris
iris = load_iris()
In [4]:
# create X (features) and y (response)
X = iris.data
y = iris.target
Logistic Regression
In [7]:
from sklearn.linear_model import LogisticRegression
# instantiate the model
logreg = LogisticRegression()
#fit the model
logreg.fit(X, y)
Out[7]:
In [8]:
# predict the response value
logreg.predict(X)
Out[8]:
In [9]:
y_pred = logreg.predict(X)
In [10]:
len(y_pred)
Out[10]:
Classification accuracy/ evaluation metrics
In [17]:
from sklearn import metrics
print(metrics.accuracy_score(y,y_pred))
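As a sanity check, not part of the original notebook: classification accuracy is simply the fraction of predictions that match the true labels, so it can also be computed directly with NumPy.
In [ ]:
import numpy as np
# fraction of predictions equal to the true labels - same value as accuracy_score
print(np.mean(y_pred == y))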
In [22]:
#create X(features) and y(response)
X = iris.data
y = iris.target
KNN (K = 5)
In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X,y)
y_pred = knn.predict(X)
print(metrics.accuracy_score(y,y_pred))
KNN (K=1): the best model since it gives 100% (1.0) accuracy? (With K=1, each training point is its own nearest neighbor, so training accuracy is always perfect.)
In [24]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X,y)
y_pred = knn.predict(X)
print(metrics.accuracy_score(y,y_pred))
TRAIN TEST SPLIT/ TEST SET
In [25]:
# Split the data into training and test sets.
# Train the model on the training set and evaluate it on the test set.
In [28]:
# print the shape of X and y
print (X.shape)
print (y.shape)
In [33]:
# train_test_split lives in sklearn.model_selection in current scikit-learn versions
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4, random_state=4)
In [32]:
# general rule: use 20-40% of the data as the testing set
In [36]:
# print the shapes of the new X and y objects
print (X_train.shape)
print (X_test.shape)
print (y_train.shape)
print (y_test.shape)
In [37]:
# step 2: train the model on the training set
# instantiate the model
logreg = LogisticRegression()
#fit the model
logreg.fit(X_train,y_train)
Out[37]:
In [38]:
# step 3: make predictions on the testing set
y_pred = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
REPEAT for KNN = 5
In [39]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test,y_pred))
REPEAT for KNN = 1
In [41]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test,y_pred))
In [42]:
# Hence KNN with K=5 is the better model for predicting out-of-sample data.
In [43]:
# Which value of K is best? Try K = 1 through K = 25 and record the testing accuracy
k_range = list(range(1,26))
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))
In [44]:
# import matplotlib (scientific plotting library)
import matplotlib.pyplot as plt
#allow plots to appear within the notebook
%matplotlib inline
# plot the relationship between k and testing accuracy
plt.plot(k_range,scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')
Out[44]:
In [45]:
# conclusion: K = 6-17 obtains the best accuracy.
Training accuracy rises as model complexity increases. Testing accuracy penalizes models that are too complex or not complex enough. For KNN models, complexity is determined by the value of K (a lower value means a more complex model).
Downside of train/test split?
It provides a high-variance estimate of out-of-sample accuracy. K-fold cross-validation overcomes this limitation, but train/test split is still useful because of its flexibility and speed.
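A minimal sketch, not in the original notebook, that also records training accuracy so the complexity trade-off described above can be seen directly; it reuses k_range, the train/test split, and the imports from the cells above.
In [ ]:
# compare training and testing accuracy across K
train_scores = []
test_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_scores.append(metrics.accuracy_score(y_train, knn.predict(X_train)))
    test_scores.append(metrics.accuracy_score(y_test, knn.predict(X_test)))
plt.plot(k_range, train_scores, label='Training accuracy')
plt.plot(k_range, test_scores, label='Testing accuracy')
plt.xlabel('Value of K for KNN')
plt.ylabel('Accuracy')
plt.legend()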
In [47]:
# MAKING PREDICTIONS ON OUT-OF-SAMPLE DATA
In [49]:
# INSTANTIATE the model with the best known parameters
knn = KNeighborsClassifier(n_neighbors=11)
# train the model with X and y (not X_train and y_train)
knn.fit(X, y)
# make a prediction for an out-of-sample observation (predict expects a 2D array)
knn.predict([[3, 5, 4, 2]])
Out[49]:
Downside of train/test split
K-fold cross-validation overcomes this limitation, but train/test split is still useful because it is flexible and fast.
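A minimal sketch of the K-fold idea, assuming a recent scikit-learn where cross_val_score lives in sklearn.model_selection; it averages accuracy over 5 folds instead of relying on a single split.
In [ ]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy for KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
print(scores)
print(scores.mean())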
In [ ]: