Assignment 6

Using scikit-learn's inbuilt functions

1) Linear Regression


In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
# Load the training data; row 0 is a header (skipped by the 1: slice),
# columns 0-3 are the features and column 4 is the target.
csv = np.genfromtxt('train_data.csv', delimiter = ",")
X = csv[1:, 0:4]
y = csv[1:, 4]
Splitting data into training and testing sets

In [2]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 0)
Training with cross validation

In [3]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
# Fit ordinary least squares, then estimate generalization with 6-fold
# cross-validation on the training split (cross_val_score refits clones,
# so the fit above is only used for the later test-set evaluation).
lin_reg = linear_model.LinearRegression()
lin_reg.fit(X_train, y_train)
scores = cross_val_score(lin_reg, X_train, y_train, cv=6)
# Report the per-fold mean +/- two standard deviations.
print("Mean of cross validation scores: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Mean of cross validation scores: 1.00 (+/- 0.00)
R² score on test set

In [4]:
# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy — label the metric accordingly.
print("R^2 score: {0:.2f}%".format(lin_reg.score(X_test, y_test) * 100))


Accuracy: 100.00%
New test set

In [5]:
# Load the unlabelled evaluation set; skip the header row and keep the
# same four feature columns the model was trained on.
csv_test = np.genfromtxt('test_input.csv', delimiter = ",")
var_test = csv_test[1:, 0:4]
Prediction on new test set

In [6]:
# Predict targets for the new inputs and write them out, one value per line.
pred = lin_reg.predict(var_test)
np.savetxt("test_output.csv", pred, delimiter=",")

2) Logistic Regression

a) ex2data1.txt

A classification model that estimates an applicant’s probability of admission based on scores from two exams.


In [7]:
import numpy as np
# Exam-scores dataset: columns 0-1 are the two exam scores (features),
# column 2 is the admission label. No header row, so nothing is skipped.
data = np.genfromtxt('ex2data1.txt', delimiter = ",")
y = data[:, 2]
X = data[:, 0:2]
Splitting data into training and testing sets

In [8]:
from sklearn.model_selection import train_test_split
# Same 60/40 split and seed as the linear-regression section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 0)
Training with cross validation

In [9]:
from sklearn import linear_model
# Import cross_val_score explicitly so this cell does not depend on the
# import performed in an earlier cell (keeps it runnable after a partial
# re-run or a kernel restart from this point).
from sklearn.model_selection import cross_val_score

# Logistic regression with default L2 regularisation; scores are the
# per-fold mean accuracies from 5-fold cross-validation.
log_reg = linear_model.LogisticRegression()
log_reg.fit(X_train, y_train)
scores = cross_val_score(log_reg, X_train, y_train, cv=5)
print("Mean of cross validation scores: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Mean of cross validation scores: 0.83 (+/- 0.12)
Accuracy on test set

In [10]:
# Mean classification accuracy on the held-out 40% split.
print("Accuracy : {0:.2f} %".format(log_reg.score(X_test, y_test) * 100))


Accuracy : 92.50 %

b) ex2data2.txt

Predict whether microchips from a fabrication plant pass quality assurance.


In [11]:
import numpy as np
# Microchip QA dataset: two test measurements (features) and a pass/fail
# label in column 2; the file has no header row.
data = np.genfromtxt('ex2data2.txt', delimiter = ",")
y = data[:, 2]
X = data[:, 0:2]

In [12]:
def mapFeature(X1, X2, degree=6):
    """Map two feature columns to all polynomial terms up to `degree`.

    Builds the feature matrix
        [1, x1, x2, x1^2, x1*x2, x2^2, ..., x1*x2^(degree-1), x2^degree]
    so a linear classifier can fit a non-linear decision boundary.

    Parameters
    ----------
    X1, X2 : array_like or scalar
        Feature values; scalars and length-n 1-D arrays are accepted.
    degree : int, optional
        Highest polynomial degree (default 6, which yields 28 columns,
        matching the original hard-coded behaviour).

    Returns
    -------
    numpy.ndarray of shape (n, (degree + 1) * (degree + 2) // 2)
    """
    # atleast_1d handles plain Python floats as well as np.float64,
    # replacing the brittle isinstance check; the original reshape calls
    # discarded their results and are dropped as no-ops.
    X1 = np.atleast_1d(np.asarray(X1, dtype=float))
    X2 = np.atleast_1d(np.asarray(X2, dtype=float))
    n = len(X1)
    # Number of monomials x1^(i-j) * x2^j with 0 <= j <= i <= degree.
    n_terms = (degree + 1) * (degree + 2) // 2
    out = np.ones((n, n_terms))  # column 0 stays 1.0: the intercept term
    k = 1
    for i in range(1, degree + 1):
        for j in range(i + 1):
            out[:, k] = np.power(X1, i - j) * np.power(X2, j)
            k += 1
    return out

In [13]:
# Expand the two raw features into all polynomial terms up to degree 6
# (28 columns) so the linear model can learn a non-linear boundary.
X = mapFeature(X[:, 0], X[:, 1])
Splitting data into training and testing sets

In [14]:
from sklearn.model_selection import train_test_split
# Split AFTER the polynomial feature mapping; same 60/40 split and seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 0)
Training with cross validation

In [15]:
from sklearn import linear_model
log_reg = linear_model.LogisticRegression(C = 10)
log_reg.fit(X_train, y_train)
scores = cross_val_score(log_reg, X_train, y_train, cv=5)
print("Mean of cross validation scores: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Mean of cross validation scores: 0.77 (+/- 0.29)
Accuracy on test set

In [16]:
# Mean classification accuracy on the held-out 40% split.
print("Accuracy : {0:.2f} %".format(log_reg.score(X_test, y_test) * 100))


Accuracy : 85.42 %

c) ex3data1.mat

Using regularised logistic regression to recognize handwritten digits (from 0 to 9).


In [17]:
import numpy as np
import scipy.io
# Handwritten-digit dataset in MATLAB format. loadmat returns 2-D
# arrays, so 'y' comes back as an (n, 1) column vector rather than the
# 1-D label array sklearn expects — NOTE(review): consider y.ravel().
data = scipy.io.loadmat("ex3data1.mat")
X = np.array(data['X'])
y = np.array(data['y'])
Splitting data into training and testing sets

In [18]:
from sklearn.model_selection import train_test_split
# 60/40 split; note this section uses a different seed (40) from the
# earlier sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 40)
Training

In [21]:
from sklearn.linear_model import LogisticRegression
# One-vs-rest L2-regularised logistic regression over the digit classes.
# The loose tolerance (tol=4.5) trades precision for training speed.
multi_clf = LogisticRegression(C=50, multi_class='ovr', penalty='l2', solver='lbfgs', tol=4.5)
# y loaded from the .mat file is an (n, 1) column vector; ravel() it to
# the 1-D shape sklearn expects (avoids a DataConversionWarning).
multi_clf.fit(X_train, y_train.ravel())


Out[21]:
LogisticRegression(C=50, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=4.5,
          verbose=0, warm_start=False)
Accuracy on test set

In [22]:
# Mean classification accuracy over all digit classes on the test split.
print("Accuracy : {0:.2f} %".format(multi_clf.score(X_test, y_test) * 100))


Accuracy : 90.15 %

3) K Nearest Neighbours

To classify 3 different types of irises (Setosa, Versicolour, and Virginica).


In [23]:
import numpy as np
from sklearn import datasets
# Built-in iris dataset: four numeric features per sample, three species
# classes as integer targets.
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
Splitting data into training and testing sets

In [24]:
from sklearn.model_selection import train_test_split
# Same 60/40 split and seed as the regression sections.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 0)
Training with cross validation

In [25]:
from sklearn.neighbors import KNeighborsClassifier
# Import cross_val_score explicitly so this cell does not depend on an
# import made several cells earlier.
from sklearn.model_selection import cross_val_score

# 3-nearest-neighbour classifier; CV scores are per-fold mean accuracy.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
scores = cross_val_score(knn, X_train, y_train, cv=5)
print("Mean of cross validation scores: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Mean of cross validation scores: 0.97 (+/- 0.09)
Accuracy on test set

In [26]:
# Mean classification accuracy of the 3-NN model on the test split.
print("Accuracy : {0:.2f} %".format(knn.score(X_test, y_test) * 100))


Accuracy : 93.33 %