In [12]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import DistanceMetric
from sklearn import preprocessing
In [3]:
class KNNClassifier(object):
    """k-nearest-neighbors classifier over pandas DataFrames.

    Stores the training set on fit() and classifies each query row by
    majority vote among its k nearest training rows, using any metric
    name accepted by sklearn's DistanceMetric.get_metric.
    """

    def __init__(self):
        # Real values are supplied by fit(); these are only defaults.
        self.X_train = None        # pandas DataFrame of training features
        self.y_train = None        # pandas Series of non-negative int labels
        self.k = 1                 # number of neighbors to vote
        self.distance = 'euclidean'  # metric name for DistanceMetric

    def any_distance(self, a, b):
        """Distance between two feature vectors under self.distance."""
        dist = DistanceMetric.get_metric(self.distance)
        matDist = dist.pairwise([a, b])
        # pairwise() returns the 2x2 matrix of mutual distances; the
        # off-diagonal entry is the distance between a and b.
        return matDist[0, -1]

    def closest(self, row):
        """Predict the label for a single query row by k-NN majority vote."""
        dists = [self.any_distance(row, item) for _, item in self.X_train.iterrows()]
        # BUG FIX: the original took sorted(dists)[:k] and mapped each value
        # back with list.index(), which always returns the FIRST occurrence.
        # Tied distances therefore made one training row vote several times
        # while genuinely distinct neighbors were ignored. argsort yields k
        # distinct row positions even when distances tie.
        nei = np.argsort(dists)[:self.k]
        votes = np.array(self.y_train.iloc[nei])
        # Majority vote; bincount assumes non-negative integer labels.
        return np.argmax(np.bincount(votes))

    def fit(self, training_data, training_labels, k=1, distance='euclidean'):
        """Memorize the training set and hyperparameters (lazy learner).

        training_data: DataFrame of features.
        training_labels: Series of non-negative integer class labels.
        k: number of neighbors consulted per prediction.
        distance: any metric name DistanceMetric.get_metric accepts.
        """
        self.X_train = training_data
        self.y_train = training_labels
        self.k = k
        self.distance = distance

    def predict(self, to_classify):
        """Return a list with one predicted label per row of to_classify."""
        print('Predicting...')
        return [self.closest(row) for _, row in to_classify.iterrows()]
In [ ]:
In [ ]:
In [4]:
# Load the Titanic training data. test.csv lacks the Survived column, so
# I've preferred to split train.csv into train/test portions instead.
dataset = pd.read_csv('train.csv')

# Remove columns that are not useful as k-NN features.
for col in ['Cabin', 'Ticket', 'PassengerId', 'Name']:
    del dataset[col]

# Impute missing ages with the column mean, then zero out remaining NaNs.
dataset['Age'] = dataset.Age.fillna(dataset.Age.mean())
dataset = dataset.where((pd.notnull(dataset)), 0)

# Encode the categorical text columns as integer codes.
for col in ['Sex', 'Embarked']:
    dataset[col] = dataset[col].astype('category').cat.codes

# Keep a copy (still containing Survived) for later correlation analysis.
datasetCopy = dataset.copy()

# Split features from the target label.
list(dataset)  # no-op: merely lists the column names
Y = dataset['Survived'].copy()
del dataset['Survived']
X = dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
In [ ]:
# Baseline: k-NN with k=10 and the Minkowski metric on the raw features.
knn = KNNClassifier()
knn.fit(X_train, Y_train, k=10, distance='minkowski')
result = knn.predict(X_test)
score = metrics.accuracy_score(y_true=Y_test, y_pred=result)
print(score)
In [16]:
#Correlation analysis
corr = datasetCopy.corr()
corr
Out[16]:
In [17]:
#Dataset Normalization
# Standardize every feature to zero mean / unit variance. NOTE: wrapping the
# scaled array back into a DataFrame drops the column names — columns become
# the integers 0..6 (original order preserved), which the weighting cell
# below relies on.
datasetNorm = pd.DataFrame(preprocessing.scale(dataset))
In [18]:
# Sanity check of the standardized features (columns are now 0..6).
datasetNorm.head()
Out[18]:
In [19]:
#Features Scaling such that the more strongly correlated a feature is with Y,
#then the more the feature will influence in the distance
# datasetNorm lost its column names when it was rebuilt from the scaled
# array, so its columns are the integers 0..6 in the original column order.
# Collapse the seven copy-pasted statements into one loop over that order.
for idx, col in enumerate(['Pclass', 'Sex', 'Age', 'SibSp',
                           'Parch', 'Fare', 'Embarked']):
    datasetNorm[idx] *= np.absolute(corr['Survived'][col])
In [20]:
# Inspect the features after correlation weighting.
datasetNorm.head()
Out[20]:
In [21]:
# Re-split using the weighted, normalized feature matrix.
X = datasetNorm
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25)
In [ ]:
# Second run: k=10 with the Euclidean metric on the weighted features.
knn = KNNClassifier()
knn.fit(X_train, Y_train, k=10, distance='euclidean')
result = knn.predict(X_test)
score = metrics.accuracy_score(y_true=Y_test, y_pred=result)
print(score)
In [23]:
# Sweep k over 1, 5, 9, ..., 117 to find a good neighbor count.
knn = KNNClassifier()
accuracies = []
kRange = []  # BUG FIX: kRange was appended to below without ever being
             # initialized, so this cell crashed with a NameError.
for i in range(0, 30):
    k = 1 + i * 4
    kRange.append(k)
    knn.fit(X_train, Y_train, k=k, distance='euclidean')
    result = knn.predict(X_test)
    score = metrics.accuracy_score(y_pred=result, y_true=Y_test)
    print('K=', k, ' Score=', score)
    accuracies.append(score)
In [26]:
# Accuracy as a function of the neighbor count k.
fig, ax = plt.subplots()
ks = [1 + 4 * i for i in range(0, 30)]
ax.plot(ks, accuracies, 'r', linewidth=2)
ax.set_xlabel('Neighbors')
ax.set_ylabel('Accuracy')
ax.grid(True)
In [27]:
# Re-fit with the k that scored best in the sweep (k = 1 + 4*i maps an
# accuracies index i back to its neighbor count).
highestAcc = max(accuracies)
bestK = 1 + 4 * accuracies.index(highestAcc)
knn.fit(X_train, Y_train, k=bestK, distance='euclidean')
result = knn.predict(X_test)
score = metrics.accuracy_score(y_true=Y_test, y_pred=result)
score
Out[27]:
In [31]:
# BUG FIX: classification_report's signature is (y_true, y_pred); the
# original passed the predictions first, transposing precision and recall
# in the report. Use explicit keywords, consistent with the
# accuracy_score calls above.
print(metrics.classification_report(y_true=Y_test, y_pred=result))
In [ ]:
In [ ]: