In [1]:
import numpy as np
from sklearn import datasets
from sklearn.neighbors import DistanceMetric
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import math
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

In [2]:
class KNNClassifier(object):
    def __init__(self):
        self.X_train = None
        self.y_train = None

    def euc_distance(self, a, b):
        return DistanceMetric.get_metric('euclidean').pairwise([a],[b])[0][0]

    def closest(self, row):
        dists = [self.euc_distance(row, item) for  item in self.X_train]
        nei = dists.index(min(dists))
        
        return self.y_train[nei]
    
    def k_closest(self, row, k):
        dists = [(index,self.euc_distance(row, item)) for index,item in enumerate(self.X_train)]
        dists.sort(key = lambda x: x[1])
        vizinhos = []
        for i in range(k):
            vizinhos.append(self.y_train[dists[i][0]])
        classe = Counter(vizinhos).most_common(1)[0][0]
        
        return classe
        
    def fit(self, training_data, training_labels):
        self.X_train = training_data
        self.y_train = training_labels
        
        

    def predict(self, to_classify, k=3):
        predictions = []
        for row in to_classify:
            label = self.k_closest(row, k)
            predictions.append(label)
        return predictions

In [3]:
df = pd.read_csv('train.csv')

In [4]:
df['Age'] = df.Age.fillna(df.Age.mean())
df = df.where((pd.notnull(df)), 0)
for f in ["Sex", "Embarked"]:
    df[f] = df[f].astype('category')
    df[f] = df[f].cat.codes

df.head()


Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 0 3
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 PC 17599 71.2833 C85 1
2 3 1 3 Heikkinen, Miss. Laina 0 26.0 0 0 STON/O2. 3101282 7.9250 0 3
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 113803 53.1000 C123 3
4 5 0 3 Allen, Mr. William Henry 1 35.0 0 0 373450 8.0500 0 3

In [5]:
features = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
X = df.get(features)
X


Out[5]:
Pclass Sex Age SibSp Parch Fare Embarked
0 3 1 22.000000 1 0 7.2500 3
1 1 0 38.000000 1 0 71.2833 1
2 3 0 26.000000 0 0 7.9250 3
3 1 0 35.000000 1 0 53.1000 3
4 3 1 35.000000 0 0 8.0500 3
5 3 1 29.699118 0 0 8.4583 2
6 1 1 54.000000 0 0 51.8625 3
7 3 1 2.000000 3 1 21.0750 3
8 3 0 27.000000 0 2 11.1333 3
9 2 0 14.000000 1 0 30.0708 1
10 3 0 4.000000 1 1 16.7000 3
11 1 0 58.000000 0 0 26.5500 3
12 3 1 20.000000 0 0 8.0500 3
13 3 1 39.000000 1 5 31.2750 3
14 3 0 14.000000 0 0 7.8542 3
15 2 0 55.000000 0 0 16.0000 3
16 3 1 2.000000 4 1 29.1250 2
17 2 1 29.699118 0 0 13.0000 3
18 3 0 31.000000 1 0 18.0000 3
19 3 0 29.699118 0 0 7.2250 1
20 2 1 35.000000 0 0 26.0000 3
21 2 1 34.000000 0 0 13.0000 3
22 3 0 15.000000 0 0 8.0292 2
23 1 1 28.000000 0 0 35.5000 3
24 3 0 8.000000 3 1 21.0750 3
25 3 0 38.000000 1 5 31.3875 3
26 3 1 29.699118 0 0 7.2250 1
27 1 1 19.000000 3 2 263.0000 3
28 3 0 29.699118 0 0 7.8792 2
29 3 1 29.699118 0 0 7.8958 3
... ... ... ... ... ... ... ...
861 2 1 21.000000 1 0 11.5000 3
862 1 0 48.000000 0 0 25.9292 3
863 3 0 29.699118 8 2 69.5500 3
864 2 1 24.000000 0 0 13.0000 3
865 2 0 42.000000 0 0 13.0000 3
866 2 0 27.000000 1 0 13.8583 1
867 1 1 31.000000 0 0 50.4958 3
868 3 1 29.699118 0 0 9.5000 3
869 3 1 4.000000 1 1 11.1333 3
870 3 1 26.000000 0 0 7.8958 3
871 1 0 47.000000 1 1 52.5542 3
872 1 1 33.000000 0 0 5.0000 3
873 3 1 47.000000 0 0 9.0000 3
874 2 0 28.000000 1 0 24.0000 1
875 3 0 15.000000 0 0 7.2250 1
876 3 1 20.000000 0 0 9.8458 3
877 3 1 19.000000 0 0 7.8958 3
878 3 1 29.699118 0 0 7.8958 3
879 1 0 56.000000 0 1 83.1583 1
880 2 0 25.000000 0 1 26.0000 3
881 3 1 33.000000 0 0 7.8958 3
882 3 0 22.000000 0 0 10.5167 3
883 2 1 28.000000 0 0 10.5000 3
884 3 1 25.000000 0 0 7.0500 3
885 3 0 39.000000 0 5 29.1250 2
886 2 1 27.000000 0 0 13.0000 3
887 1 0 19.000000 0 0 30.0000 3
888 3 0 29.699118 1 2 23.4500 3
889 1 1 26.000000 0 0 30.0000 1
890 3 1 32.000000 0 0 7.7500 2

891 rows × 7 columns


In [6]:
Y = df["Survived"].values

In [7]:
Y


Out[7]:
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)

In [9]:
knn = KNeighborsClassifier()

In [10]:
knn.fit(X_train,Y_train)


Out[10]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [11]:
resultado = knn.predict(X_test)

In [12]:
accuracy_score(Y_test, resultado)


Out[12]:
0.66816143497757852

In [13]:
scores = []
ks = range(1,120, 4)
n_ks = len(ks)

for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,Y_train)
    scores.append(knn.score(X_test,Y_test))

In [14]:
fig,ax = plt.subplots()
ax.plot(ks,scores,'b',linewidth=3)
ax.set_ylabel('Accuracy')
ax.set_xlabel('K Neighbors')

plt.grid()



In [15]:
max(scores)


Out[15]:
0.68609865470852016

In [16]:
bestK = scores.index(max(scores))

bestK = bestK*4 +1
bestK


Out[16]:
21

In [17]:
knn = KNeighborsClassifier(n_neighbors=bestK)
knn.fit(X_train,Y_train)


Out[17]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=21, p=2,
           weights='uniform')

In [18]:
result = knn.predict(X_test)

In [19]:
acr = accuracy_score(Y_test, result)
acr


Out[19]:
0.68609865470852016

In [20]:
report = classification_report(knn.predict(X_test), Y_test)

In [21]:
print(report)


             precision    recall  f1-score   support

          0       0.82      0.70      0.76       155
          1       0.49      0.65      0.56        68

avg / total       0.72      0.69      0.70       223


In [23]:
knn = KNNClassifier()

In [24]:
knn.fit(X_train.values, Y_train)

In [25]:
resultado = knn.predict(X_test.values, k=bestK)

In [26]:
print(classification_report(resultado, Y_test))


             precision    recall  f1-score   support

          0       0.83      0.69      0.75       159
          1       0.46      0.64      0.53        64

avg / total       0.72      0.68      0.69       223


In [27]:
accuracy_score(Y_test, resultado)


Out[27]:
0.67713004484304928

In [22]:
# deve ta tudo errado. brincs rs