Data Mining

Lesson 4 - Classification Exercise with kNN

13/10/2017

Team:

  • Sayonara Santos Araújo
  • Lailson Azevedo do Rego

Solution


In [2]:
import numpy as np
from sklearn import datasets
from sklearn.neighbors import DistanceMetric
from collections import Counter
from operator import itemgetter

In [3]:
class KNNClassifier(object):
    def __init__(self):
        self.train_data = None
        self.train_labels = None
    
    # Stores the training data
    def fit(self, train_d, train_l):
        self.train_data = train_d
        self.train_labels = train_l
    
    ##1. (Updated) Function that computes the Euclidean distance
    def euc_distance(self, a, b):
        dist = DistanceMetric.get_metric('euclidean')
        return dist.pairwise([a], [b])[0][0]  # scalar; equivalent to np.linalg.norm(a - b)

    ##2. Function that selects the k nearest neighbors (k > 1)
    def get_neighbors(self, k, test_instance):
        distances = []
        for i in range(len(self.train_data)):
            dist = self.euc_distance(test_instance, self.train_data[i])
            distances.append((self.train_data[i], dist, self.train_labels[i]))
        distances.sort(key=itemgetter(1))
        neighbors = distances[:k]
        #print('Tested instance: ', test_instance, '\nNeighbors: ', neighbors, '\n')
        return neighbors
    
    ##3. Function that takes the k nearest neighbors and picks the majority class
    def vote(self, neighbors):
        class_counter = Counter()
        for neighbor in neighbors:
            class_counter[neighbor[2]] += 1
        return class_counter.most_common(1)[0][0]
    
    # Predicts a class for each instance in test_data by majority vote among its k nearest neighbors
    def predict(self, k, test_data):
        votes = []
        for i in range(len(test_data)):
            neighbors = self.get_neighbors(k, test_data[i])
            v = self.vote(neighbors)
            votes.append(v)
        return votes
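
A minimal usage sketch (toy values invented here, not part of the exercise) to illustrate the API: fit stores the training set, and predict takes k before the test instances.

toy_X = np.array([[0.0, 0.0], [0.1, 0.1], [1.0, 1.0], [0.9, 1.1]])  # hypothetical toy data
toy_y = np.array([0, 0, 1, 1])
toy_knn = KNNClassifier()
toy_knn.fit(toy_X, toy_y)
print(toy_knn.predict(3, np.array([[0.2, 0.0], [1.0, 0.9]])))  # expected: [0, 1]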

In [4]:
# Load the dataset
import pandas as pd
dataset = pd.read_csv('train_sayonara.csv')
dataset


Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S
11 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S
12 13 0 3 Saundercock, Mr. William Henry male 20.0 0 0 A/5. 2151 8.0500 NaN S
13 14 0 3 Andersson, Mr. Anders Johan male 39.0 1 5 347082 31.2750 NaN S
14 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 350406 7.8542 NaN S
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 NaN S
16 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 NaN Q
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
18 19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 345763 18.0000 NaN S
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C
20 21 0 2 Fynney, Mr. Joseph J male 35.0 0 0 239865 26.0000 NaN S
21 22 1 2 Beesley, Mr. Lawrence male 34.0 0 0 248698 13.0000 D56 S
22 23 1 3 McGowan, Miss. Anna "Annie" female 15.0 0 0 330923 8.0292 NaN Q
23 24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.5000 A6 S
24 25 0 3 Palsson, Miss. Torborg Danira female 8.0 3 1 349909 21.0750 NaN S
25 26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1 5 347077 31.3875 NaN S
26 27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 2631 7.2250 NaN C
27 28 0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.0000 C23 C25 C27 S
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q
29 30 0 3 Todoroff, Mr. Lalio male NaN 0 0 349216 7.8958 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
861 862 0 2 Giles, Mr. Frederick Edward male 21.0 1 0 28134 11.5000 NaN S
862 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0 0 17466 25.9292 D17 S
863 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female NaN 8 2 CA. 2343 69.5500 NaN S
864 865 0 2 Gill, Mr. John William male 24.0 0 0 233866 13.0000 NaN S
865 866 1 2 Bystrom, Mrs. (Karolina) female 42.0 0 0 236852 13.0000 NaN S
866 867 1 2 Duran y More, Miss. Asuncion female 27.0 1 0 SC/PARIS 2149 13.8583 NaN C
867 868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0 0 PC 17590 50.4958 A24 S
868 869 0 3 van Melkebeke, Mr. Philemon male NaN 0 0 345777 9.5000 NaN S
869 870 1 3 Johnson, Master. Harold Theodor male 4.0 1 1 347742 11.1333 NaN S
870 871 0 3 Balkic, Mr. Cerin male 26.0 0 0 349248 7.8958 NaN S
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 11751 52.5542 D35 S
872 873 0 1 Carlsson, Mr. Frans Olof male 33.0 0 0 695 5.0000 B51 B53 B55 S
873 874 0 3 Vander Cruyssen, Mr. Victor male 47.0 0 0 345765 9.0000 NaN S
874 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 NaN C
875 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.0 0 0 2667 7.2250 NaN C
876 877 0 3 Gustafsson, Mr. Alfred Ossian male 20.0 0 0 7534 9.8458 NaN S
877 878 0 3 Petroff, Mr. Nedelio male 19.0 0 0 349212 7.8958 NaN S
878 879 0 3 Laleff, Mr. Kristo male NaN 0 0 349217 7.8958 NaN S
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C
880 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0 1 230433 26.0000 NaN S
881 882 0 3 Markun, Mr. Johann male 33.0 0 0 349257 7.8958 NaN S
882 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0 0 0 7552 10.5167 NaN S
883 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 NaN S
884 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 NaN S
885 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 382652 29.1250 NaN Q
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns


In [9]:
# Missing data in Age and Cabin
# For the missing ages, impute the mean age
dataset.Age = dataset.Age.fillna(dataset.Age.mean())

# For the missing cabins, substitute the placeholder 'SC'   (drop or fill?)
#dataset = dataset.dropna(axis=0, how="any")
dataset.Cabin = dataset.Cabin.fillna('SC')
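
As an aside, a small sketch (meant to run before the imputation above) to quantify missingness; the median is a common, more outlier-robust alternative to the mean.

# Per-column count of missing values (sketch; run before filling)
print(dataset.isnull().sum())
# dataset.Age = dataset.Age.fillna(dataset.Age.median())  # median alternative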

In [10]:
##4. Convert categorical attributes to numeric category codes
cate = ['Name','Sex','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
datasetc = dataset  # note: alias, not a copy; dataset itself is modified in place
for c in cate:
    datasetc[c] = dataset[c].astype('category')
    datasetc[c] = datasetc[c].cat.codes
#dataset.dtypes
datasetc


Out[10]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 108 1 22.000000 1 0 523 18 146 2
1 2 1 1 190 0 38.000000 1 0 596 207 81 0
2 3 1 3 353 0 26.000000 0 0 669 41 146 2
3 4 1 1 272 0 35.000000 1 0 49 189 55 2
4 5 0 3 15 1 35.000000 0 0 472 43 146 2
5 6 0 3 554 1 29.699118 0 0 275 51 146 1
6 7 0 1 515 1 54.000000 0 0 85 186 129 2
7 8 0 3 624 1 2.000000 3 1 395 124 146 2
8 9 1 3 412 0 27.000000 0 2 344 74 146 2
9 10 1 2 576 0 14.000000 1 0 132 154 146 0
10 11 1 3 727 0 4.000000 1 1 616 110 145 2
11 12 1 1 95 0 58.000000 0 0 38 143 49 2
12 13 0 3 729 1 20.000000 0 0 535 43 146 2
13 14 0 3 28 1 39.000000 1 5 333 158 146 2
14 15 0 3 840 0 14.000000 0 0 413 36 146 2
15 16 1 2 359 0 55.000000 0 0 153 108 146 2
16 17 0 3 682 1 2.000000 4 1 480 151 146 1
17 18 1 2 867 1 29.699118 0 0 151 85 146 2
18 19 0 3 839 0 31.000000 1 0 301 113 146 2
19 20 1 3 512 0 29.699118 0 0 184 16 146 0
20 21 0 2 273 1 35.000000 0 0 139 138 146 2
21 22 1 2 80 1 34.000000 0 0 152 85 111 2
22 23 1 3 523 0 15.000000 0 0 278 42 146 1
23 24 1 1 765 1 28.000000 0 0 42 168 13 2
24 25 0 3 626 0 8.000000 3 1 395 124 146 2
25 26 1 3 44 0 38.000000 1 5 329 159 146 2
26 27 0 3 240 1 29.699118 0 0 179 16 146 0
27 28 0 1 260 1 19.000000 3 2 95 246 63 2
28 29 1 3 605 0 29.699118 0 0 283 38 146 1
29 30 0 3 813 1 29.699118 0 0 362 40 146 2
... ... ... ... ... ... ... ... ... ... ... ... ...
861 862 0 2 283 1 21.000000 1 0 221 76 146 2
862 863 1 1 797 0 48.000000 0 0 88 137 94 2
863 864 0 3 718 0 29.699118 6 2 568 205 146 2
864 865 0 2 285 1 24.000000 0 0 120 85 146 2
865 866 1 2 122 0 42.000000 0 0 126 85 146 2
866 867 1 2 230 0 27.000000 1 0 632 89 146 0
867 868 0 1 696 1 31.000000 0 0 590 184 6 2
868 869 0 3 890 1 29.699118 0 0 309 64 146 2
869 870 1 3 407 1 4.000000 1 1 344 74 146 2
870 871 0 3 59 1 26.000000 0 0 387 40 146 2
871 872 1 1 79 0 47.000000 1 1 54 188 102 2
872 873 0 1 139 1 33.000000 0 0 502 2 34 2
873 874 0 3 836 1 47.000000 0 0 303 58 146 2
874 875 1 2 4 0 28.000000 1 0 575 132 146 0
875 876 1 3 571 0 15.000000 0 0 195 16 146 0
876 877 0 3 311 1 20.000000 0 0 504 69 146 2
877 878 0 3 657 1 19.000000 0 0 358 40 146 2
878 879 0 3 451 1 29.699118 0 0 363 40 146 2
879 880 1 1 668 0 56.000000 0 1 59 221 70 0
880 881 1 2 739 0 25.000000 0 1 115 138 146 2
881 882 0 3 510 1 33.000000 0 0 394 40 146 2
882 883 0 3 193 0 22.000000 0 0 508 73 146 2
883 884 0 2 61 1 28.000000 0 0 565 72 146 2
884 885 0 3 793 1 25.000000 0 0 650 12 146 2
885 886 0 3 684 0 39.000000 0 5 480 151 146 1
886 887 0 2 548 1 27.000000 0 0 101 85 146 2
887 888 1 1 303 0 19.000000 0 0 14 153 30 2
888 889 0 3 413 0 29.699118 1 2 675 131 146 2
889 890 1 1 81 1 26.000000 0 0 8 153 60 0
890 891 0 3 220 1 32.000000 0 0 466 30 146 1

891 rows × 12 columns
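
Note that cat.codes imposes an arbitrary ordering on nominal attributes such as Sex and Embarked, which a distance-based method like kNN then treats as magnitudes. A sketch of the one-hot alternative (not used in this solution; onehot is a hypothetical variable), which avoids that artificial ordering:

# One-hot sketch: each category becomes a binary indicator column
onehot = pd.get_dummies(dataset, columns=['Sex', 'Embarked'])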


In [11]:
##5. Normalize the numeric attributes to [0, 1]
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
dataset_scaled = min_max_scaler.fit_transform(dataset)
datasetn = pd.DataFrame(dataset_scaled)
datasetn.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
datasetn


Out[11]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0.000000 0.0 1.0 0.121348 1.0 0.271174 0.166667 0.000000 0.769118 0.072874 0.993197 1.000000
1 0.001124 1.0 0.0 0.213483 0.0 0.472229 0.166667 0.000000 0.876471 0.838057 0.551020 0.333333
2 0.002247 1.0 1.0 0.396629 0.0 0.321438 0.000000 0.000000 0.983824 0.165992 0.993197 1.000000
3 0.003371 1.0 0.0 0.305618 0.0 0.434531 0.166667 0.000000 0.072059 0.765182 0.374150 1.000000
4 0.004494 0.0 1.0 0.016854 1.0 0.434531 0.000000 0.000000 0.694118 0.174089 0.993197 1.000000
5 0.005618 0.0 1.0 0.622472 1.0 0.367921 0.000000 0.000000 0.404412 0.206478 0.993197 0.666667
6 0.006742 0.0 0.0 0.578652 1.0 0.673285 0.000000 0.000000 0.125000 0.753036 0.877551 1.000000
7 0.007865 0.0 1.0 0.701124 1.0 0.019854 0.500000 0.166667 0.580882 0.502024 0.993197 1.000000
8 0.008989 1.0 1.0 0.462921 0.0 0.334004 0.000000 0.333333 0.505882 0.299595 0.993197 1.000000
9 0.010112 1.0 0.5 0.647191 0.0 0.170646 0.166667 0.000000 0.194118 0.623482 0.993197 0.333333
10 0.011236 1.0 1.0 0.816854 0.0 0.044986 0.166667 0.166667 0.905882 0.445344 0.986395 1.000000
11 0.012360 1.0 0.0 0.106742 0.0 0.723549 0.000000 0.000000 0.055882 0.578947 0.333333 1.000000
12 0.013483 0.0 1.0 0.819101 1.0 0.246042 0.000000 0.000000 0.786765 0.174089 0.993197 1.000000
13 0.014607 0.0 1.0 0.031461 1.0 0.484795 0.166667 0.833333 0.489706 0.639676 0.993197 1.000000
14 0.015730 0.0 1.0 0.943820 0.0 0.170646 0.000000 0.000000 0.607353 0.145749 0.993197 1.000000
15 0.016854 1.0 0.5 0.403371 0.0 0.685851 0.000000 0.000000 0.225000 0.437247 0.993197 1.000000
16 0.017978 0.0 1.0 0.766292 1.0 0.019854 0.666667 0.166667 0.705882 0.611336 0.993197 0.666667
17 0.019101 1.0 0.5 0.974157 1.0 0.367921 0.000000 0.000000 0.222059 0.344130 0.993197 1.000000
18 0.020225 0.0 1.0 0.942697 0.0 0.384267 0.166667 0.000000 0.442647 0.457490 0.993197 1.000000
19 0.021348 1.0 1.0 0.575281 0.0 0.367921 0.000000 0.000000 0.270588 0.064777 0.993197 0.333333
20 0.022472 0.0 0.5 0.306742 1.0 0.434531 0.000000 0.000000 0.204412 0.558704 0.993197 1.000000
21 0.023596 1.0 0.5 0.089888 1.0 0.421965 0.000000 0.000000 0.223529 0.344130 0.755102 1.000000
22 0.024719 1.0 1.0 0.587640 0.0 0.183212 0.000000 0.000000 0.408824 0.170040 0.993197 0.666667
23 0.025843 1.0 0.0 0.859551 1.0 0.346569 0.000000 0.000000 0.061765 0.680162 0.088435 1.000000
24 0.026966 0.0 1.0 0.703371 0.0 0.095250 0.500000 0.166667 0.580882 0.502024 0.993197 1.000000
25 0.028090 1.0 1.0 0.049438 0.0 0.472229 0.166667 0.833333 0.483824 0.643725 0.993197 1.000000
26 0.029213 0.0 1.0 0.269663 1.0 0.367921 0.000000 0.000000 0.263235 0.064777 0.993197 0.333333
27 0.030337 0.0 0.0 0.292135 1.0 0.233476 0.500000 0.333333 0.139706 0.995951 0.428571 1.000000
28 0.031461 1.0 1.0 0.679775 0.0 0.367921 0.000000 0.000000 0.416176 0.153846 0.993197 0.666667
29 0.032584 0.0 1.0 0.913483 1.0 0.367921 0.000000 0.000000 0.532353 0.161943 0.993197 1.000000
... ... ... ... ... ... ... ... ... ... ... ... ...
861 0.967416 0.0 0.5 0.317978 1.0 0.258608 0.166667 0.000000 0.325000 0.307692 0.993197 1.000000
862 0.968539 1.0 0.0 0.895506 0.0 0.597889 0.000000 0.000000 0.129412 0.554656 0.639456 1.000000
863 0.969663 0.0 1.0 0.806742 0.0 0.367921 1.000000 0.333333 0.835294 0.829960 0.993197 1.000000
864 0.970787 0.0 0.5 0.320225 1.0 0.296306 0.000000 0.000000 0.176471 0.344130 0.993197 1.000000
865 0.971910 1.0 0.5 0.137079 0.0 0.522493 0.000000 0.000000 0.185294 0.344130 0.993197 1.000000
866 0.973034 1.0 0.5 0.258427 0.0 0.334004 0.166667 0.000000 0.929412 0.360324 0.993197 0.333333
867 0.974157 0.0 0.0 0.782022 1.0 0.384267 0.000000 0.000000 0.867647 0.744939 0.040816 1.000000
868 0.975281 0.0 1.0 1.000000 1.0 0.367921 0.000000 0.000000 0.454412 0.259109 0.993197 1.000000
869 0.976404 1.0 1.0 0.457303 1.0 0.044986 0.166667 0.166667 0.505882 0.299595 0.993197 1.000000
870 0.977528 0.0 1.0 0.066292 1.0 0.321438 0.000000 0.000000 0.569118 0.161943 0.993197 1.000000
871 0.978652 1.0 0.0 0.088764 0.0 0.585323 0.166667 0.166667 0.079412 0.761134 0.693878 1.000000
872 0.979775 0.0 0.0 0.156180 1.0 0.409399 0.000000 0.000000 0.738235 0.008097 0.231293 1.000000
873 0.980899 0.0 1.0 0.939326 1.0 0.585323 0.000000 0.000000 0.445588 0.234818 0.993197 1.000000
874 0.982022 1.0 0.5 0.004494 0.0 0.346569 0.166667 0.000000 0.845588 0.534413 0.993197 0.333333
875 0.983146 1.0 1.0 0.641573 0.0 0.183212 0.000000 0.000000 0.286765 0.064777 0.993197 0.333333
876 0.984270 0.0 1.0 0.349438 1.0 0.246042 0.000000 0.000000 0.741176 0.279352 0.993197 1.000000
877 0.985393 0.0 1.0 0.738202 1.0 0.233476 0.000000 0.000000 0.526471 0.161943 0.993197 1.000000
878 0.986517 0.0 1.0 0.506742 1.0 0.367921 0.000000 0.000000 0.533824 0.161943 0.993197 1.000000
879 0.987640 1.0 0.0 0.750562 0.0 0.698417 0.000000 0.166667 0.086765 0.894737 0.476190 0.333333
880 0.988764 1.0 0.5 0.830337 0.0 0.308872 0.000000 0.166667 0.169118 0.558704 0.993197 1.000000
881 0.989888 0.0 1.0 0.573034 1.0 0.409399 0.000000 0.000000 0.579412 0.161943 0.993197 1.000000
882 0.991011 0.0 1.0 0.216854 0.0 0.271174 0.000000 0.000000 0.747059 0.295547 0.993197 1.000000
883 0.992135 0.0 0.5 0.068539 1.0 0.346569 0.000000 0.000000 0.830882 0.291498 0.993197 1.000000
884 0.993258 0.0 1.0 0.891011 1.0 0.308872 0.000000 0.000000 0.955882 0.048583 0.993197 1.000000
885 0.994382 0.0 1.0 0.768539 0.0 0.484795 0.000000 0.833333 0.705882 0.611336 0.993197 0.666667
886 0.995506 0.0 0.5 0.615730 1.0 0.334004 0.000000 0.000000 0.148529 0.344130 0.993197 1.000000
887 0.996629 1.0 0.0 0.340449 0.0 0.233476 0.000000 0.000000 0.020588 0.619433 0.204082 1.000000
888 0.997753 0.0 1.0 0.464045 0.0 0.367921 0.166667 0.333333 0.992647 0.530364 0.993197 1.000000
889 0.998876 1.0 0.0 0.091011 1.0 0.321438 0.000000 0.000000 0.011765 0.619433 0.408163 0.333333
890 1.000000 0.0 1.0 0.247191 1.0 0.396833 0.000000 0.000000 0.685294 0.121457 0.993197 0.666667

891 rows × 12 columns
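
MinMaxScaler rescales each column to [0, 1] via x' = (x - min) / (max - min); a manual sketch (manual_scaled is a hypothetical variable) that should reproduce the result:

# Equivalent manual min-max normalization (sketch)
manual_scaled = (dataset - dataset.min()) / (dataset.max() - dataset.min())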


In [12]:
##6. Select features based on their correlation with the target
datasetn.corr()


Out[12]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId 1.000000 -0.005007 -0.035144 -0.038559 0.042939 0.033207 -0.065229 -0.001652 -0.056554 -0.006390 -0.035197 0.012985
Survived -0.005007 1.000000 -0.338481 -0.057343 -0.543351 -0.069809 -0.026385 0.081629 -0.164549 0.333943 -0.253658 -0.176509
Pclass -0.035144 -0.338481 1.000000 0.052831 0.131900 -0.331339 0.078141 0.018443 0.319869 -0.724119 0.682176 0.173511
Name -0.038559 -0.057343 0.052831 1.000000 0.020314 0.057466 -0.035535 -0.049105 0.047348 -0.053846 0.062119 -0.010633
Sex 0.042939 -0.543351 0.131900 0.020314 1.000000 0.084153 -0.123164 -0.245489 0.059372 -0.265389 0.095991 0.118492
Age 0.033207 -0.069809 -0.331339 0.057466 0.084153 1.000000 -0.254997 -0.179191 -0.068848 0.110296 -0.234912 -0.039610
SibSp -0.065229 -0.026385 0.078141 -0.035535 -0.123164 -0.254997 1.000000 0.423338 0.069238 0.368688 0.040687 0.069165
Parch -0.001652 0.081629 0.018443 -0.049105 -0.245489 -0.179191 0.423338 1.000000 0.020003 0.361243 -0.028179 0.043351
Ticket -0.056554 -0.164549 0.319869 0.047348 0.059372 -0.068848 0.069238 0.020003 1.000000 -0.168153 0.243082 0.011146
Fare -0.006390 0.333943 -0.724119 -0.053846 -0.265389 0.110296 0.368688 0.361243 -0.168153 1.000000 -0.538549 -0.169849
Cabin -0.035197 -0.253658 0.682176 0.062119 0.095991 -0.234912 0.040687 -0.028179 0.243082 -0.538549 1.000000 0.226137
Embarked 0.012985 -0.176509 0.173511 -0.010633 0.118492 -0.039610 0.069165 0.043351 0.011146 -0.169849 0.226137 1.000000
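
To read the table above more directly, the correlations with Survived can be ranked by absolute value (a small sketch):

# Rank attributes by |correlation| with Survived (sketch)
print(datasetn.corr()['Survived'].abs().sort_values(ascending=False))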

In [13]:
'''
Based on each attribute's correlation with Survived, the attributes with absolute correlation above 0.3 are kept.
  Correlation with Survived:
   Strong/moderate: Pclass, Sex, Fare
   Weak: PassengerId, SibSp, Embarked, Age, Name, Parch, Ticket, Cabin
'''

# Drop the attributes deemed irrelevant for prediction
datasetd = datasetn.drop(['PassengerId','SibSp','Embarked','Age','Name','Parch','Ticket','Cabin'], axis=1)
#datasetd

In [14]:
##7. Split the dataset into train (75%) / test (25%) / validation (10% of the train portion)

# Separate the target attribute Survived (output) from the input attributes
y = datasetd.Survived.values
X = datasetd.drop('Survived', axis=1).values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.1)
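
With test_size=0.25 followed by test_size=0.1 on the remaining training portion, the final proportions are roughly 67.5% train / 25% test / 7.5% validation of the 891 rows. A quick sketch to confirm:

# Confirm split sizes (sketch): roughly 601 train, 67 validation, 223 test
print(len(X_train), len(X_validation), len(X_test))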

In [15]:
##8. Run the classifier for k from 3 to 30, in steps of 4
from sklearn import metrics
a_accuracy = []
ks = []
for k in range(3,30,4):
    knn = KNNClassifier()
    knn.fit(X_train, y_train)
    result = knn.predict(k, X_test)
    accuracy = metrics.accuracy_score(result,y_test)
    a_accuracy.append(accuracy)
    ks.append(k)
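
As a cross-check (not part of the original solution), scikit-learn's built-in KNeighborsClassifier should yield comparable accuracies on the same split:

# Sketch: compare against sklearn's kNN over the same k values
from sklearn.neighbors import KNeighborsClassifier
for k in range(3, 30, 4):
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print(k, metrics.accuracy_score(y_test, clf.predict(X_test)))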

In [16]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(ks, a_accuracy, marker='o');
plt.grid()
plt.title('Accuracy vs. number of neighbors')
plt.xlabel('k')
plt.ylabel(u'Accuracy')


Out[16]:
Text(0,0.5,'Accuracy')

In [17]:
##9. Run the classifier with the best k
# find the best accuracy (max over the accuracy values, not over the indices)
index, best_accuracy = max(enumerate(a_accuracy), key=itemgetter(1))
print('Best accuracy: ', best_accuracy, ' | Index:', index,
     '\nBest k: ', ks[index])


Best accuracy:  0.834080717489  | Index: 6 
Best k:  27

In [18]:
knn_best = KNNClassifier()
knn_best.fit(X_train, y_train)
result = knn_best.predict(ks[index], X_validation)
accuracy = metrics.accuracy_score(result,y_validation)
print('{0:f}'.format(accuracy))


0.835821

In [19]:
from sklearn.metrics import classification_report
print(classification_report(y_validation, result, target_names=['Not Survived', 'Survived']))


              precision    recall  f1-score   support

Not Survived       0.80      0.98      0.88        41
    Survived       0.94      0.62      0.74        26

 avg / total       0.85      0.84      0.83        67
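
A confusion matrix complements the report above by showing raw counts per class (a small sketch over the same validation predictions):

# Sketch: rows = true class (0 = Not Survived, 1 = Survived), columns = predicted
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_validation, result))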

