In [1]:
import numpy as np
from sklearn import datasets
%matplotlib inline
import matplotlib.pyplot as plt
import collections

In [2]:
class KNNClassifier(object):
    def __init__(self):
        self.X_train = None
        self.y_train = None

    def euc_distance(self, a, b):
        return np.linalg.norm(a-b)

    def closest(self, row,k):
        """
        Retorna a classe respondente ao ponto mais próximo do dataset de treino.\
        É um exemplo de implementação do kNN com k=1.
        """
        dists = [self.euc_distance(row, item) for item in self.X_train][self.y_trains[row]]
        dists.sort(key=operator.itemgetter(1))
        neighbors = []
        for x in range(k):
            neighbors.append(dist[x][0])
        return neighbors
    def define_class(neighbors):
        classVotes = {}
        for x in range(len(neighbors)):
            response = neighbors[x][-1]
            if response in classVotes:
                classVotes[response] += 1
            else:
                classVotes[response] = 1
        sortedVotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)
        return sortedVotes[0][0]
    def fit(self, training_data, training_labels):
        self.X_train = training_data
        self.y_train = training_labels

    def predict(self, to_classify):
        predictions = []
        for row in to_classify:
            label = self.closest(row)
            predictions.append(label)
        return predictions

In [3]:
titanic = open('train.csv')

In [4]:
from sklearn import metrics

In [5]:
from sklearn.neighbors import KNeighborsClassifier

In [6]:
knn = KNeighborsClassifier(p=2, metric='minkowski')

In [7]:
import pandas as pd

In [8]:
titanic = pd.read_csv('train.csv',na_values="?" )

In [9]:
print(titanic)


     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
5              6         0       3   
6              7         0       1   
7              8         0       3   
8              9         1       3   
9             10         1       2   
10            11         1       3   
11            12         1       1   
12            13         0       3   
13            14         0       3   
14            15         0       3   
15            16         1       2   
16            17         0       3   
17            18         1       2   
18            19         0       3   
19            20         1       3   
20            21         0       2   
21            22         1       2   
22            23         1       3   
23            24         1       1   
24            25         0       3   
25            26         1       3   
26            27         0       3   
27            28         0       1   
28            29         1       3   
29            30         0       3   
..           ...       ...     ...   
861          862         0       2   
862          863         1       1   
863          864         0       3   
864          865         0       2   
865          866         1       2   
866          867         1       2   
867          868         0       1   
868          869         0       3   
869          870         1       3   
870          871         0       3   
871          872         1       1   
872          873         0       1   
873          874         0       3   
874          875         1       2   
875          876         1       3   
876          877         0       3   
877          878         0       3   
878          879         0       3   
879          880         1       1   
880          881         1       2   
881          882         0       3   
882          883         0       3   
883          884         0       2   
884          885         0       3   
885          886         0       3   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
5                                     Moran, Mr. James    male   NaN      0   
6                              McCarthy, Mr. Timothy J    male  54.0      0   
7                       Palsson, Master. Gosta Leonard    male   2.0      3   
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
9                  Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
10                     Sandstrom, Miss. Marguerite Rut  female   4.0      1   
11                            Bonnell, Miss. Elizabeth  female  58.0      0   
12                      Saundercock, Mr. William Henry    male  20.0      0   
13                         Andersson, Mr. Anders Johan    male  39.0      1   
14                Vestrom, Miss. Hulda Amanda Adolfina  female  14.0      0   
15                    Hewlett, Mrs. (Mary D Kingcome)   female  55.0      0   
16                                Rice, Master. Eugene    male   2.0      4   
17                        Williams, Mr. Charles Eugene    male   NaN      0   
18   Vander Planke, Mrs. Julius (Emelia Maria Vande...  female  31.0      1   
19                             Masselmani, Mrs. Fatima  female   NaN      0   
20                                Fynney, Mr. Joseph J    male  35.0      0   
21                               Beesley, Mr. Lawrence    male  34.0      0   
22                         McGowan, Miss. Anna "Annie"  female  15.0      0   
23                        Sloper, Mr. William Thompson    male  28.0      0   
24                       Palsson, Miss. Torborg Danira  female   8.0      3   
25   Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...  female  38.0      1   
26                             Emir, Mr. Farred Chehab    male   NaN      0   
27                      Fortune, Mr. Charles Alexander    male  19.0      3   
28                       O'Dwyer, Miss. Ellen "Nellie"  female   NaN      0   
29                                 Todoroff, Mr. Lalio    male   NaN      0   
..                                                 ...     ...   ...    ...   
861                        Giles, Mr. Frederick Edward    male  21.0      1   
862  Swift, Mrs. Frederick Joel (Margaret Welles Ba...  female  48.0      0   
863                  Sage, Miss. Dorothy Edith "Dolly"  female   NaN      8   
864                             Gill, Mr. John William    male  24.0      0   
865                           Bystrom, Mrs. (Karolina)  female  42.0      0   
866                       Duran y More, Miss. Asuncion  female  27.0      1   
867               Roebling, Mr. Washington Augustus II    male  31.0      0   
868                        van Melkebeke, Mr. Philemon    male   NaN      0   
869                    Johnson, Master. Harold Theodor    male   4.0      1   
870                                  Balkic, Mr. Cerin    male  26.0      0   
871   Beckwith, Mrs. Richard Leonard (Sallie Monypeny)  female  47.0      1   
872                           Carlsson, Mr. Frans Olof    male  33.0      0   
873                        Vander Cruyssen, Mr. Victor    male  47.0      0   
874              Abelson, Mrs. Samuel (Hannah Wizosky)  female  28.0      1   
875                   Najib, Miss. Adele Kiamie "Jane"  female  15.0      0   
876                      Gustafsson, Mr. Alfred Ossian    male  20.0      0   
877                               Petroff, Mr. Nedelio    male  19.0      0   
878                                 Laleff, Mr. Kristo    male   NaN      0   
879      Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)  female  56.0      0   
880       Shelley, Mrs. William (Imanita Parrish Hall)  female  25.0      0   
881                                 Markun, Mr. Johann    male  33.0      0   
882                       Dahlberg, Miss. Gerda Ulrika  female  22.0      0   
883                      Banfield, Mr. Frederick James    male  28.0      0   
884                             Sutehall, Mr. Henry Jr    male  25.0      0   
885               Rice, Mrs. William (Margaret Norton)  female  39.0      0   
886                              Montvila, Rev. Juozas    male  27.0      0   
887                       Graham, Miss. Margaret Edith  female  19.0      0   
888           Johnston, Miss. Catherine Helen "Carrie"  female   NaN      1   
889                              Behr, Mr. Karl Howell    male  26.0      0   
890                                Dooley, Mr. Patrick    male  32.0      0   

     Parch            Ticket      Fare        Cabin Embarked  
0        0         A/5 21171    7.2500          NaN        S  
1        0          PC 17599   71.2833          C85        C  
2        0  STON/O2. 3101282    7.9250          NaN        S  
3        0            113803   53.1000         C123        S  
4        0            373450    8.0500          NaN        S  
5        0            330877    8.4583          NaN        Q  
6        0             17463   51.8625          E46        S  
7        1            349909   21.0750          NaN        S  
8        2            347742   11.1333          NaN        S  
9        0            237736   30.0708          NaN        C  
10       1           PP 9549   16.7000           G6        S  
11       0            113783   26.5500         C103        S  
12       0         A/5. 2151    8.0500          NaN        S  
13       5            347082   31.2750          NaN        S  
14       0            350406    7.8542          NaN        S  
15       0            248706   16.0000          NaN        S  
16       1            382652   29.1250          NaN        Q  
17       0            244373   13.0000          NaN        S  
18       0            345763   18.0000          NaN        S  
19       0              2649    7.2250          NaN        C  
20       0            239865   26.0000          NaN        S  
21       0            248698   13.0000          D56        S  
22       0            330923    8.0292          NaN        Q  
23       0            113788   35.5000           A6        S  
24       1            349909   21.0750          NaN        S  
25       5            347077   31.3875          NaN        S  
26       0              2631    7.2250          NaN        C  
27       2             19950  263.0000  C23 C25 C27        S  
28       0            330959    7.8792          NaN        Q  
29       0            349216    7.8958          NaN        S  
..     ...               ...       ...          ...      ...  
861      0             28134   11.5000          NaN        S  
862      0             17466   25.9292          D17        S  
863      2          CA. 2343   69.5500          NaN        S  
864      0            233866   13.0000          NaN        S  
865      0            236852   13.0000          NaN        S  
866      0     SC/PARIS 2149   13.8583          NaN        C  
867      0          PC 17590   50.4958          A24        S  
868      0            345777    9.5000          NaN        S  
869      1            347742   11.1333          NaN        S  
870      0            349248    7.8958          NaN        S  
871      1             11751   52.5542          D35        S  
872      0               695    5.0000  B51 B53 B55        S  
873      0            345765    9.0000          NaN        S  
874      0         P/PP 3381   24.0000          NaN        C  
875      0              2667    7.2250          NaN        C  
876      0              7534    9.8458          NaN        S  
877      0            349212    7.8958          NaN        S  
878      0            349217    7.8958          NaN        S  
879      1             11767   83.1583          C50        C  
880      1            230433   26.0000          NaN        S  
881      0            349257    7.8958          NaN        S  
882      0              7552   10.5167          NaN        S  
883      0  C.A./SOTON 34068   10.5000          NaN        S  
884      0   SOTON/OQ 392076    7.0500          NaN        S  
885      5            382652   29.1250          NaN        Q  
886      0            211536   13.0000          NaN        S  
887      0            112053   30.0000          B42        S  
888      2        W./C. 6607   23.4500          NaN        S  
889      0            111369   30.0000         C148        C  
890      0            370376    7.7500          NaN        Q  

[891 rows x 12 columns]

In [25]:
titanic.dtypes


Out[25]:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [31]:
obj_df = titanic.select_dtypes(include=['object']).copy()

In [32]:
obj_df.head()


Out[32]:
Name Sex Ticket Cabin Embarked
0 Braund, Mr. Owen Harris male A/5 21171 NaN S
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female PC 17599 C85 C
2 Heikkinen, Miss. Laina female STON/O2. 3101282 NaN S
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 113803 C123 S
4 Allen, Mr. William Henry male 373450 NaN S

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
sex_encoder = LabelEncoder().fit(titanic.Sex)
titanic['Sex_encoded'] = sex_encoder.transform(titanic.Sex)

In [29]:
cabin_encoder = LabelEncoder().fit(titanic.Cabin.fillna("None"))
titanic["Cabin_encoded"] = cabin_encoder.transform(titanic.Cabin.fillna("None"))
embarked_encoder = LabelEncoder().fit(titanic.Embarked.fillna("None"))
titanic["Embarked_encoded"] = embarked_encoder.transform(titanic.Embarked.fillna("None"))
titanic['Age_imputed'] = titanic.Age.fillna(titanic.Age.mean())

In [30]:
columns = ['Pclass', 'Sex_encoded', 'Cabin_encoded', 'Embarked_encoded', 'Age_imputed', 'SibSp', 'Parch']
titanic[columns].corr()


Out[30]:
Pclass Sex_encoded Cabin_encoded Embarked_encoded Age_imputed SibSp Parch
Pclass 1.000000 0.131900 0.682176 0.197493 -0.331339 0.083081 0.018443
Sex_encoded 0.131900 1.000000 0.095991 0.106395 0.084153 -0.114631 -0.245489
Cabin_encoded 0.682176 0.095991 1.000000 0.232192 -0.234912 0.043525 -0.028179
Embarked_encoded 0.197493 0.106395 0.232192 1.000000 -0.034883 0.068043 0.032517
Age_imputed -0.331339 0.084153 -0.234912 -0.034883 1.000000 -0.232625 -0.179191
SibSp 0.083081 -0.114631 0.043525 0.068043 -0.232625 1.000000 0.414838
Parch 0.018443 -0.245489 -0.028179 0.032517 -0.179191 0.414838 1.000000

In [31]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

In [32]:
columns.remove("Cabin_encoded")

In [33]:
X = MinMaxScaler().fit_transform(titanic[columns])
y = titanic.Survived.values

In [34]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.1, random_state=1)

In [35]:
from sklearn.neighbors import KNeighborsClassifier

In [36]:
knn = KNeighborsClassifier()

In [37]:
splitRand = ShuffleSplit(test_size=.25, n_splits=1, random_state=0)

n_neighbors = np.arange(1, 30, 4)
param_grid = {'n_neighbors': n_neighbors}

In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
gridS = GridSearchCV(knn, param_grid, cv=splitRand)
gridS.fit(X_train, y_train)


Out[39]:
GridSearchCV(cv=ShuffleSplit(n_splits=1, random_state=0, test_size=0.25, train_size=None),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([ 1,  5,  9, 13, 17, 21, 25, 29])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [40]:
plt.figure(figsize=(10, 6))
plt.plot(n_neighbors, gridS.cv_results_['mean_train_score'])
plt.plot(n_neighbors, gridS.cv_results_['mean_test_score'])
plt.legend(['treino', 'teste'])
plt.title("Precisão")
plt.xlabel("Vizinhos")


Out[40]:
Text(0.5,0,'Vizinhos')

In [41]:
gridS.best_params_


Out[41]:
{'n_neighbors': 9}

In [42]:
melhor = gridS.best_estimator_
melhor.fit(X_train,y_train)


Out[42]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

In [43]:
from sklearn.metrics import classification_report
print(classification_report(melhor.predict(X_valid), y_valid))


             precision    recall  f1-score   support

          0       0.87      0.72      0.79        64
          1       0.51      0.73      0.60        26

avg / total       0.77      0.72      0.73        90


In [ ]:


In [ ]: