In [2]:
import numpy as np
from sklearn import datasets
from sklearn.neighbors import DistanceMetric
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import math
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

In [3]:
class KNNClassifier(object):
    def __init__(self):
        self.X_train = None
        self.y_train = None

    def euc_distance(self, a, b):
        return DistanceMetric.get_metric('euclidean').pairwise([a],[b])[0][0]

    def closest(self, row):
        dists = [self.euc_distance(row, item) for  item in self.X_train]
        nei = dists.index(min(dists))
        
        return self.y_train[nei]
    
    def k_closest(self, row, k):
        dists = [(index,self.euc_distance(row, item)) for index,item in enumerate(self.X_train)]
        dists.sort(key = lambda x: x[1])
        vizinhos = []
        for i in range(k):
            vizinhos.append(self.y_train[dists[i][0]])
        classe = Counter(vizinhos).most_common(1)[0][0]
        
        return classe
        
    def fit(self, training_data, training_labels):
        self.X_train = training_data
        self.y_train = training_labels
        
        

    def predict(self, to_classify, k=3):
        predictions = []
        for row in to_classify:
            label = self.k_closest(row, k)
            predictions.append(label)
        return predictions

In [4]:
df = pd.read_csv('train.csv')


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-4-39b4f6e799f9> in <module>()
----> 1 df = pd.read_csv('train.csv')

~/miniconda2/envs/ambiente_1/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

~/miniconda2/envs/ambiente_1/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

~/miniconda2/envs/ambiente_1/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

~/miniconda2/envs/ambiente_1/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

~/miniconda2/envs/ambiente_1/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)()

FileNotFoundError: File b'train.csv' does not exist

In [ ]:
df['Age'] = df.Age.fillna(df.Age.mean())
df = df.where((pd.notnull(df)), 0)
for f in ["Sex", "Embarked"]:
    df[f] = df[f].astype('category')
    df[f] = df[f].cat.codes

df.head()

In [ ]:
feat = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]
X = df.get(feat)
X

In [ ]:
Y = df["Survived"].values

In [ ]:
Y

In [ ]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)

In [5]:
knn = KNeighborsClassifier()

In [6]:
knn.fit(X_train,Y_train)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-921344aee3ea> in <module>()
----> 1 knn.fit(X_train,Y_train)

NameError: name 'X_train' is not defined

In [7]:
resultado = knn.predict(X_test)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-1c715c984946> in <module>()
----> 1 resultado = knn.predict(X_test)

NameError: name 'X_test' is not defined

In [12]:
accuracy_score(Y_test, resultado)


Out[12]:
0.66816143497757852

In [13]:
scores = []
ks = range(1,120, 4)
n_ks = len(ks)

for k in ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train,Y_train)
    scores.append(knn.score(X_test,Y_test))

In [14]:
fig,ax = plt.subplots()
ax.plot(ks,scores,'b',linewidth=3)
ax.set_ylabel('Accuracy')
ax.set_xlabel('K Neighbors')

plt.grid()



In [15]:
max(scores)


Out[15]:
0.68609865470852016

In [16]:
bestK = scores.index(max(scores))

bestK = bestK*4 +1
bestK


Out[16]:
21

In [17]:
knn = KNeighborsClassifier(n_neighbors=bestK)
knn.fit(X_train,Y_train)


Out[17]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=21, p=2,
           weights='uniform')

In [18]:
result = knn.predict(X_test)

In [19]:
acr = accuracy_score(Y_test, result)
acr


Out[19]:
0.68609865470852016

In [20]:
report = classification_report(knn.predict(X_test), Y_test)

In [21]:
print(report)


             precision    recall  f1-score   support

          0       0.82      0.70      0.76       155
          1       0.49      0.65      0.56        68

avg / total       0.72      0.69      0.70       223


In [23]:
knn = KNNClassifier()

In [24]:
knn.fit(X_train.values, Y_train)

In [25]:
resultado = knn.predict(X_test.values, k=bestK)

In [26]:
print(classification_report(resultado, Y_test))


             precision    recall  f1-score   support

          0       0.83      0.69      0.75       159
          1       0.46      0.64      0.53        64

avg / total       0.72      0.68      0.69       223


In [27]:
accuracy_score(Y_test, resultado)


Out[27]:
0.67713004484304928

In [22]: