In [1]:
import pandas as pd
from scipy.spatial import distance as dst
from collections import Counter
import numpy as np
In [2]:
# Read 'atomsradii.csv' into a DataFrame. This frame is later passed to knn()
# and gen_k_dict() as `train_df`, i.e. it serves as the training set.
data = pd.read_csv('atomsradii.csv')
# Bare last expression: the notebook renders the frame as the cell output.
data
Out[2]:
In [3]:
# Read 'testing.csv' into a DataFrame. This frame is later passed to knn()
# and gen_k_dict() as `pred_df`, i.e. the samples to classify.
# NOTE(review): gen_k_dict() reads a 'Type' column from it — presumably the
# file carries the true labels alongside the features; confirm against the CSV.
test = pd.read_csv('testing.csv')
# Bare last expression: the notebook renders the frame as the cell output.
test
Out[3]:
In [4]:
def distance(row1, row2):
    """Return the Euclidean distance between two feature vectors.

    Thin wrapper around ``scipy.spatial.distance.euclidean`` so the metric
    used by the classifier is defined in exactly one place.

    Parameters
    ----------
    row1, row2 : array-like
        One-dimensional numeric vectors of equal length.

    Returns
    -------
    float
        The straight-line distance between ``row1`` and ``row2``.
    """
    euclidean_gap = dst.euclidean(row1, row2)
    return euclidean_gap
In [5]:
def gen_dist_list(train_df, row):
    """Measure how far ``row`` is from every training sample, nearest first.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training samples. The first two columns (by position) are read as the
        numeric features and the column at position 3 as the class label.
    row : array-like
        Feature vector of the sample being classified; it is compared against
        the two feature columns, so it must hold two numeric values.

    Returns
    -------
    list of [distance, label]
        One entry per row of ``train_df``, sorted ascending by distance
        (entries with equal distance are ordered by label).
    """
    # `DataFrame.ix` was removed in pandas 1.0. `.iloc` keeps the positional
    # meaning of the column selectors and, unlike the old row lookup, does not
    # depend on the frame having a default 0..n-1 index.
    # Converting to float once also avoids handing object-dtype rows (which a
    # mixed-dtype frame produces) to the distance function.
    features = np.asarray(train_df.iloc[:, 0:2], dtype=float)
    labels = train_df.iloc[:, 3].tolist()
    query = np.asarray(row, dtype=float)

    dist_list = [
        [distance(sample, query), label]
        for sample, label in zip(features, labels)
    ]
    dist_list.sort()
    return dist_list
In [6]:
def decide_type(dist_list, k):
    """Pick the class label for a sample from its ``k`` nearest neighbours.

    The label with the most votes among the first ``k`` entries of
    ``dist_list`` wins. When several labels share the highest vote count, the
    one whose neighbours are closest in total (equivalently on average, since
    tied labels have the same number of neighbours) is chosen; if that is
    still tied, the label met first in ``dist_list`` (the nearest) wins.

    Parameters
    ----------
    dist_list : list of [distance, label]
        Neighbours sorted by ascending distance, as produced by
        ``gen_dist_list``.
    k : int
        Number of nearest neighbours that vote; must be at least 1.

    Returns
    -------
    The winning label, exactly as stored in ``dist_list``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``dist_list`` is empty.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    neighbours = dist_list[:k]
    if not neighbours:
        raise ValueError("dist_list must contain at least one neighbour")

    # Count votes on the raw labels. Routing the (label, count) pairs through
    # a NumPy array would turn the counts into strings, and string comparison
    # ranks '9' above '10', which mis-detects ties once a count reaches 10.
    votes = Counter(item[1] for item in neighbours)
    top_votes = max(votes.values())
    # Counter keeps insertion order, so tied labels are listed nearest-first.
    tied = [label for label, n_votes in votes.items() if n_votes == top_votes]
    if len(tied) == 1:
        return tied[0]

    # Tie-break on distance using only the k voting neighbours. Summing over
    # the whole training set would penalise classes merely for being large.
    totals = {label: 0.0 for label in tied}
    for item in neighbours:
        if item[1] in totals:
            totals[item[1]] += item[0]
    return min(tied, key=totals.__getitem__)
In [7]:
def knn(train_df, pred_df, k):
    """Classify every row of ``pred_df`` with a k-nearest-neighbours vote.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Labelled training samples, in the layout ``gen_dist_list`` expects.
    pred_df : pandas.DataFrame
        Samples to classify. The first two columns (by position) are used as
        the feature vector of each row.
    k : int
        Number of nearest neighbours that vote on each prediction.

    Returns
    -------
    pandas.DataFrame
        A copy of ``pred_df`` with the predictions stored in a 'Pred_type'
        column (replacing that column if it already exists). The frame passed
        in is left untouched, so re-running a cell never changes its inputs.
    """
    # `.iloc` replaces `DataFrame.ix`, which was removed in pandas 1.0, and
    # addresses rows by position so a non-default index is handled correctly.
    predictions = [
        decide_type(gen_dist_list(train_df, pred_df.iloc[i, 0:2]), k)
        for i in range(pred_df.shape[0])
    ]
    result = pred_df.copy()
    result['Pred_type'] = predictions
    return result
In [8]:
# Classify the rows of `test` against `data` with k = 7 neighbours. The frame
# returned by knn() carries the predictions in a 'Pred_type' column and is
# rendered as this cell's output.
knn(data, test, 7)
Out[8]:
In [9]:
def gen_k_dict(train_df, pred_df, k):
    """Measure classification accuracy for each candidate neighbour count.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Labelled training samples, forwarded to ``knn``.
    pred_df : pandas.DataFrame
        Samples to classify. Must contain a 'Type' column holding the true
        label of each row.
    k : iterable of int
        Neighbour counts to evaluate.

    Returns
    -------
    dict
        Maps each value in ``k`` to the fraction of rows (0.0 to 1.0) whose
        'Pred_type' equals 'Type'.

    Raises
    ------
    ValueError
        If ``pred_df`` has no rows, since accuracy is undefined in that case.
    """
    n_rows = pred_df.shape[0]
    if n_rows == 0:
        raise ValueError("pred_df must contain at least one row to compute accuracy")

    k_dict = {}
    for n_neighbours in k:
        scored = knn(train_df, pred_df, n_neighbours)
        # Compare the two columns in one vectorised step. Looking rows up with
        # `.loc[j]` for positional j fails when the index is not 0..n-1.
        hits = (scored['Type'] == scored['Pred_type']).sum()
        k_dict[n_neighbours] = float(hits) / n_rows
    return k_dict
In [10]:
# Evaluate accuracy on `test` for every k from 1 to 14; the resulting
# {k: accuracy} dict is rendered as this cell's output.
gen_k_dict(data, test, list(range(1, 15)))
Out[10]:
In [ ]: