In [1]:
import pandas as pd
from scipy.spatial import distance as dst
from collections import Counter
import numpy as np

In [2]:
data = pd.read_csv('atomsradii.csv')
data


Out[2]:
rWC rCh Atom Type
0 0.78 0.50 B PT
1 0.90 0.67 Si PT
2 0.97 0.65 Ga PT
3 1.04 0.76 Al PT
4 1.10 0.79 Ir PT
5 0.32 0.62 Zn TM
6 0.45 0.68 Cd TM
7 0.14 0.66 Be Alk
8 0.25 0.87 Mg Alk
9 0.19 0.99 Li Alk
10 0.28 1.01 Na Alk
11 0.54 1.23 Ca Alk
12 0.59 1.34 K Alk
13 0.69 1.36 Sr Alk
14 0.74 1.45 Rb Alk

In [3]:
test = pd.read_csv('testing.csv')
test


Out[3]:
rWC rCh Atom Type
0 0.51 1.12 X1 Alk
1 0.37 0.77 X2 TM
2 0.62 0.35 X3 PT
3 0.62 0.62 X4 TM
4 0.62 0.93 X5 Alk

In [4]:
def distance(row1, row2):
    return dst.euclidean(row1, row2)

In [5]:
def gen_dist_list(train_df, row):
    dist_list = []
    for i in range(train_df.shape[0]):
        dist_list.append([distance(train_df.ix[i, 0:2], row), train_df.ix[i, 3]])
    dist_list.sort()
    return dist_list

In [6]:
def decide_type(dist_list, k):
    type_list = Counter(np.array(dist_list[0:k])[:, 1]).most_common()
    count_list = list(np.array(type_list)[:, 1])
    if count_list.count(max(count_list)) == 1:
        type_ = type_list[0][0]
    else:
        count = count_list.count(max(count_list))
        avg_dist_list = []
        for tp in np.array(type_list[0:count])[:, 0]:
            avg_dist = 0
            for item in dist_list:
                if item[1] == tp:
                    avg_dist += item[0]
            avg_dist_list.append([avg_dist, tp])
            avg_dist_list.sort()
            type_ = avg_dist_list[0][1]
    return(type_)

In [7]:
def knn(train_df, pred_df, k):
    type_list = []
    for i in range(pred_df.shape[0]):
        dist_list = gen_dist_list(train_df, pred_df.ix[i, 0:2])
        type_list.append(decide_type(dist_list, k))
    pred_df['Pred_type'] = type_list
    return pred_df

In [8]:
knn(data, test, 7)


Out[8]:
rWC rCh Atom Type Pred_type
0 0.51 1.12 X1 Alk Alk
1 0.37 0.77 X2 TM Alk
2 0.62 0.35 X3 PT PT
3 0.62 0.62 X4 TM PT
4 0.62 0.93 X5 Alk Alk

In [9]:
def gen_k_dict(train_df, pred_df, k):
    k_dict = {}
    for i in k:
        pred_df = knn(train_df, pred_df, i)
        accuracy = 0
        for j in range(pred_df.shape[0]):
            if pred_df.loc[j].Type == pred_df.loc[j].Pred_type:
                accuracy += 1
        k_dict[i] = accuracy/pred_df.shape[0]
    return k_dict

In [10]:
gen_k_dict(data, test, list(range(1, 15)))


Out[10]:
{1: 0.8,
 2: 0.6,
 3: 0.6,
 4: 0.8,
 5: 0.6,
 6: 0.6,
 7: 0.6,
 8: 0.6,
 9: 0.6,
 10: 0.6,
 11: 0.6,
 12: 0.6,
 13: 0.4,
 14: 0.4}

In [ ]: