## K-NN Classifier

### Import packages and data:

``````

In :

import knn
import pandas as pd

``````
``````

In :

``````
``````

In :

test

``````
``````

Out:

rWC
rCh
Atom
Type

0
0.51
1.12
X1
Alk

1
0.37
0.77
X2
TM

2
0.62
0.35
X3
PT

3
0.62
0.62
X4
TM

4
0.62
0.93
X5
Alk

``````
``````

In :

data

``````
``````

Out:

rWC
rCh
Atom
Type

0
0.78
0.50
B
PT

1
0.90
0.67
Si
PT

2
0.97
0.65
Ga
PT

3
1.04
0.76
Al
PT

4
1.10
0.79
Ir
PT

5
0.32
0.62
Zn
TM

6
0.45
0.68
Cd
TM

7
0.14
0.66
Be
Alk

8
0.25
0.87
Mg
Alk

9
0.19
0.99
Li
Alk

10
0.28
1.01
Na
Alk

11
0.54
1.23
Ca
Alk

12
0.59
1.34
K
Alk

13
0.69
1.36
Sr
Alk

14
0.74
1.45
Rb
Alk

``````

### Functions:

#### 1. dist(vec1,vec2): calculate Euclidean distance.

• input: 2 same size 1-D arrays.
• output: Euclidean distance between them.
``````

In :

"""
def dist(vec1, vec2):
    # Return the Euclidean distance between two 1-D sequences.
    #
    # vec1, vec2: 1-D array-likes of numbers with the same length.
    # Returns: the distance computed by scipy.spatial.distance.euclidean.
    #
    # scipy is imported locally, as in the original listing, so the
    # definition is self-contained.
    from scipy.spatial import distance
    return distance.euclidean(vec1, vec2)
"""
print()

``````
``````

``````
``````

In :

#Try: distance between the first two features (rWC, rCh) of row 0 in
#the training data and row 0 in the test data.
knn.dist(data.loc[0][0:2],test.loc[0][0:2])

``````
``````

Out:

0.67623960250786863

``````

#### 2. sortdist(df, vec): calculate the Euclidean distance between the input 1-D array and each row of the input dataframe.

• input: (df: a dataframe; vec: 1-D array with 2 elements)
• output: a list of [distance, row index] pairs, ordered from the smallest to the largest distance.
``````

In :

"""
def sortdist(df, vec):
    # Compute the Euclidean distance from vec to every row of df.
    #
    # df:  dataframe whose first two columns hold the numeric features.
    # vec: 1-D array-like with 2 elements.
    # Returns: a list of [distance, row index] pairs sorted in ascending
    #          order (by distance first, then by index on ties).
    #
    # .iloc makes it explicit that the first two columns are taken by
    # position, whatever the column labels are.
    distlist = [[dist(row.iloc[0:2], vec), index]
                for index, row in df.iterrows()]
    distlist.sort()
    return distlist
"""
print()

``````
``````

``````
``````

In :

#Try
knn.sortdist(data,[0.5,0.5])

``````
``````

Out:

[[0.18681541692269407, 6],
[0.21633307652783934, 5],
[0.28000000000000003, 0],
[0.39395431207184417, 7],
[0.43462627624201466, 1],
[0.44654227123532214, 8],
[0.49335585534176041, 2],
[0.55542776307995267, 10],
[0.57982756057296891, 9],
[0.59933296255086788, 3],
[0.66640828326184554, 4],
[0.73109506905736954, 11],
[0.84480767041972349, 12],
[0.88073832663283147, 13],
[0.97984692682071517, 14]]

``````

#### 3. classify(k,df,vec): assign a 1-D array to a group, given a k value and a training dataframe.

• input: (k: k values; df: training dataframe, there should be a column named 'Type'; vec: 1-D array with 2 elements)
• output: the group that vec belongs to.

##### Note: If two or more groups tie for the highest count, the group whose nearest member is closest to vec is chosen among them. (Please see the example below.)
``````

In :

"""
def classify(k, df, vec):
    # Assign vec to a group by majority vote among its k nearest rows of df.
    #
    # k:   number of nearest neighbours to consult (must be >= 1).
    # df:  training dataframe; the first two columns are the features and
    #      there must be a column named 'Type' holding the group labels.
    # vec: 1-D array-like with 2 elements.
    # Returns: the 'Type' label that occurs most often among the k nearest
    #          rows. When several labels tie for the highest count, the one
    #          whose first occurrence is nearest to vec wins.
    # Raises:  ValueError if k < 1 or df has no rows.
    #
    # If k is larger than the number of rows, all rows are used.
    from collections import Counter
    if k < 1:
        raise ValueError('k must be at least 1')
    #Choose the k nearest [distance, index] pairs.
    kdistlist = sortdist(df, vec)[0:k]
    if not kdistlist:
        raise ValueError('training dataframe has no rows')
    #Look up the group of each neighbour by its row index (second element
    #of each pair), keeping nearest-first order.
    grouplist = [df.loc[index, 'Type'] for _, index in kdistlist]
    counts = Counter(grouplist)
    best = max(counts.values())
    #Walking grouplist nearest-first means the first label with the top
    #count is also the tie winner with the closest member.
    for label in grouplist:
        if counts[label] == best:
            return label
"""
print()

``````
``````

``````
``````

In :

# Try
knn.classify(8,data,[0.5,0.5])

``````
``````

Out:

'PT'

``````

In this case, the group types in order of the distance to point [0.5, 0.5] are ['TM', 'TM', 'PT', 'Alk', 'PT', 'Alk', 'PT', 'Alk'] (can be checked with the `sortdist` function). There are 2 'TM', 3 'PT', and 3 'Alk'. It will choose 'PT' rather than 'Alk' because the first 'PT' (index = 2) is in front of the first 'Alk' (index = 3), which means the first 'PT' is closer to the point [0.5,0.5].

#### 4. knn(traindf,inputdf,k): k-NN classification for a given k.

• input: (traindf: training dataframe with a column named 'Type'; inputdf: dataframe which needs to be classified; k: k value)
• output: inputdf with a new column named 'Knn Type'.
``````

In :

"""
def knn(traindf, inputdf, k):
    # Classify every row of inputdf against traindf with k nearest neighbours.
    #
    # traindf: training dataframe with a column named 'Type'.
    # inputdf: dataframe to classify; its first two columns are the features.
    # k:       number of neighbours passed to classify.
    # Returns: inputdf itself, with a column named 'Knn Type' added (or
    #          overwritten) in place.
    #
    # .iloc takes the two feature columns by position, so an existing
    # 'Knn Type' column from an earlier call is never used as a feature.
    inputdf['Knn Type'] = [classify(k, traindf, row.iloc[0:2])
                           for _, row in inputdf.iterrows()]
    return inputdf
"""
print()

``````
``````

``````
``````

In :

knn.knn(data, test, 4)

``````
``````

Out:

rWC
rCh
Atom
Type
Knn Type

0
0.51
1.12
X1
Alk
Alk

1
0.37
0.77
X2
TM
TM

2
0.62
0.35
X3
PT
PT

3
0.62
0.62
X4
TM
TM

4
0.62
0.93
X5
Alk
Alk

``````

#### 5. knnac(traindf,inputdf,k): calculate accuracy for different k values.

• input: (traindf: training dataframe with a column named 'Type'; inputdf: dataframe which needs to be classified; k: a list of k value)
• output: a dictionary with each k as a key and the classification accuracy on the input (test) set as its value.
``````

In :

"""
def knnac(traindf, inputdf, k):
    # Compute the classification accuracy on inputdf for each value of k.
    #
    # traindf: training dataframe with a column named 'Type'.
    # inputdf: dataframe to classify; it must also have a 'Type' column
    #          holding the true labels. knn adds a 'Knn Type' column to it.
    # k:       iterable of k values.
    # Returns: dict mapping each k to the share of rows whose 'Knn Type'
    #          equals 'Type', formatted as a percentage string such as
    #          '80.0%'.
    # Raises:  ZeroDivisionError if inputdf has no rows.
    k_ac = {}
    total = inputdf.shape[0]
    for i in k:
        #Classify once per k and compare the two label columns row by row.
        labelled = knn(traindf, inputdf, i)
        n = int((labelled['Type'] == labelled['Knn Type']).sum())
        k_ac[i] = '{0:.1%}'.format(n / total)
    return k_ac
"""
print()

``````
``````

``````
``````

In :

#Try every k from 1 up to (number of training rows - 1).
knn.knnac(data,test,range(1,data.shape[0]))

``````
``````

Out:

{1: '80.0%',
2: '80.0%',
3: '60.0%',
4: '100.0%',
5: '60.0%',
6: '60.0%',
7: '60.0%',
8: '60.0%',
9: '60.0%',
10: '60.0%',
11: '60.0%',
12: '60.0%',
13: '40.0%',
14: '40.0%'}

``````

### Unit tests:

``````

In :

import test_knn as tk

``````

#### 1.

``````

In :

tk.test_dist()

``````
``````

In :

"""
def test_dist():
    # Check knn.dist on two vectors that differ by 1 in a single coordinate.
    import math
    import knn
    vec1 = [0, 0, 0, 1]
    vec2 = [0, 0, 0, 0]
    #Compare as floats: truncating with int() would also accept any wrong
    #result in the range [1, 2).
    assert math.isclose(knn.dist(vec1, vec2), 1.0), "Distance calculate not correct."
"""
print()

``````
``````

``````

#### 2.

``````

In :

tk.test_sortdist()

``````
``````

In :

"""
def test_sortdist():
    # Check that knn.sortdist puts the nearest row first with its index.
    import knn
    import pandas as pd
    df = pd.DataFrame([[0,3], [0,1], [0,2]])
    vec = [0,0]
    #Row 1 ([0,1]) is the closest to the origin, at distance 1.
    nearest = knn.sortdist(df,vec)[0]
    assert int(nearest[0]) == 1, "Sort not handled properly"
    assert int(nearest[1]) == 1, "Index finding not handled properly."
"""
print()

``````
``````

``````

#### 3.

``````

In :

tk.test_classify()

``````
``````

In :

"""
def test_classify():
    # Check knn.classify when the k nearest rows all belong to different groups.
    import knn
    import pandas as pd
    df = pd.DataFrame([[0,3,'C'], [0,1,'B'], [0,2,'A']],columns = [1,2,'Type'])
    vec = [0,0]
    #Every group appears once, so for k = 2 and k = 3 the vote is tied and
    #the nearest row's group ('B', at distance 1) must win.
    for k in (1, 2, 3):
        assert knn.classify(k,df,vec) == 'B', "Group finding not handled properly for k=%d." % k
"""
print()

``````
``````

``````

#### 4.

``````

In :

tk.test_knn()

``````
``````

In :

"""
def test_knn():
    # Check the labels knn.knn assigns and that it appends exactly one column.
    import knn
    import pandas as pd
    df1 = pd.DataFrame([[0,3,'C'], [0,1,'B'], [0,2,'A']],columns = [1,2,'Type'])
    df2 = pd.DataFrame([[0,3.1], [0,1.1], [0,2.1]])
    result = knn.knn(df1,df2,1)
    assert list(result['Knn Type']) == ['C', 'B', 'A'], "Knn not handled properly."
    #Two feature columns plus the appended 'Knn Type' column.
    assert result.shape[1] == 3, "Column appending not handled properly."
"""
print()

``````
``````

``````

#### 5.

``````

In :

tk.test_knnac()

``````
``````

In :

"""
def test_knnac():
    # Check the type, size and k = 1 accuracy of the dict returned by knn.knnac.
    import knn
    import pandas as pd
    df1 = pd.DataFrame([[0,3,'C'], [0,1,'B'], [0,2,'A']],columns = [1,2,'Type'])
    df2 = pd.DataFrame([[0,3.1,'C'], [0,1.1,'B'], [0,2.1,'A']],columns = [1,2,'Type'])
    result = knn.knnac(df1,df2,[1,2,3])
    assert isinstance(result, dict), "Dictionary not handled properly"
    assert len(result) == 3, "Maybe miss a calculation of a k value."
    #With k = 1 every test row sits next to the training row with its own label.
    assert result[1] == '100.0%', "Accuracy not handled properly."
"""
print()

``````
``````

``````
``````

In [ ]:

``````