K-NN Classifier

Import packages and data:



In [1]:

    
import knn
import pandas as pd



In [2]:

    
# Points to classify; passed below as `inputdf`.
test = pd.read_csv('testing.csv')
# Labelled training set; passed below as `traindf`.
data = pd.read_csv('atomsradii.csv')



In [3]:

    
# Show the test set loaded from 'testing.csv'.
test



In [4]:

    
# Show the training set loaded from 'atomsradii.csv'.
data

Functions:

1. dist(vec1,vec2): calculate the Euclidean distance.

input: two 1-D arrays of the same size.
output: Euclidean distance between them.



In [5]:

    
"""
def dist(vec1,vec2):
    '''Return the Euclidean distance between two equal-length 1-D sequences.

    The computation is delegated to scipy.spatial.distance.euclidean.
    '''
    from scipy.spatial.distance import euclidean
    separation = euclidean(vec1, vec2)
    return separation
"""
print()



In [6]:

    
# Example: distance between the first two entries of training row 0
# and the first two entries of test row 0.
knn.dist(data.loc[0][0:2],test.loc[0][0:2])









    Out[6]:





0.67623960250786863

2. sortdist(df, vec): calculate the Euclidean distance between the input 1-D array and each row of the input dataframe.

input: (df: a dataframe; vec: 1-D array with 2 elements)
output: a list of [distance, row index] pairs, sorted from the smallest distance to the largest.



In [7]:

    
"""
def sortdist(df, vec):
    '''Rank the rows of df by their Euclidean distance to vec.

    Only the first two entries of each row are compared with vec.
    Returns a list of [distance, row index] pairs in ascending order;
    pairs with equal distance are ordered by row index.
    '''
    ranked = sorted(
        [dist(row[0:2], vec), label] for label, row in df.iterrows()
    )
    return ranked
"""
print()



In [8]:

    
# Example: rank every training row by its distance to the point [0.5, 0.5].
knn.sortdist(data,[0.5,0.5])









    Out[8]:





[[0.18681541692269407, 6],
 [0.21633307652783934, 5],
 [0.28000000000000003, 0],
 [0.39395431207184417, 7],
 [0.43462627624201466, 1],
 [0.44654227123532214, 8],
 [0.49335585534176041, 2],
 [0.55542776307995267, 10],
 [0.57982756057296891, 9],
 [0.59933296255086788, 3],
 [0.66640828326184554, 4],
 [0.73109506905736954, 11],
 [0.84480767041972349, 12],
 [0.88073832663283147, 13],
 [0.97984692682071517, 14]]

3. classify(k,df,vec): give 1-D array a group by choosing a k value and training dataframe.

input: (k: k values; df: training dataframe, there should be a column named 'Type'; vec: 1-D array with 2 elements)
output: the group that vec belongs to.

##### Note: If two or more groups tie for the highest count, the tied group that contains the closest point is chosen. (Please see the example below.)



In [9]:

    
"""
def classify(k,df,vec):
    '''Assign vec to a group by majority vote among its k nearest rows of df.

    k   -- number of nearest neighbours that vote; must be at least 1.
           A k larger than the number of rows in df uses every row.
    df  -- training dataframe with a 'Type' column holding the group labels.
    vec -- point to classify; passed unchanged to sortdist.

    Returns the most common 'Type' among the neighbours.  When several
    groups tie for the highest count, the tied group whose first member is
    closest to vec wins.

    Raises ValueError when no neighbour can vote (k < 1 or df has no rows).
    '''
    from collections import Counter
    if k < 1:
        raise ValueError('k must be at least 1')
    # The slice holds at most k pairs, so walking the slice itself (rather
    # than range(k)) cannot run past the end when k exceeds the row count.
    neighbours = sortdist(df, vec)[0:k]
    labels = [df.loc[row_label, 'Type'] for _, row_label in neighbours]
    if not labels:
        raise ValueError('df has no rows to vote with')
    votes = Counter(labels)
    top_count = max(votes.values())
    # labels is in ascending distance order, so the first label that reaches
    # the top count is the unique majority, or the nearest of the tied groups.
    return next(label for label in labels if votes[label] == top_count)
"""
print()



In [10]:

    
# Example: classify the point [0.5, 0.5] by a vote among its 8 nearest training rows.
knn.classify(8,data,[0.5,0.5])









    Out[10]:





'PT'

In this case, the group types in order of distance to the point [0.5, 0.5] are ['TM', 'TM', 'PT', 'Alk', 'PT', 'Alk', 'PT', 'Alk'] (this can be checked with the sortdist function). There are 2 'TM', 3 'PT', and 3 'Alk'. It chooses 'PT' rather than 'Alk' because the first 'PT' (index = 2) comes before the first 'Alk' (index = 3), which means the first 'PT' is closer to the point [0.5, 0.5].

4. knn(traindf,inputdf,k): K-NN classification for a given k.

input: (traindf: training dataframe with a column named 'Type'; inputdf: dataframe which needs to be classified; k: k value)
output: inputdf with a new column named 'Knn Type'.



In [11]:

    
"""
def knn(traindf,inputdf,k):
    '''Classify every row of inputdf against traindf with a k-nearest-neighbour vote.

    The first two entries of each inputdf row are handed to classify.  The
    predicted groups are written into a 'Knn Type' column of inputdf itself
    (the caller's dataframe is modified in place) and that same dataframe
    is returned.
    '''
    predicted = [classify(k, traindf, row[0:2]) for _, row in inputdf.iterrows()]
    inputdf['Knn Type'] = predicted
    return inputdf
"""
print()



In [12]:

    
# Classify every test row with k = 4; knn adds a 'Knn Type' column to `test` and returns it.
knn.knn(data, test, 4)

5. knnac(traindf,inputdf,k): calculate the accuracy for different k values.

input: (traindf: training dataframe with a column named 'Type'; inputdf: dataframe which needs to be classified; k: a list of k value)
output: a dictionary with each k as a key and the classification accuracy on the test set (formatted as a percentage string) as the value.



In [13]:

    
"""
def knnac(traindf,inputdf,k):
    '''Measure classification accuracy on inputdf for each k value.

    traindf -- training dataframe with a 'Type' column.
    inputdf -- dataframe to classify; its 'Type' column holds the true
               groups.  knn writes its predictions into a 'Knn Type' column
               of this dataframe.
    k       -- iterable of k values to evaluate.

    Returns a dictionary mapping each k to the fraction of rows whose
    'Knn Type' equals 'Type', formatted as a percentage string with one
    decimal place (for example '80.0%').
    '''
    k_ac = {}
    total = inputdf.shape[0]
    for i in k:
        # Classify once per k and compare whole columns, instead of
        # re-running knn twice for every row.
        classified = knn(traindf, inputdf, i)
        hits = int((classified['Type'] == classified['Knn Type']).sum())
        k_ac[i] = '{0:.1%}'.format(hits / total)
    return k_ac
"""
print()



In [14]:

    
# Example: accuracy on the test set for every k from 1 up to one less than
# the number of training rows (range stops before data.shape[0]).
knn.knnac(data,test,range(1,data.shape[0]))









    Out[14]:





{1: '80.0%',
 2: '80.0%',
 3: '60.0%',
 4: '100.0%',
 5: '60.0%',
 6: '60.0%',
 7: '60.0%',
 8: '60.0%',
 9: '60.0%',
 10: '60.0%',
 11: '60.0%',
 12: '60.0%',
 13: '40.0%',
 14: '40.0%'}

Unit tests:



In [15]:

    
import test_knn as tk

1.



In [16]:

    
# Run the dist unit test; a failed check raises AssertionError.
tk.test_dist()



In [17]:

    
"""
def test_dist():
    import knn
    vec1 = [0, 0, 0, 1]
    vec2 = [0, 0, 0, 0]
    assert int(knn.dist(vec1, vec2)) == 1, "Distance calculate not correct."
"""
print()

2.



In [18]:

    
# Run the sortdist unit test; a failed check raises AssertionError.
tk.test_sortdist()



In [19]:

    
"""
def test_sortdist():
    import knn
    import pandas as pd
    df = pd.DataFrame([[0,3], [0,1], [0,2]])
    vec = [0,0]
    assert int(knn.sortdist(df,vec)[0][0]) == 1, "Sort not handled properly"
    assert int(knn.sortdist(df,vec)[0][1]) == 1, "Index finding not handled properly."
"""
print()

3.



In [20]:

    
# Run the classify unit test; a failed check raises AssertionError.
tk.test_classify()



In [21]:

    
"""
def test_classify():
    import knn
    import pandas as pd
    df = pd.DataFrame([[0,3,'C'], [0,1,'B'], [0,2,'A']],columns = [1,2,'Type'])
    vec = [0,0]
#Test if there are multiple group type by a given k.
    assert knn.classify(1,df,vec) == 'B', "Group finding not handled properly."
    assert knn.classify(2,df,vec) == 'B', "Group finding not handled properly."
    assert knn.classify(3,df,vec) == 'B', "Group finding not handled properly."
"""
print()

4.



In [22]:

    
# Run the knn unit test; a failed check raises AssertionError.
tk.test_knn()



In [23]:

    
"""
def test_knn():
    import knn
    import pandas as pd
    df1 = pd.DataFrame([[0,3,'C'], [0,1,'B'], [0,2,'A']],columns = [1,2,'Type'])
    df2 = pd.DataFrame([[0,3.1], [0,1.1], [0,2.1]])
    assert list(knn.knn(df1,df2,1)['Knn Type']) == ['C', 'B', 'A'], "Knn not handled properly."
    assert knn.knn(df1,df2,1).shape[1] == 3, "Column appending not handled properly."
"""
print()

5.



In [24]:

    
# Run the knnac unit test; a failed check raises AssertionError.
tk.test_knnac()



In [25]:

    
"""
def test_knnac():
    import knn
    import pandas as pd
    df1 = pd.DataFrame([[0,3,'C'], [0,1,'B'], [0,2,'A']],columns = [1,2,'Type'])
    df2 = pd.DataFrame([[0,3.1,'C'], [0,1.1,'B'], [0,2.1,'A']],columns = [1,2,'Type'])
    assert len(knn.knnac(df1,df2,[1,2,3])) == 3, "Maybe miss a calculation of a k value."
    assert knn.knnac(df1,df2,[1,2,3])[1] == '100.0%', "Accuracy not handled properly."
    assert type(knn.knnac(df1,df2,[1,2,3])) == dict, "Dictionary not handled properly"
"""
print()



In [ ]:

	rWC	rCh	Atom	Type
0	0.51	1.12	X1	Alk
1	0.37	0.77	X2	TM
2	0.62	0.35	X3	PT
3	0.62	0.62	X4	TM
4	0.62	0.93	X5	Alk

	rWC	rCh	Atom	Type
0	0.78	0.50	B	PT
1	0.90	0.67	Si	PT
2	0.97	0.65	Ga	PT
3	1.04	0.76	Al	PT
4	1.10	0.79	Ir	PT
5	0.32	0.62	Zn	TM
6	0.45	0.68	Cd	TM
7	0.14	0.66	Be	Alk
8	0.25	0.87	Mg	Alk
9	0.19	0.99	Li	Alk
10	0.28	1.01	Na	Alk
11	0.54	1.23	Ca	Alk
12	0.59	1.34	K	Alk
13	0.69	1.36	Sr	Alk
14	0.74	1.45	Rb	Alk

	rWC	rCh	Atom	Type	Knn Type
0	0.51	1.12	X1	Alk	Alk
1	0.37	0.77	X2	TM	TM
2	0.62	0.35	X3	PT	PT
3	0.62	0.62	X4	TM	TM
4	0.62	0.93	X5	Alk	Alk