In [1]:
import knn
import pandas as pd
In [2]:
# Read the two CSV files used throughout the notebook. Further down, `data` is
# passed as the training frame and `test` as the frame to classify
# (see the knn.knn(data, test, 4) call).
test = pd.read_csv('testing.csv')
data = pd.read_csv('atomsradii.csv')
In [3]:
# Display the frame read from testing.csv.
test
Out[3]:
In [4]:
# Display the frame read from atomsradii.csv.
data
Out[4]:
In [5]:
"""
def dist(vec1, vec2):
    # Return the Euclidean distance between two equal-length 1-D vectors.
    # The computation is delegated to scipy.spatial.distance.euclidean.
    # The import is kept local so the function does not rely on a
    # module-level import being present.
    from scipy.spatial import distance
    return distance.euclidean(vec1, vec2)
"""
print()
In [6]:
# Try dist on the first two entries of the row labelled 0 in each frame.
knn.dist(data.loc[0][0:2],test.loc[0][0:2])
Out[6]:
In [7]:
"""
def sortdist(df, vec):
    # Compute the Euclidean distance between vec and the first two columns of
    # every row of df.
    # Returns a list of [distance, index_label] pairs sorted from the smallest
    # to the largest distance; equal distances are ordered by index label.
    # .iloc makes the positional "first two columns" slice explicit.
    distlist = [[dist(row.iloc[0:2], vec), index] for index, row in df.iterrows()]
    distlist.sort()
    return distlist
"""
print()
In [8]:
# Try sortdist on the training frame with the query point [0.5, 0.5].
knn.sortdist(data,[0.5,0.5])
Out[8]:
In [9]:
"""
def classify(k, df, vec):
    # Predict the 'Type' label of vec by majority vote among the k rows of df
    # that are closest to it (distances come from sortdist).
    # If several labels share the highest count, the one that appears first in
    # ascending-distance order wins, i.e. the tied label with the nearest member.
    # If k is larger than the number of rows, every row takes part in the vote.
    # Raises ValueError when k < 1 and IndexError when df has no rows.
    from collections import Counter
    if k < 1:
        raise ValueError("k must be at least 1.")
    # Slicing (instead of indexing with range(k)) stays valid when k exceeds
    # the number of available rows.
    nearest = sortdist(df, vec)[0:k]
    if not nearest:
        raise IndexError("df has no rows to vote with.")
    # Labels of the nearest rows, still ordered from closest to farthest.
    grouplist = [df.loc[index, 'Type'] for _, index in nearest]
    counts = Counter(grouplist)
    top = max(counts.values())
    # Scanning in distance order resolves ties in favour of the closest label;
    # without a tie the only label holding the top count is returned.
    for label in grouplist:
        if counts[label] == top:
            return label
"""
print()
In [10]:
# Try classify with k = 8 for the query point [0.5, 0.5]; the tie-break that
# decides this result is explained in the text below.
knn.classify(8,data,[0.5,0.5])
Out[10]:
In this case, the group types ordered by distance to the point [0.5, 0.5] are ['TM', 'TM', 'PT', 'Alk', 'PT', 'Alk', 'PT', 'Alk'] (this can be checked with the sortdist function).
There are 2 'TM', 3 'PT', and 3 'Alk'. The function chooses 'PT' rather than 'Alk' because the first 'PT' (index = 2) comes before the first 'Alk' (index = 3), which means the first 'PT' is closer to the point [0.5, 0.5].
In [11]:
"""
def knn(traindf, inputdf, k):
    # Classify every row of inputdf against traindf with classify, using the
    # first two columns of each row as the query point.
    # The predictions are written to a 'Knn Type' column.
    # NOTE: inputdf is modified in place and the same object is returned.
    inputdf['Knn Type'] = [
        classify(k, traindf, row.iloc[0:2]) for _, row in inputdf.iterrows()
    ]
    return inputdf
"""
print()
In [12]:
# Classify each row of `test` against `data` with k = 4. Per the listing above,
# this also adds a 'Knn Type' column to `test` in place.
knn.knn(data, test, 4)
Out[12]:
In [13]:
"""
def knnac(traindf, inputdf, k):
    # Compute the classification accuracy for every value in the iterable k.
    # inputdf must contain the true labels in a 'Type' column.
    # Returns a dictionary mapping each k value to the share of rows whose
    # 'Knn Type' equals 'Type', formatted as a percentage string such as '87.5%'.
    # NOTE: knn adds/overwrites the 'Knn Type' column of inputdf in place.
    k_ac = {}
    total = inputdf.shape[0]
    for i in k:
        # Classify once per k value instead of re-running knn for every row.
        predicted = knn(traindf, inputdf, i)
        # Element-wise comparison does not depend on the index labels.
        n = int((predicted['Type'] == predicted['Knn Type']).sum())
        k_ac[i] = '{0:.1%}'.format(n / total)
    return k_ac
"""
print()
In [14]:
# Try knnac for every k from 1 up to data.shape[0] - 1.
knn.knnac(data,test,range(1,data.shape[0]))
Out[14]:
In [15]:
import test_knn as tk
In [16]:
# Run the dist test from test_knn (presumably the source shown in the next cell).
tk.test_dist()
In [17]:
"""
def test_dist():
    # Check dist on two vectors that differ by exactly 1 in a single coordinate.
    import knn
    vec1 = [0, 0, 0, 1]
    vec2 = [0, 0, 0, 0]
    # Compare with a tolerance: truncating with int() would also accept
    # wrong results such as 1.9.
    assert abs(knn.dist(vec1, vec2) - 1) < 1e-9, "Distance calculate not correct."
"""
print()
In [18]:
# Run the sortdist test from test_knn (presumably the source shown in the next cell).
tk.test_sortdist()
In [19]:
"""
def test_sortdist():
    # Check that sortdist puts the closest row first and reports its index label.
    import knn
    import pandas as pd
    df = pd.DataFrame([[0,3], [0,1], [0,2]])
    vec = [0,0]
    # The row [0, 1] (index label 1) is the closest to the origin, at distance 1.
    nearest = knn.sortdist(df, vec)[0]
    # Tolerance comparison: int() truncation would accept any value in [1, 2).
    assert abs(nearest[0] - 1) < 1e-9, "Sort not handled properly"
    assert nearest[1] == 1, "Index finding not handled properly."
"""
print()
In [20]:
# Run the classify test from test_knn (presumably the source shown in the next cell).
tk.test_classify()
In [21]:
"""
def test_classify():
    # Check classify for k = 1, 2 and 3 on three rows that each carry a
    # different label. Every label then occurs once, so for k > 1 the vote is
    # tied and the label of the closest row ('B', at distance 1) is expected.
    import knn
    import pandas as pd
    rows = [[0,3,'C'], [0,1,'B'], [0,2,'A']]
    df = pd.DataFrame(rows, columns = [1,2,'Type'])
    vec = [0,0]
    for neighbours in (1, 2, 3):
        assert knn.classify(neighbours,df,vec) == 'B', "Group finding not handled properly."
"""
print()
In [22]:
# Run the knn test from test_knn (presumably the source shown in the next cell).
tk.test_knn()
In [23]:
"""
def test_knn():
    # Check that knn labels every input row with its nearest training row
    # (k = 1) and appends exactly one column to the two-column input frame.
    import knn
    import pandas as pd
    df1 = pd.DataFrame([[0,3,'C'], [0,1,'B'], [0,2,'A']],columns = [1,2,'Type'])
    df2 = pd.DataFrame([[0,3.1], [0,1.1], [0,2.1]])
    # Classify once and reuse the result for both checks.
    result = knn.knn(df1, df2, 1)
    assert list(result['Knn Type']) == ['C', 'B', 'A'], "Knn not handled properly."
    assert result.shape[1] == 3, "Column appending not handled properly."
"""
print()
In [24]:
# Run the knnac test from test_knn (presumably the source shown in the next cell).
tk.test_knnac()
In [25]:
"""
def test_knnac():
    # Check that knnac returns a dictionary with one entry per requested k and
    # reports 100.0% for k = 1, where each input row sits next to the training
    # row that carries the same label.
    import knn
    import pandas as pd
    df1 = pd.DataFrame([[0,3,'C'], [0,1,'B'], [0,2,'A']],columns = [1,2,'Type'])
    df2 = pd.DataFrame([[0,3.1,'C'], [0,1.1,'B'], [0,2.1,'A']],columns = [1,2,'Type'])
    # Evaluate once and reuse the result for all three checks.
    accuracy = knn.knnac(df1, df2, [1,2,3])
    assert len(accuracy) == 3, "Maybe miss a calculation of a k value."
    assert accuracy[1] == '100.0%', "Accuracy not handled properly."
    assert isinstance(accuracy, dict), "Dictionary not handled properly"
"""
print()
In [ ]: