KNN & DTW


In [1]:
# -*- coding: utf-8 -*-
class Dtw(object):
    
    def __init__(self, seq1, seq2,
                 patterns = [(-1,-1), (-1,0), (0,-1)], 
                 weights = [{(0,0):2}, {(0,0):1}, {(0,0):1}], 
                 band_r=0.015): #EDIT HERE
        self._seq1 = seq1
        self._seq2 = seq2
        self.len_seq1 = len(seq1)
        self.len_seq2 = len(seq2)
        self.len_pattern = len(patterns)
        self.sum_w = [sum(ws.values()) for ws in weights]
        self._r = int(len(seq1)*band_r)
        assert len(patterns) == len(weights)
        self._patterns = patterns
        self._weights = weights
    
    def get_distance(self, i1, i2):
        return abs(self._seq1[i1] - self._seq2[i2])

    def calculate(self):
        g = list([float('inf')]*self.len_seq2 for i in range(self.len_seq1))
        cost = list([0]*self.len_seq2 for i in range(self.len_seq1))

        g[0][0] = 2*self.get_distance(0, 0)
        for i in range(self.len_seq1):
            for j in range(max(0,i-self._r), min(i+self._r+1, self.len_seq2)):
                for pat_i in range(self.len_pattern):
                    coor = (i+self._patterns[pat_i][0], j+self._patterns[pat_i][1])
                    if coor[0]<0 or coor[1]<0:
                        continue
                    dist = 0
                    for w_coor_offset, d_w in self._weights[pat_i].items():
                        w_coor = (i+w_coor_offset[0], j+w_coor_offset[1])
                        dist += d_w*self.get_distance(w_coor[0], w_coor[1])
                    this_val = g[coor[0]][coor[1]] + dist
                    this_cost = cost[coor[0]][coor[1]] + self.sum_w[pat_i]
                    if this_val < g[i][j]:
                        g[i][j] = this_val
                        cost[i][j] = this_cost
        return g[self.len_seq1-1][self.len_seq2-1]/cost[self.len_seq1-1][self.len_seq2-1], g, cost
    
    def print_table(self, tb):
        print('      '+' '.join(["{:^7d}".format(i) for i in range(self.len_seq2)]))
        for i in range(self.len_seq1):
            str = "{:^4d}: ".format(i)
            for j in range(self.len_seq2):
                str += "{:^7.3f} ".format(tb[i][j])
            print (str)

    def print_g_matrix(self):
        _, tb, _ = self.calculate()
        self.print_table(tb)

    def print_cost_matrix(self):
        _, _, tb = self.calculate()
        self.print_table(tb)
        
    def get_dtw(self):
        ans, _, _ = self.calculate()
        return ans

In [2]:
import csv
import random
import math
import operator
import numpy as np

def loadDataset(filename, data=[]):
    with open(filename, 'rb') as csvfile:
        lines = csv.reader(csvfile,delimiter=' ')
        dataset = list(lines)
        for x in range(len(dataset)):
            dataset[x] = filter(None, dataset[x])
            dataset[x] = list(map(float, dataset[x]))
            data.append(dataset[x])

def euclideanDistance(instance1, instance2, length):
	distance = 0
	for x in range(length):
		if x == 0:
			continue
		distance += pow((instance1[x] - instance2[x]), 2)
	return math.sqrt(distance)
 
def getNeighbors(trainingSet, testInstance, k, pattern, weight):
	distances = []
	length = len(testInstance)
	for x in range(len(trainingSet)):
#  z-normalization
		d = Dtw(testInstance[1:], trainingSet[x][1:], pattern, weight)
		dist = d.get_dtw()
# 		dist = euclideanDistance(testInstance, trainingSet[x], length)
		distances.append((trainingSet[x], dist))
	distances.sort(key=operator.itemgetter(1))
#  	print "dist >>>> ",distances
	neighbors = []
	for x in range(k):
		neighbors.append(distances[x][0])
	return neighbors

def getResponse(neighbors):
	classVotes = {}
	for x in range(len(neighbors)):
		response = neighbors[x][0]
		if response in classVotes:
			classVotes[response] += 1
		else:
			classVotes[response] = 1
	sortedVotes = sorted(classVotes.iteritems(), key=operator.itemgetter(1), reverse=True)
	return sortedVotes[0][0]
 
def getAccuracy(testSet, predictions):
	correct = 0
	for x in range(len(testSet)):
		if testSet[x][0] == predictions[x]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0
	
def knn(trainingSet, testSet, k, pattern, weight):
	# generate predictions
	predictions=[]
	for x in range(len(testSet)):
# 		print ">>",testSet[x]
		neighbors = getNeighbors(trainingSet, testSet[x], k, pattern, weight)
# 		print "neighbors >>", neighbors
		result = getResponse(neighbors)
# 		print "result >>", result
		predictions.append(result)
# 		print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][0]))
	accuracy = getAccuracy(testSet, predictions)
	return accuracy

def prepareData(train_data, test_data):
	# prepare data
	rawTrainingSet=[]
	rawTestSet=[]
	testSet=[]
	trainingSet=[]
	loadDataset(train_data, rawTrainingSet)
	loadDataset(test_data, rawTestSet)
	for x in rawTrainingSet:
		newTS = np.append(x[0], ( np.array(x[1:])-np.mean(x[1:]) )/np.std(x[1:]) )
		trainingSet.append(newTS)
	for x in rawTestSet:
		newTS = np.append(x[0], ( np.array(x[1:])-np.mean(x[1:]) )/np.std(x[1:]) )
		testSet.append(newTS)
# 	print 'Train set: ' + repr(len(trainingSet))
# 	print trainingSet
# 	print 'Test set: ' + repr(len(testSet))
# 	print testSet
	return trainingSet, testSet

Main


In [3]:
PATTERNS_1 = [(0,-1), (-1,-1), (-1,0)]
WEIGHTS_SYM_1 = [{(0,0):1}, {(0,0):2}, {(0,0):1}]

In [4]:
COUNT = 10
weights = []
for i in range(COUNT+1):
    for j in range(COUNT-i+1):
        k = COUNT - j - i
        weights.append([{(0,0):i}, {(0,0):j}, {(0,0):k}])

In [5]:
# EDIT HERE
TRAIN_DATA = 'dataset/synthetic_control_TRAIN'
TEST_DATA = 'dataset/synthetic_control_TEST'
OUTPUT_FILE = 'acc_synthetic_control_0.01band_no-dup-normalize.csv'

In [6]:
trainingSet, testSet = prepareData(TRAIN_DATA, TEST_DATA)

In [7]:
knn(trainingSet, testSet, 1, PATTERNS_1, WEIGHTS_SYM_1)


Out[7]:
88.0

In [8]:
with open(OUTPUT_FILE, "w") as myfile:
    myfile.write("i,j,k,accuracy\n")
for weight in weights:
    i = weight[0][(0,0)]
    j = weight[1][(0,0)]
    k = weight[2][(0,0)]
    print "i:", i, "j:", j,"k:", k
    acc = knn(trainingSet, testSet, 1, PATTERNS_1, weight)
    print acc
    with open(OUTPUT_FILE, "a") as myfile:
        myfile.write(str(i)+","+str(j)+","+str(k)+","+str(acc)+"\n")


i: 0 j: 0 k: 10
/opt/conda/envs/py27/lib/python2.7/site-packages/ipykernel/__main__.py:42: RuntimeWarning: divide by zero encountered in double_scalars
16.6666666667
i: 0 j: 1 k: 9
89.0
i: 0 j: 2 k: 8
88.0
i: 0 j: 3 k: 7
87.6666666667
i: 0 j: 4 k: 6
87.6666666667
i: 0 j: 5 k: 5
87.6666666667
i: 0 j: 6 k: 4
87.6666666667
i: 0 j: 7 k: 3
88.0
i: 0 j: 8 k: 2
88.0
i: 0 j: 9 k: 1
88.0
i: 0 j: 10 k: 0
88.0
i: 1 j: 0 k: 9
16.6666666667
i: 1 j: 1 k: 8
89.0
i: 1 j: 2 k: 7
88.0
i: 1 j: 3 k: 6
87.6666666667
i: 1 j: 4 k: 5
87.6666666667
i: 1 j: 5 k: 4
87.6666666667
i: 1 j: 6 k: 3
87.6666666667
i: 1 j: 7 k: 2
88.0
i: 1 j: 8 k: 1
88.0
i: 1 j: 9 k: 0
88.0
i: 2 j: 0 k: 8
16.6666666667
i: 2 j: 1 k: 7
89.0
i: 2 j: 2 k: 6
88.0
i: 2 j: 3 k: 5
87.6666666667
i: 2 j: 4 k: 4
87.6666666667
i: 2 j: 5 k: 3
87.6666666667
i: 2 j: 6 k: 2
87.6666666667
i: 2 j: 7 k: 1
88.0
i: 2 j: 8 k: 0
88.0
i: 3 j: 0 k: 7
16.6666666667
i: 3 j: 1 k: 6
89.0
i: 3 j: 2 k: 5
88.0
i: 3 j: 3 k: 4
87.6666666667
i: 3 j: 4 k: 3
87.6666666667
i: 3 j: 5 k: 2
87.6666666667
i: 3 j: 6 k: 1
87.6666666667
i: 3 j: 7 k: 0
88.0
i: 4 j: 0 k: 6
16.6666666667
i: 4 j: 1 k: 5
89.0
i: 4 j: 2 k: 4
88.0
i: 4 j: 3 k: 3
87.6666666667
i: 4 j: 4 k: 2
87.6666666667
i: 4 j: 5 k: 1
87.6666666667
i: 4 j: 6 k: 0
87.6666666667
i: 5 j: 0 k: 5
16.6666666667
i: 5 j: 1 k: 4
89.0
i: 5 j: 2 k: 3
88.0
i: 5 j: 3 k: 2
87.6666666667
i: 5 j: 4 k: 1
87.6666666667
i: 5 j: 5 k: 0
87.6666666667
i: 6 j: 0 k: 4
16.6666666667
i: 6 j: 1 k: 3
89.0
i: 6 j: 2 k: 2
88.0
i: 6 j: 3 k: 1
87.6666666667
i: 6 j: 4 k: 0
87.6666666667
i: 7 j: 0 k: 3
16.6666666667
i: 7 j: 1 k: 2
89.0
i: 7 j: 2 k: 1
88.0
i: 7 j: 3 k: 0
87.6666666667
i: 8 j: 0 k: 2
16.6666666667
i: 8 j: 1 k: 1
89.0
i: 8 j: 2 k: 0
88.0
i: 9 j: 0 k: 1
16.6666666667
i: 9 j: 1 k: 0
89.0
i: 10 j: 0 k: 0
16.6666666667

In [ ]: