In [1]:
# -*- coding: UTF-8 -*-
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2

In [2]:
from __future__ import division
import numpy as np
import pandas as pd
import sys
import math
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import re
import os
import csv
from helpers.outliers import MyOutliers
from skroutz_mobile import SkroutzMobile
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from skroutz_mobile import SkroutzMobile
from sklearn.model_selection import StratifiedShuffleSplit
from helpers.my_train_test_split import MySplitTrainTest
from sklearn.preprocessing import StandardScaler
from preprocess_price_history import PreprocessPriceHistory
from price_history import PriceHistory
from dfa import dfa
import scipy.signal as ss
import nolds
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import random
from sklearn.metrics import silhouette_score
from os.path import isfile
from preprocess_price_history import PreprocessPriceHistory

In [3]:
random_state = np.random.RandomState(seed=16011984)
%matplotlib inline

Some processing


In [4]:
csv_in = "../price_history_03_seq_start_suddens_trimmed.csv"

In [5]:
df_fixed_width = PreprocessPriceHistory.keepOnlyPriceHistoriesWithMostFrequentLength(csv_in=csv_in)

In [18]:
csv_path = "../price_history_03a_fixed_width.csv"
#df_fixed_width.to_csv(csv_path, encoding='utf-8', quoting=csv.QUOTE_ALL)

testing


In [19]:
km = KMeans(init='random', #init='k-means++',
            n_clusters=3, n_init=10)
km


Out[19]:
KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [20]:
km.fit(df_fixed_width.values)


Out[20]:
KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [21]:
km.inertia_


Out[21]:
76613685.760995418

In [22]:
km.labels_


Out[22]:
array([2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 2, 2, 0, 2, 0, 1, 0, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 0, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 2, 2, 0, 2,
       2, 2], dtype=int32)

Dynamic Time Wrapping

Dynamic time warping finds the optimal non-linear alignment between two time series.

K means clustering


In [23]:
mms = MinMaxScaler()

In [24]:
seqs_norm = mms.fit_transform(seqs_fixed_width.T).T
seqs_norm.shape


Out[24]:
(94, 210)

In [25]:
#set random centroids or pseudorandom
#calculate the distance of all points with centroid 1, then the same for centroid 2
#then find a new centroid according to the points you had considered
#then repeat

In [26]:
#kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)

In [27]:
# centroids = random.sample(seqs, 3)
# len(centroids), len(centroids[0]), len(centroids[1]), len(centroids[2])

In [28]:
# aa , _ = fastdtw(seqs[0], seqs[2], dist=euclidean)
# aa

In [29]:
def k_means_clust(data, n_clusters, tol=0.1, n_max_iter=100): #tol=0.0001
    centroids = random.sample(data, n_clusters)
    prev_centroids = centroids[:]
    best_max_diff = float('inf')
    
    data_len = len(data)
    
    labels = np.empty(data_len)
    
    counter = 0
    while counter < n_max_iter:
        counter += 1
        print counter
        
        #we only care about the last inertia but it is minimum cost to calc it on every iter
        cur_inertia = 0
        
        assignments = {}  #dictionary cluster -> data points
        
        for data_ind, xx in enumerate(data):
            min_dist = float('inf')
            closest_clust_ind = None
            
            for c_ind, cc in enumerate(centroids):
                cur_dist, _ = fastdtw(xx, cc, dist=euclidean)
                if cur_dist < min_dist: #this means that current centroid is better
                    closest_clust_ind = c_ind
                    min_dist = cur_dist
                    
            labels[data_ind] = closest_clust_ind
            
            cur_inertia += min_dist
            
            if closest_clust_ind in assignments:
                assignments[closest_clust_ind].append(data_ind)
            else:
                assignments[closest_clust_ind] = [data_ind]
        
        #recalculate centroids of clusters, here we do it in a euclidean way
        #(we do not have reverse dtw)
        for key in assignments:
            cur_assignment_list = assignments[key]
            clust_sum = np.zeros(len(data[0]))
            for kk in cur_assignment_list:
                #print data[kk]
                clust_sum += data[kk]
            
            centroids[key] = clust_sum / len(cur_assignment_list)
        
        diffs = [fastdtw(aa, bb, dist=euclidean)[0] 
                 for aa, bb in zip(prev_centroids, centroids)]
        max_diff = max(diffs)
        
        if max_diff < best_max_diff:
            best_centroids = centroids[:]
            best_inertia = cur_inertia
            best_labels = labels
            best_max_diff = max_diff
        
        print "max diff {}".format(max_diff)
        if max_diff < tol:
            break
        else:
            prev_centroids = centroids[:]
                
    return best_centroids, best_inertia, best_labels

In [30]:
def time_series_kmeans_clust(data, n_clusters, tol=0.1, n_max_iter=100, n_inits = 10):
    best_centroids = None
    best_inertia = float('inf') #the smaller the better
    best_labels = None
    
    for ii in xrange(n_inits):
        print "N INIT {}".format(ii)
        
        centroids, inertia, labels = k_means_clust(
            data=data, n_clusters=n_clusters, tol=tol, n_max_iter=n_max_iter)
        
        if inertia < best_inertia:
            best_centroids = centroids[:]
            best_inertia = inertia
            best_labels = labels
        
        print
            
    return best_centroids, best_inertia, best_labels

In [31]:
dist_matrix = np.load('dist_matrix_fastdtw_94_210_fixed_len.npy')
dist_matrix.shape


Out[31]:
(94, 94)

In [32]:
def calc_n_clusters(n_clusters):
    filename = 'ts_kmeans_clust_k_{}.npz'.format(n_clusters)
    if isfile(filename):
        obj = np.load(filename)
        centroids=obj['centroids']
        inertia=obj['inertia']
        labels=obj['labels']
    else:
        
        centroids, inertia, labels = time_series_kmeans_clust(data=seqs_norm, n_clusters=n_clusters,
                                     n_max_iter=100, n_inits=5)
        np.savez(filename,
                 centroids=centroids, inertia=inertia, labels=labels)
    
    silhouette = silhouette_score(dist_matrix, labels=labels, metric="precomputed")
    
    fig = plt.figure(figsize=(15,6))
    for cc in centroids:
        plt.plot(cc)
    plt.show()
    
    return silhouette, inertia

In [35]:
silhouette_scores = dict()

In [33]:
# %%time
# dist_matrix = np.empty(shape=(data_len, data_len))
# for ii in xrange(data_len):
#     for jj in xrange(data_len):
#         dist_matrix[ii, jj] = fastdtw(data[ii], data[jj], dist=euclidean)[0]

In [35]:
n_clusters = 1

In [36]:
n_clusters += 1
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette


2
0.364836794412 3184.86553549

In [37]:
n_clusters += 1
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette


3
0.346282905256 2611.90134145

In [38]:
n_clusters = 4
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette


4
0.308568252108 2368.95367704

In [39]:
n_clusters = 5
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette


5
0.246270968418 2187.96750237

6 htan 0.23 7 htan 0.24


In [36]:
n_clusters = 6
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette


6
0.241415247676 2068.37375528

In [37]:
n_clusters = 7
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette


7
N INIT 0
1
max diff 39.7544251555
2
max diff 14.8849196152
3
max diff 15.8145017419
4
max diff 9.40737248584
5
max diff 8.73189460286
6
max diff 8.69812375262
7
max diff 7.94747416785
8
max diff 7.86723522368
9
max diff 9.34371810442
10
max diff 5.70015739853
11
max diff 8.60584748171
12
max diff 16.1446580107
13
max diff 12.8540360761
14
max diff 13.4446685994
15
max diff 14.2741473472
16
max diff 9.94846235325
17
max diff 8.8435826412
18
max diff 16.6842913047
19
max diff 8.61354796186
20
max diff 11.0939144193
21
max diff 16.648116927
22
max diff 12.7611440656
23
max diff 8.85535837023
24
max diff 7.66548121539
25
max diff 6.31709459435
26
max diff 11.6651519656
27
max diff 8.46615478779
28
max diff 1.55418155277
29
max diff 1.55418155277
30
max diff 1.55418155277
31
max diff 1.55418155277
32
max diff 1.55418155277
33
max diff 1.55418155277
34
max diff 1.55418155277
35
max diff 1.55418155277
36
max diff 1.55418155277
37
max diff 1.55418155277
38
max diff 1.55418155277
39
max diff 1.55418155277
40
max diff 1.55418155277
41
max diff 1.55418155277
42
max diff 1.55418155277
43
max diff 1.55418155277
44
max diff 1.55418155277
45
max diff 1.55418155277
46
max diff 1.55418155277
47
max diff 1.55418155277
48
max diff 1.55418155277
49
max diff 1.55418155277
50
max diff 1.55418155277
51
max diff 1.55418155277
52
max diff 1.55418155277
53
max diff 1.55418155277
54
max diff 1.55418155277
55
max diff 1.55418155277
56
max diff 1.55418155277
57
max diff 1.55418155277
58
max diff 1.55418155277
59
max diff 1.55418155277
60
max diff 1.55418155277
61
max diff 1.55418155277
62
max diff 1.55418155277
63
max diff 1.55418155277
64
max diff 1.55418155277
65
max diff 1.55418155277
66
max diff 1.55418155277
67
max diff 1.55418155277
68
max diff 1.55418155277
69
max diff 1.55418155277
70
max diff 1.55418155277
71
max diff 1.55418155277
72
max diff 1.55418155277
73
max diff 1.55418155277
74
max diff 1.55418155277
75
max diff 1.55418155277
76
max diff 1.55418155277
77
max diff 1.55418155277
78
max diff 1.55418155277
79
max diff 1.55418155277
80
max diff 1.55418155277
81
max diff 1.55418155277
82
max diff 1.55418155277
83
max diff 1.55418155277
84
max diff 1.55418155277
85
max diff 1.55418155277
86
max diff 1.55418155277
87
max diff 1.55418155277
88
max diff 1.55418155277
89
max diff 1.55418155277
90
max diff 1.55418155277
91
max diff 1.55418155277
92
max diff 1.55418155277
93
max diff 1.55418155277
94
max diff 1.55418155277
95
max diff 1.55418155277
96
max diff 1.55418155277
97
max diff 1.55418155277
98
max diff 1.55418155277
99
max diff 1.55418155277
100
max diff 1.55418155277

N INIT 1
1
max diff 29.859031165
2
max diff 13.682433019
3
max diff 12.7404102098
4
max diff 15.3470872742
5
max diff 8.27618801371
6
max diff 7.00317773581
7
max diff 9.57569221915
8
max diff 11.4338990551
9
max diff 10.1596860597
10
max diff 9.90532093494
11
max diff 6.4507167355
12
max diff 9.88086907032
13
max diff 9.28129514626
14
max diff 10.6699915992
15
max diff 7.55253848172
16
max diff 8.38960742051
17
max diff 8.3949486437
18
max diff 8.04155052613
19
max diff 7.957416467
20
max diff 8.303205437
21
max diff 6.22577334496
22
max diff 11.8012043414
23
max diff 8.88609218439
24
max diff 0.0

N INIT 2
1
max diff 32.6627479645
2
max diff 42.7662100457
3
max diff 21.5890086391
4
max diff 19.7693366929
5
max diff 12.9267022391
6
max diff 11.7497869679
7
max diff 8.32264740326
8
max diff 14.2331142509
9
max diff 10.2902739557
10
max diff 10.1592885593
11
max diff 8.35157486603
12
max diff 8.32252204007
13
max diff 9.21245555752
14
max diff 4.08112867276
15
max diff 4.03366551829
16
max diff 4.03366551829
17
max diff 4.03366551829
18
max diff 4.03366551829
19
max diff 4.03366551829
20
max diff 4.03366551829
21
max diff 4.03366551829
22
max diff 4.03366551829
23
max diff 4.03366551829
24
max diff 4.03366551829
25
max diff 4.03366551829
26
max diff 4.03366551829
27
max diff 4.03366551829
28
max diff 4.03366551829
29
max diff 4.03366551829
30
max diff 4.03366551829
31
max diff 4.03366551829
32
max diff 4.03366551829
33
max diff 4.03366551829
34
max diff 4.03366551829
35
max diff 4.03366551829
36
max diff 4.03366551829
37
max diff 4.03366551829
38
max diff 4.03366551829
39
max diff 4.03366551829
40
max diff 4.03366551829
41
max diff 4.03366551829
42
max diff 4.03366551829
43
max diff 4.03366551829
44
max diff 4.03366551829
45
max diff 4.03366551829
46
max diff 4.03366551829
47
max diff 4.03366551829
48
max diff 4.03366551829
49
max diff 4.03366551829
50
max diff 4.03366551829
51
max diff 4.03366551829
52
max diff 4.03366551829
53
max diff 4.03366551829
54
max diff 4.03366551829
55
max diff 4.03366551829
56
max diff 4.03366551829
57
max diff 4.03366551829
58
max diff 4.03366551829
59
max diff 4.03366551829
60
max diff 4.03366551829
61
max diff 4.03366551829
62
max diff 4.03366551829
63
max diff 4.03366551829
64
max diff 4.03366551829
65
max diff 4.03366551829
66
max diff 4.03366551829
67
max diff 4.03366551829
68
max diff 4.03366551829
69
max diff 4.03366551829
70
max diff 4.03366551829
71
max diff 4.03366551829
72
max diff 4.03366551829
73
max diff 4.03366551829
74
max diff 4.03366551829
75
max diff 4.03366551829
76
max diff 4.03366551829
77
max diff 4.03366551829
78
max diff 4.03366551829
79
max diff 4.03366551829
80
max diff 4.03366551829
81
max diff 4.03366551829
82
max diff 4.03366551829
83
max diff 4.03366551829
84
max diff 4.03366551829
85
max diff 4.03366551829
86
max diff 4.03366551829
87
max diff 4.03366551829
88
max diff 4.03366551829
89
max diff 4.03366551829
90
max diff 4.03366551829
91
max diff 4.03366551829
92
max diff 4.03366551829
93
max diff 4.03366551829
94
max diff 4.03366551829
95
max diff 4.03366551829
96
max diff 4.03366551829
97
max diff 4.03366551829
98
max diff 4.03366551829
99
max diff 4.03366551829
100
max diff 4.03366551829

N INIT 3
1
max diff 37.1004770684
2
max diff 7.72919351477
3
max diff 12.7611440656
4
max diff 11.7122159086
5
max diff 9.56703019009
6
max diff 9.26741502735
7
max diff 6.31671589943
8
max diff 5.85123828589
9
max diff 8.43285783226
10
max diff 10.5958146325
11
max diff 8.68595252529
12
max diff 14.5634529526
13
max diff 9.04250951957
14
max diff 10.7115102132
15
max diff 12.4686503605
16
max diff 18.2131127829
17
max diff 7.88979601892
18
max diff 21.7312992462
19
max diff 17.4930557196
20
max diff 9.39193409748
21
max diff 30.0128180408
22
max diff 26.0530934832
23
max diff 11.1886911773
24
max diff 32.9103745998
25
max diff 26.0530934832
26
max diff 11.9902791019
27
max diff 20.7746855685
28
max diff 10.3456416721
29
max diff 9.47913399348
30
max diff 7.36907461323
31
max diff 9.80258348517
32
max diff 3.78167532324
33
max diff 3.7114672373
34
max diff 0.0

N INIT 4
1
max diff 50.1401694637
2
max diff 18.0513957143
3
max diff 10.760398253
4
max diff 13.7525920405
5
max diff 8.0872366443
6
max diff 8.12251263769
7
max diff 6.94552476541
8
max diff 8.62448599159
9
max diff 8.28693145633
10
max diff 10.329290614
11
max diff 7.73374962333
12
max diff 4.25152130058
13
max diff 1.96909844173
14
max diff 2.24272987549
15
max diff 2.62315706129
16
max diff 3.51398030411
17
max diff 5.87883053665
18
max diff 4.71446012324
19
max diff 5.33873615693
20
max diff 5.26375373394
21
max diff 8.26471483453
22
max diff 6.9412519494
23
max diff 9.61819010362
24
max diff 8.22857476342
25
max diff 9.74094808919
26
max diff 5.88151516449
27
max diff 6.8815550776
28
max diff 3.88778246159
29
max diff 4.92507157947
30
max diff 2.46694917356
31
max diff 7.41425282657
32
max diff 3.66447587253
33
max diff 5.03646019086
34
max diff 6.3058781459
35
max diff 8.20061429952
36
max diff 6.29539487022
37
max diff 3.66447587253
38
max diff 5.03646019086
39
max diff 6.3058781459
40
max diff 8.20061429952
41
max diff 6.29539487022
42
max diff 3.66447587253
43
max diff 5.03646019086
44
max diff 6.3058781459
45
max diff 8.20061429952
46
max diff 6.29539487022
47
max diff 3.66447587253
48
max diff 5.03646019086
49
max diff 6.3058781459
50
max diff 8.20061429952
51
max diff 6.29539487022
52
max diff 3.66447587253
53
max diff 5.03646019086
54
max diff 6.3058781459
55
max diff 8.20061429952
56
max diff 6.29539487022
57
max diff 3.66447587253
58
max diff 5.03646019086
59
max diff 6.3058781459
60
max diff 8.20061429952
61
max diff 6.29539487022
62
max diff 3.66447587253
63
max diff 5.03646019086
64
max diff 6.3058781459
65
max diff 8.20061429952
66
max diff 6.29539487022
67
max diff 3.66447587253
68
max diff 5.03646019086
69
max diff 6.3058781459
70
max diff 8.20061429952
71
max diff 6.29539487022
72
max diff 3.66447587253
73
max diff 5.03646019086
74
max diff 6.3058781459
75
max diff 8.20061429952
76
max diff 6.29539487022
77
max diff 3.66447587253
78
max diff 5.03646019086
79
max diff 6.3058781459
80
max diff 8.20061429952
81
max diff 6.29539487022
82
max diff 3.66447587253
83
max diff 5.03646019086
84
max diff 6.3058781459
85
max diff 8.20061429952
86
max diff 6.29539487022
87
max diff 3.66447587253
88
max diff 5.03646019086
89
max diff 6.3058781459
90
max diff 8.20061429952
91
max diff 6.29539487022
92
max diff 3.66447587253
93
max diff 5.03646019086
94
max diff 6.3058781459
95
max diff 8.20061429952
96
max diff 6.29539487022
97
max diff 3.66447587253
98
max diff 5.03646019086
99
max diff 6.3058781459
100
max diff 8.20061429952

0.212728285197 2020.00197179

In [61]:
n_clusters = 8
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette


8
N INIT 0
1
max diff 35.6139182393
2
max diff 16.0002590568
3
max diff 14.2745895608
4
max diff 28.3504393014
5
max diff 28.5580305454
6
max diff 10.3271901901
7
max diff 10.2395449604
8
max diff 11.9233483352
9
max diff 9.07686142278
10
max diff 15.0154415727
11
max diff 19.7988319649
12
max diff 17.6911590375
13
max diff 17.8323319811
14
max diff 15.866841604
15
max diff 11.4271508418
16
max diff 16.6280998109

N INIT 1
1
max diff 41.3699553891
2
max diff 12.4736618067
3
max diff 12.5275304409
4
max diff 15.754612946
5
max diff 13.8945347055
6
max diff 11.6626003594
7
max diff 8.53244223431
8
max diff 9.63515759419
9
max diff 10.177047995
10
max diff 12.2220599911
11
max diff 16.0262967557
12
max diff 11.1728777996
13
max diff 10.2648383794
14
max diff 9.52075471811
15
max diff 16.8005664957
16
max diff 32.9824000686

N INIT 2
1
max diff 29.7670002257
2
max diff 18.1857665093
3
max diff 8.77762969739
4
max diff 7.90572852814
5
max diff 11.6242326665
6
max diff 8.43486567179
7
max diff 10.5602529467
8
max diff 10.2582064353
9
max diff 7.78022448329
10
max diff 6.55751932004
11
max diff 11.1577163452
12
max diff 10.6513590645
13
max diff 14.9195349305
14
max diff 12.270578751
15
max diff 10.5926459857
16
max diff 16.6100951275

N INIT 3
1
max diff 33.7678824569
2
max diff 13.1392432853
3
max diff 14.6428436642
4
max diff 12.2486229149
5
max diff 13.8087964483
6
max diff 23.2921999372
7
max diff 19.8570002051
8
max diff 14.967797908
9
max diff 24.1126332741
10
max diff 13.9558914017
11
max diff 14.0936392907
12
max diff 15.4134814235
13
max diff 11.3938390092
14
max diff 20.0165564808
15
max diff 17.0617658584
16
max diff 4.33806808669

N INIT 4
1
max diff 37.7481703469
2
max diff 12.9681091609
3
max diff 10.6878152716
4
max diff 14.0492676265
5
max diff 13.0427166637
6
max diff 16.6904215475
7
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-61-cecaffaf8a8f> in <module>()
      1 n_clusters = 8
      2 print n_clusters
----> 3 silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
      4 print silhouette, inertia
      5 silhouette_scores[n_clusters] = silhouette

<ipython-input-51-046f14854c5e> in calc_n_clusters(n_clusters)
      9 
     10         centroids, inertia, labels = time_series_kmeans_clust(data=seqs_norm, n_clusters=n_clusters,
---> 11                                      n_max_iter=n_clusters*2, n_inits=5)
     12         np.savez(filename,
     13                  centroids=centroids, inertia=inertia, labels=labels)

<ipython-input-30-a2b53c523bf6> in time_series_kmeans_clust(data, n_clusters, tol, n_max_iter, n_inits)
      8 
      9         centroids, inertia, labels = k_means_clust(
---> 10             data=data, n_clusters=n_clusters, tol=tol, n_max_iter=n_max_iter)
     11 
     12         if inertia < best_inertia:

<ipython-input-29-8c23fb30236e> in k_means_clust(data, n_clusters, tol, n_max_iter)
     23 
     24             for c_ind, cc in enumerate(centroids):
---> 25                 cur_dist, _ = fastdtw(xx, cc, dist=euclidean)
     26                 if cur_dist < min_dist: #this means that current centroid is better
     27                     closest_clust_ind = c_ind

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/fastdtw/fastdtw.pyc in fastdtw(x, y, radius, dist)
     63         dist = __norm(p=dist)
     64 
---> 65     return __fastdtw(x, y, radius, dist)
     66 
     67 

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/fastdtw/fastdtw.pyc in __fastdtw(x, y, radius, dist)
     85         __fastdtw(x_shrinked, y_shrinked, radius=radius, dist=dist)
     86     window = __expand_window(path, len(x), len(y), radius)
---> 87     return dtw(x, y, window, dist=dist)
     88 
     89 

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/fastdtw/fastdtw.pyc in dtw(x, y, window, dist)
    124     D[0, 0] = (0, 0, 0)
    125     for i, j in window:
--> 126         dt = dist(x[i-1], y[j-1])
    127         D[i, j] = min((D[i-1, j][0]+dt, i-1, j), (D[i, j-1][0]+dt, i, j-1),
    128                       (D[i-1, j-1][0]+dt, i-1, j-1), key=lambda a: a[0])

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/scipy/spatial/distance.pyc in euclidean(u, v)
    429 
    430     """
--> 431     u = _validate_vector(u)
    432     v = _validate_vector(v)
    433     dist = norm(u - v)

/home/studenthp/anaconda2/envs/dis/lib/python2.7/site-packages/scipy/spatial/distance.pyc in _validate_vector(u, dtype)
    227 def _validate_vector(u, dtype=None):
    228     # XXX Is order='c' really necessary?
--> 229     u = np.asarray(u, dtype=dtype, order='c').squeeze()
    230     # Ensure values such as u=1 and u=[1] still return 1-D arrays.
    231     u = np.atleast_1d(u)

KeyboardInterrupt: 

Time Series Classification under More Realistic Assumptions

Project URL: http://sites.google.com/site/sdm13realistic/
Paper URL: http://www.cs.ucr.edu/~eamonn/SDM_RealisticTSClassifcation_cameraReady.pdf

How to choose the best k in k-means clustering?

Use the Silhouette score:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

Plotting Silhouette Scores


In [4]:
xx = [2, 3, 4, 5, 6, 7]
yy = [0.364836794412,
      0.346282905256,
     0.308568252108,
     0.246270968418,
     0.241415247676,
     0.212728285197]

In [5]:
plt.figure(figsize=(15, 6))
plt.plot(xx, yy)
plt.xlabel('Number of Clusters - K factor in K-means clustering')
plt.ylabel('Silhouette Score')
plt.show()



In [ ]: