In [1]:
# -*- coding: UTF-8 -*-
#%load_ext autoreload
%reload_ext autoreload
%autoreload 2
In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import sys
import math
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import re
import os
import csv
from helpers.outliers import MyOutliers
from skroutz_mobile import SkroutzMobile
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score
from sklearn.model_selection import StratifiedShuffleSplit
from helpers.my_train_test_split import MySplitTrainTest
from sklearn.preprocessing import StandardScaler
from preprocess_price_history import PreprocessPriceHistory
from price_history import PriceHistory
from dfa import dfa
import scipy.signal as ss
import nolds
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import random
from sklearn.metrics import silhouette_score
from os.path import isfile
In [3]:
random_state = np.random.RandomState(seed=16011984)
%matplotlib inline
In [4]:
csv_in = "../price_history_03_seq_start_suddens_trimmed.csv"
In [5]:
df_fixed_width = PreprocessPriceHistory.keepOnlyPriceHistoriesWithMostFrequentLength(csv_in=csv_in)
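Roughly, the helper keeps only the price histories whose length matches the modal length in the dataset. A hypothetical sketch of that idea (the real implementation lives in preprocess_price_history; the function below is illustrative only):
In [ ]:
# hypothetical sketch, not the actual helper: keep only the rows whose
# non-null length equals the most frequent (modal) length in the dataframe
def keep_most_frequent_length(df):
    lengths = df.notnull().sum(axis=1)
    return df[lengths == lengths.value_counts().idxmax()]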
In [18]:
csv_path = "../price_history_03a_fixed_width.csv"
#df_fixed_width.to_csv(csv_path, encoding='utf-8', quoting=csv.QUOTE_ALL)
In [19]:
km = KMeans(init='random',  # alternative: init='k-means++'
            n_clusters=3, n_init=10)
km
Out[19]:
In [20]:
km.fit(df_fixed_width.values)
Out[20]:
In [21]:
km.inertia_
Out[21]:
In [22]:
km.labels_
Out[22]:
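As a quick sanity check (not part of the original run), the cluster sizes can be read straight off the labels:
In [ ]:
# count how many series fell into each of the 3 clusters
np.bincount(km.labels_)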
In [23]:
mms = MinMaxScaler()
In [24]:
seqs_fixed_width = df_fixed_width.values
# transpose so the scaler normalizes each series (row) to [0, 1] independently
seqs_norm = mms.fit_transform(seqs_fixed_width.T).T
seqs_norm.shape
Out[24]:
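MinMaxScaler scales each column independently, so the double transpose above makes it scale each series (row) to [0, 1] on its own. A tiny synthetic check of that trick (not in the original run):
In [ ]:
demo = np.array([[1., 2., 3.],
                 [10., 30., 20.]])
MinMaxScaler().fit_transform(demo.T).T  # each row now spans [0, 1]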
In [25]:
# k-means with DTW, as implemented in k_means_clust below:
# 1. pick random (or pseudorandom) initial centroids
# 2. assign every series to its nearest centroid by DTW distance
# 3. recompute each centroid from the series assigned to it
# 4. repeat until the centroids stop moving (within a tolerance)
In [26]:
#kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
In [27]:
# centroids = random.sample(seqs, 3)
# len(centroids), len(centroids[0]), len(centroids[1]), len(centroids[2])
In [28]:
# aa , _ = fastdtw(seqs[0], seqs[2], dist=euclidean)
# aa
In [29]:
def k_means_clust(data, n_clusters, tol=0.1, n_max_iter=100):  # tol=0.0001
    centroids = random.sample(data, n_clusters)
    prev_centroids = centroids[:]
    best_max_diff = float('inf')
    data_len = len(data)
    labels = np.empty(data_len)
    counter = 0
    while counter < n_max_iter:
        counter += 1
        print counter
        # we only care about the final inertia, but it costs almost nothing
        # to accumulate it on every iteration
        cur_inertia = 0
        assignments = {}  # dictionary: cluster index -> list of data point indices
        for data_ind, xx in enumerate(data):
            min_dist = float('inf')
            closest_clust_ind = None
            for c_ind, cc in enumerate(centroids):
                cur_dist, _ = fastdtw(xx, cc, dist=euclidean)
                if cur_dist < min_dist:  # the current centroid is closer
                    closest_clust_ind = c_ind
                    min_dist = cur_dist
            labels[data_ind] = closest_clust_ind
            cur_inertia += min_dist
            if closest_clust_ind in assignments:
                assignments[closest_clust_ind].append(data_ind)
            else:
                assignments[closest_clust_ind] = [data_ind]
        # recalculate the cluster centroids; we do this in a euclidean way
        # (there is no reverse DTW), i.e. a plain pointwise mean
        for key in assignments:
            cur_assignment_list = assignments[key]
            clust_sum = np.zeros(len(data[0]))
            for kk in cur_assignment_list:
                clust_sum += data[kk]
            centroids[key] = clust_sum / len(cur_assignment_list)
        diffs = [fastdtw(aa, bb, dist=euclidean)[0]
                 for aa, bb in zip(prev_centroids, centroids)]
        max_diff = max(diffs)
        if max_diff < best_max_diff:
            best_centroids = centroids[:]
            best_inertia = cur_inertia
            best_labels = labels.copy()  # copy: labels is mutated on later iterations
            best_max_diff = max_diff
        print "max diff {}".format(max_diff)
        if max_diff < tol:
            break
        else:
            prev_centroids = centroids[:]
    return best_centroids, best_inertia, best_labels
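A minimal smoke test on synthetic series (not in the original run): two flat and two trending series should separate into two clusters.
In [ ]:
toy = [np.zeros(5), np.zeros(5) + 0.1, np.arange(5.), np.arange(5.) + 0.1]
cents, toy_inertia, toy_labels = k_means_clust(toy, n_clusters=2, n_max_iter=10)
print toy_labels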
In [30]:
def time_series_kmeans_clust(data, n_clusters, tol=0.1, n_max_iter=100, n_inits=10):
    best_centroids = None
    best_inertia = float('inf')  # the smaller the better
    best_labels = None
    for ii in xrange(n_inits):
        print "N INIT {}".format(ii)
        centroids, inertia, labels = k_means_clust(
            data=data, n_clusters=n_clusters, tol=tol, n_max_iter=n_max_iter)
        if inertia < best_inertia:
            best_centroids = centroids[:]
            best_inertia = inertia
            best_labels = labels
        print
    return best_centroids, best_inertia, best_labels
In [31]:
dist_matrix = np.load('dist_matrix_fastdtw_94_210_fixed_len.npy')
dist_matrix.shape
Out[31]:
In [32]:
def calc_n_clusters(n_clusters):
    filename = 'ts_kmeans_clust_k_{}.npz'.format(n_clusters)
    if isfile(filename):
        obj = np.load(filename)
        centroids = obj['centroids']
        inertia = obj['inertia']
        labels = obj['labels']
    else:
        centroids, inertia, labels = time_series_kmeans_clust(
            data=seqs_norm, n_clusters=n_clusters, n_max_iter=100, n_inits=5)
        np.savez(filename,
                 centroids=centroids, inertia=inertia, labels=labels)
    silhouette = silhouette_score(dist_matrix, labels=labels, metric="precomputed")
    fig = plt.figure(figsize=(15, 6))
    for cc in centroids:
        plt.plot(cc)
    plt.show()
    return silhouette, inertia
In [35]:
silhouette_scores = dict()
In [33]:
# %%time
# dist_matrix = np.empty(shape=(data_len, data_len))
# for ii in xrange(data_len):
#     for jj in xrange(data_len):
#         dist_matrix[ii, jj] = fastdtw(data[ii], data[jj], dist=euclidean)[0]
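Since the DTW distance is symmetric, the matrix could also be built by computing only the upper triangle and mirroring it, roughly halving the work. A sketch assuming seqs_norm is the input set of series (the commented block above refers to an undefined data); it is still expensive to run:
In [ ]:
# sketch: exploit symmetry of the DTW distance to fill only half the matrix
n = len(seqs_norm)
dm = np.zeros((n, n))
for ii in xrange(n):
    for jj in xrange(ii + 1, n):
        dm[ii, jj] = dm[jj, ii] = fastdtw(seqs_norm[ii], seqs_norm[jj],
                                          dist=euclidean)[0]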
In [35]:
n_clusters = 1
In [36]:
n_clusters += 1
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette
In [37]:
n_clusters += 1
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette
In [38]:
n_clusters = 4
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette
In [39]:
n_clusters = 5
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette
(Noted from earlier runs: the silhouette for k=6 was 0.23 and for k=7 it was 0.24.)
In [36]:
n_clusters = 6
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette
In [37]:
n_clusters = 7
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette
In [61]:
n_clusters = 8
print n_clusters
silhouette, inertia = calc_n_clusters(n_clusters = n_clusters)
print silhouette, inertia
silhouette_scores[n_clusters] = silhouette
Project URL: http://sites.google.com/site/sdm13realistic/
Paper URL: http://www.cs.ucr.edu/~eamonn/SDM_RealisticTSClassifcation_cameraReady.pdf
To choose the number of clusters, use the silhouette score:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
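Once the candidate values of k have all been scored, the natural choice is the k with the highest silhouette. A small helper, assuming the silhouette_scores dict filled in above:
In [ ]:
best_k = max(silhouette_scores, key=silhouette_scores.get)
print "best k: {} (silhouette {})".format(best_k, silhouette_scores[best_k])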
In [ ]: