Working with a static dataset: load the previously grabbed spot-price data.
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline
In [ ]:
# parse the data file and extract the results
filename = "subset.csv"
df = pd.read_csv(filename)
df.head(20)
In [ ]:
#df.columns = ["info", "SpotPrice", "TimeStamp", "InstanceType", "OS type", "AvailabilityZone"]
df['TimeStamp'] = pd.to_datetime(df.TimeStamp)
df.index = df.TimeStamp
#df = df.drop('info', 1).drop(['OS type'],axis=1)
df = df.drop(['TimeStamp'],axis=1).sort_index()
df.head(15)
#print (df['InstanceType'].unique())
#print (df['AvailabilityZone'].unique())
Hypothesis: for each machine type there exists a region that is more favorable to use, where market volatility is very low and prices tend to stay cheaper than in the other regions.
Proving this hypothesis would let users find the best region to bid in, as long as latency is not an issue for them.
Data science tools & techniques: we can use clustering and classification methods.
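As a first rough check of this hypothesis, per-zone summary statistics can be compared for a single instance type. The cell below is only an illustrative sketch: it assumes the df loaded above (with SpotPrice, InstanceType and AvailabilityZone columns) and the c3.large type that is used later in this notebook.
In [ ]:
# Rough hypothesis check (illustrative sketch): for one instance type,
# rank availability zones by mean spot price and by volatility (standard deviation).
c3l = df[df.InstanceType == "c3.large"]
zone_stats = c3l.groupby("AvailabilityZone")["SpotPrice"].agg(["mean", "std"])
print(zone_stats.sort_values("mean"))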
In [ ]:
def corrGraph(title, df):
    # Plot the lower triangle of the correlation matrix as a heatmap
    corr_df = df.corr()
    mask = np.zeros_like(corr_df)
    mask[np.triu_indices_from(mask)] = True
    seaborn.heatmap(corr_df, cmap='RdYlGn_r', vmax=1.0, vmin=-1.0, mask=mask, linewidths=2.5)
    plt.yticks(rotation=0)
    plt.title(title)
    plt.xticks(rotation=90)
    plt.show()
In [ ]:
# Some info about the data
print (df.index.min())
print (df.index.max())
print(df.index.max()- df.index.min())
df = df.truncate(before='2016-10-11', after='2016-12-12')
df.head(3)
In [ ]:
def awsResampler(df):
    # Resample the data by the hour for every (AvailabilityZone, InstanceType) group
    dfSorted = df.groupby(['AvailabilityZone', 'InstanceType'])
    dfSorted = dfSorted.resample('H').mean()
    dfSorted = dfSorted.fillna(method="ffill")
    # We have to round-trip through a CSV to clear an issue caused by the grouping
    # TODO: investigate how to do this better for a speed increase
    dfSorted.to_csv("im.csv")
    depa = pd.read_csv("im.csv")
    depa = depa.groupby(['AvailabilityZone', 'InstanceType'])
    return depa
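The CSV round-trip above flattens the grouped, resampled index back into ordinary columns. A possible in-memory alternative (an untested sketch, assuming the same df layout; awsResamplerNoCSV is a hypothetical helper, not part of the original notebook) is to reset the index instead of writing to disk:
In [ ]:
# Sketch of an in-memory alternative to the CSV round-trip (hypothetical, untested):
# reset_index() flattens the (AvailabilityZone, InstanceType, TimeStamp) index into columns,
# which is what writing and re-reading im.csv achieves.
def awsResamplerNoCSV(df):
    hourly = df.groupby(['AvailabilityZone', 'InstanceType']).resample('H').mean()
    hourly = hourly.fillna(method="ffill")
    flat = hourly.reset_index()
    return flat.groupby(['AvailabilityZone', 'InstanceType'])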
In [ ]:
"""
depa = awsResampler(df)
# Initialize dictionary of all combos of dfs we want to graph and corr
zonedfs={}
typedfs={}
for item in df['InstanceType'].unique():
typedfs.update({item: pd.DataFrame()})
for item in df['AvailabilityZone'].unique():
zonedfs.update({item: pd.DataFrame()})
#Fill zonedfs with dataframes of all machines in that zone pricing
for name, group in depa:
if zonedfs[name[0]].empty:
zonedfs[name[0]] = group
zonedfs[name[0]] = zonedfs[name[0]].drop('InstanceType', axis=1).drop(['AvailabilityZone'],axis=1)
zonedfs[name[0]].rename(columns = {'SpotPrice':name[1]}, inplace = True)
else:
group1 = group.drop('InstanceType', axis=1).drop(['AvailabilityZone'],axis=1)
group1.rename(columns = {'SpotPrice':name[1]}, inplace = True)
zonedfs[name[0]] = zonedfs[name[0]].merge(group1,how='right')
#Fill typedfs with dataframes of all machines in that zone pricing
for name, group in depa:
if typedfs[name[1]].empty:
typedfs[name[1]] = group
typedfs[name[1]] = typedfs[name[1]].drop('InstanceType', axis=1).drop(['AvailabilityZone'],axis=1)
typedfs[name[1]].rename(columns = {'SpotPrice':name[0]}, inplace = True)
else:
group1 = group.drop('InstanceType', axis=1).drop(['AvailabilityZone'],axis=1)
group1.rename(columns = {'SpotPrice':name[0]}, inplace = True)
typedfs[name[1]] = typedfs[name[1]].merge(group1,how='right')
"""
In [ ]:
# generate ts
df_us_west_one_a = df[df.AvailabilityZone == "us-west-1a"]
df_us_west_one_b = df[df.AvailabilityZone == "us-west-1b"]
df_us_east_one_a = df[df.AvailabilityZone == "us-east-1a"]
df_us_east_one_b = df[df.AvailabilityZone == "us-east-1b"]
df_us_east_one_c = df[df.AvailabilityZone == "us-east-1c"]
df_us_east_one_d = df[df.AvailabilityZone == "us-east-1d"]
df_ap_southeast_one_a = df[df.AvailabilityZone == "ap-southeast-1a"]
df_ap_southeast_one_b = df[df.AvailabilityZone == "ap-southeast-1b"]
df_ap_southeast_two_a = df[df.AvailabilityZone == "ap-southeast-2a"]
df_ap_southeast_two_b = df[df.AvailabilityZone == "ap-southeast-2b"]
#train = np.genfromtxt('datasets/train.csv', delimiter='\t')
#test = np.genfromtxt('datasets/test.csv', delimiter='\t')
#print(type(train))
def get_ts_data(inst_type):
    # Collect the SpotPrice series for one instance type in each availability zone
    type_dict = {}
    i = 0
    for dff in (df_us_west_one_a, df_us_west_one_b, df_ap_southeast_two_a, df_ap_southeast_two_b,
                df_us_east_one_a, df_us_east_one_b, df_us_east_one_c, df_us_east_one_d,
                df_ap_southeast_one_a, df_ap_southeast_one_b):
        df2 = dff[dff.InstanceType == inst_type]
        type_dict[i] = df2["SpotPrice"]
        i = i + 1
    return type_dict
c3 = get_ts_data("c3.large")
c3_x = get_ts_data("c3.xlarge")
c3_2x = get_ts_data("c3.2xlarge")
c3_4x = get_ts_data("c3.4xlarge")
c3_8x = get_ts_data("c3.8xlarge")
print(type(c3))
print(len(c3[0]))
ts1 = c3[0]
ts2 = c3[1]
ts3 = c3_8x[2]
ts1.plot()
ts2.plot()
ts3.plot()
plt.ylim(-2,10)
plt.legend(['ts1','ts2','ts3'])
plt.show()
In [ ]:
variations = []
for d in c3, c3_x, c3_2x, c3_4x, c3_8x:
    for i in range(10):
        # truncate every series to a common length so they can be stacked into one array
        variations.append(d[i][:12000])
print(len(variations[0]))
prices = np.array(variations)
# note: tofile writes a flat binary dump with no shape or dtype information
prices.tofile("prices.csv")
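Since ndarray.tofile discards the shape, the arrays read back with np.fromfile further down have to be reshaped before use. As a small aside (not used in the rest of the notebook), np.save/np.load would preserve shape and dtype:
In [ ]:
# Aside (not used below): np.save keeps the (series x timesteps) shape, unlike tofile/fromfile.
np.save("prices.npy", prices)
prices_back = np.load("prices.npy")
print(prices_back.shape)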
In [ ]:
# DTW
import time
from math import sqrt

def DTWDistance(s1, s2):
    # Full dynamic-time-warping distance between two sequences
    DTW = {}
    for i in range(len(s1)):
        DTW[(i, -1)] = float('inf')
    for i in range(len(s2)):
        DTW[(-1, i)] = float('inf')
    DTW[(-1, -1)] = 0
    for i in range(len(s1)):
        for j in range(len(s2)):
            dist = (s1[i] - s2[j])**2
            DTW[(i, j)] = dist + min(DTW[(i-1, j)], DTW[(i, j-1)], DTW[(i-1, j-1)])
    return sqrt(DTW[len(s1)-1, len(s2)-1])

def DTWDistance(s1, s2, w):
    # Windowed DTW (warping limited to a band of width w); this definition shadows the one above
    DTW = {}
    w = max(w, abs(len(s1) - len(s2)))
    for i in range(-1, len(s1)):
        for j in range(-1, len(s2)):
            DTW[(i, j)] = float('inf')
    DTW[(-1, -1)] = 0
    for i in range(len(s1)):
        for j in range(max(0, i-w), min(len(s2), i+w)):
            dist = (s1[i] - s2[j])**2
            DTW[(i, j)] = dist + min(DTW[(i-1, j)], DTW[(i, j-1)], DTW[(i-1, j-1)])
    return sqrt(DTW[len(s1)-1, len(s2)-1])

def LB_Keogh(s1, s2, r):
    # Keogh lower bound on the DTW distance, used to skip expensive DTW computations
    LB_sum = 0
    for ind, i in enumerate(s1):
        lower_bound = min(s2[(ind-r if ind-r >= 0 else 0):(ind+r)])
        upper_bound = max(s2[(ind-r if ind-r >= 0 else 0):(ind+r)])
        if i > upper_bound:
            LB_sum = LB_sum + (i - upper_bound)**2
        elif i < lower_bound:
            LB_sum = LB_sum + (i - lower_bound)**2
    return sqrt(LB_sum)

start_time = time.time()
#print(LB_Keogh(ts1, ts3, 20))
#print(LB_Keogh(ts1[:-1],ts2[:-1],5))
#print(time.time() - start_time , "sec")
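Before running these on the full 12,000-point series, a quick check on short slices keeps the quadratic DTW cost manageable (a small sketch: ts1 and ts2 are the c3.large series plotted above, and the slice length of 200 is an arbitrary choice for this check).
In [ ]:
# Quick sanity check (sketch): compare the cheap LB_Keogh lower bound with the windowed
# DTW distance on short slices, so this runs in seconds rather than hours.
a = list(ts1[:200])
b = list(ts2[:200])
print("LB_Keogh :", LB_Keogh(a, b, 5))
print("DTW (w=5):", DTWDistance(a, b, 5))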
In [ ]:
# hierarchical clustering
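# This cell was left as a placeholder. A minimal hierarchical-clustering sketch follows
# (an assumption, not part of the original analysis): build a condensed pairwise DTW
# distance vector over a few short series and feed it to scipy's linkage/dendrogram.
# Kept tiny because DTW is O(n^2) per pair; assumes scipy is installed.
from scipy.cluster.hierarchy import linkage, dendrogram
series = [list(c3[k][:200]) for k in range(4)]   # four short c3.large series from above
cond = []                                        # condensed pairwise distance vector
for a_i in range(len(series)):
    for b_i in range(a_i + 1, len(series)):
        cond.append(DTWDistance(series[a_i], series[b_i], 5))
Z = linkage(np.array(cond), method='average')
dendrogram(Z)
plt.show()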
In [ ]:
# knn
import numpy as np
from sklearn.metrics import classification_report

def knn(train, test, w):
    # 1-nearest-neighbour classifier: each row is a time series with its class label in the last column
    preds = []
    for ind, i in enumerate(test):
        min_dist = float('inf')
        closest_seq = []
        for j in train:
            # only pay for the expensive DTW distance when the cheap lower bound beats the best so far
            if LB_Keogh(i[:-1], j[:-1], 5) < min_dist:
                dist = DTWDistance(i[:-1], j[:-1], w)
                if dist < min_dist:
                    min_dist = dist
                    closest_seq = j
        preds.append(closest_seq[-1])
    return classification_report(test[:, -1], preds)
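Note that knn expects each row to carry its class label in the last column, which the raw spot-price matrix does not have. A tiny synthetic example (purely illustrative; toy_train and toy_test are made-up arrays, not derived from the spot data) shows the expected layout:
In [ ]:
# Illustrative only: two classes of short labelled series in the layout knn() expects,
# i.e. (feature_1, ..., feature_n, label) per row.
toy_train = np.array([[0.1, 0.2, 0.1, 0.2, 0],
                      [1.0, 1.1, 1.0, 1.1, 1]])
toy_test = np.array([[0.2, 0.1, 0.2, 0.1, 0],
                     [1.1, 1.0, 1.1, 1.0, 1]])
print(knn(toy_train, toy_test, 2))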
In [ ]:
train = np.genfromtxt('datasets/train.csv', delimiter='\t')
test = np.genfromtxt('datasets/test.csv', delimiter='\t')
print(train)
print(len(train))
print(train[0])
print(len(train[0]))
In [ ]:
train = prices[:25]
test = prices[25:]
len(train)
train.tofile("train.csv")
test.tofile("test.csv")
print(train)
print(len(train))
print(train[0])
print(len(train[0]))
In [ ]:
start_time = time.time()
#train = np.genfromtxt('train.csv', delimiter='\t')
#test = np.genfromtxt('test.csv', delimiter='\t')
# np.fromfile returns a flat 1-D array, so restore the 25-row shape written above
train = np.fromfile("train.csv").reshape(25, -1)
test = np.fromfile("test.csv").reshape(25, -1)
train2 = np.array(train)
test2 = np.array(test)
#print (knn(train, test, 4))
print(time.time() - start_time , "sec")
In [ ]:
import random
import numpy as np

def k_means_clust(data, num_clust, num_iter, w=5):
    # k-means with DTW as the distance measure and LB_Keogh pruning
    centroids = random.sample(data, num_clust)
    counter = 0
    for n in range(num_iter):
        counter += 1
        print(counter)
        assignments = {}
        # assign data points to clusters
        for ind, i in enumerate(data):
            min_dist = float('inf')
            closest_clust = None
            for c_ind, j in enumerate(centroids):
                if LB_Keogh(i, j, 5) < min_dist:
                    cur_dist = DTWDistance(i, j, w)
                    if cur_dist < min_dist:
                        min_dist = cur_dist
                        closest_clust = c_ind
            if closest_clust in assignments:
                assignments[closest_clust].append(ind)
            else:
                assignments[closest_clust] = [ind]
        # recalculate centroids of clusters
        for key in assignments:
            clust_sum = 0
            for k in assignments[key]:
                clust_sum = clust_sum + data[k]
            centroids[key] = [m / len(assignments[key]) for m in clust_sum]
    return centroids

train = np.genfromtxt('datasets/train.csv', delimiter='\t')
test = np.genfromtxt('datasets/test.csv', delimiter='\t')
data = np.vstack((train[:, :-1], test[:, :-1]))
d = np.vstack(prices)
d.tofile("d.csv")
In [ ]:
# k_means_clust (and the DTW helpers it relies on) is already defined in the cell above and is reused here.
import matplotlib.pylab as plt
# prices.csv is a flat binary dump, so restore the (series x timesteps) shape before clustering
f = np.fromfile("prices.csv").reshape(len(variations), -1)
d = np.vstack(f)
centroids = k_means_clust(list(d), 4, 10, 4)
for i in centroids:
    plt.plot(i)
plt.show()
In [ ]:
#for i in centroids:
# plt.plot(i)
#plt.show()
for i in centroids:
    print(i[0])
    plt.plot(i[0])
plt.show()
In [ ]:
"""
for key in typedfs:
typedfs[key].index = typedfs[key].TimeStamp
typedfs[key] = typedfs[key].drop(['TimeStamp'],axis=1)
#Normalize data
#typedfs[key] = typedfs[key].apply(lambda row: np.log(row).diff(), axis=0 )
typedfs[key] = typedfs[key].diff(axis=0)
corrGraph(key, typedfs[key])
for key in zonedfs:
zonedfs[key].index = zonedfs[key].TimeStamp
zonedfs[key] = zonedfs[key].drop(['TimeStamp'],axis=1)
#Normalize data
#zonedfs[key] = zonedfs[key].apply(lambda row: np.log(row).diff(), axis=0 )
zonedfs[key] = zonedfs[key].diff(axis=0)
corrGraph(key, zonedfs[key])
"""
In [ ]:
# Some rows in the correlation heatmaps are grey because those values are all zero,
# meaning the price never changed for that time series.
#print(typedfs['c3.large']['sa-east-1b'].head(20))