Spot Resource Analytics

Here we perform some initial processing and analysis of the dataset.


We work with the static dataset here, i.e. we load the previously grabbed data.


In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [ ]:
# parse the data file and extract the results
filename = "subset.csv"

df = pd.read_csv(filename)
df.head(20)

In [ ]:
#df.columns = ["info", "SpotPrice", "TimeStamp", "InstanceType", "OS type", "AvailabilityZone"]
df['TimeStamp'] = pd.to_datetime(df.TimeStamp)

df.index = df.TimeStamp
#df = df.drop('info', 1).drop(['OS type'],axis=1)
df = df.drop(['TimeStamp'],axis=1).sort_index()
 

df.head(15)
#print (df['InstanceType'].unique())
#print (df['AvailabilityZone'].unique())

Hypothesis #2

For each machine type there exists a region that is more favorable to use: its market volatility is very low and its prices tend to stay cheaper than those of the other regions.

Proving this hypothesis would let users find the best region to bid in, as long as latency is not an issue for them.

Data science tools & techniques: we can use clustering and classification methods; a quick per-zone price summary is sketched below.

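As a first, rough check of this hypothesis we can compare the price level and volatility of each availability zone directly on df. This is only a sketch of the idea, not the clustering/classification analysis itself, and "c3.large" is just an illustrative instance type:

In [ ]:
# per-zone price level and volatility for one illustrative instance type;
# a zone with both a low mean and a low std would support the hypothesis
c3l = df[df.InstanceType == "c3.large"]
summary = c3l.groupby("AvailabilityZone")["SpotPrice"].agg(["mean", "std"])
print(summary.sort_values("mean"))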

In [ ]:
def corrGraph(title, df):
    # plot a correlation heatmap, masking the redundant upper triangle
    corr_df = df.corr()
    mask = np.zeros_like(corr_df)
    mask[np.triu_indices_from(mask)] = True
    seaborn.heatmap(corr_df, cmap='RdYlGn_r', vmax=1.0, vmin=-1.0, mask=mask, linewidths=2.5)
    plt.yticks(rotation=0)
    plt.title(title)
    plt.xticks(rotation=90)
    plt.show()

In [ ]:
# Some info about the data
print (df.index.min())
print (df.index.max())
print(df.index.max()- df.index.min()) 
df = df.truncate(before='2016-10-11', after='2016-12-12')
df.head(3)

In [ ]:
def awsResampler(df):
    # resample the prices hourly per (AvailabilityZone, InstanceType) pair
    dfSorted = df.groupby(['AvailabilityZone', 'InstanceType'])
    dfSorted = dfSorted.resample('H').mean()
    dfSorted = dfSorted.fillna(method="ffill")

    # Round-trip through a CSV to flatten the MultiIndex created by the grouping
    # TODO: investigate a faster, in-memory approach (see the sketch after this cell)
    dfSorted.to_csv("im.csv")
    depa = pd.read_csv("im.csv")
    depa = depa.groupby(['AvailabilityZone', 'InstanceType'])
    return depa

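The CSV round-trip above only exists to flatten the MultiIndex produced by groupby().resample(); a possible in-memory alternative using reset_index() is sketched below. This is untested against this exact dataset and pandas version, so treat it as a starting point for the TODO rather than a drop-in replacement:

In [ ]:
def awsResamplerInMemory(df):
    # resample hourly per (zone, instance type), then flatten the MultiIndex
    # back into columns instead of writing and re-reading an intermediate CSV
    hourly = (df.groupby(['AvailabilityZone', 'InstanceType'])
                .resample('H').mean()
                .fillna(method="ffill")
                .reset_index())
    return hourly.groupby(['AvailabilityZone', 'InstanceType'])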
In [ ]:
"""
depa = awsResampler(df)
# Initialize dictionary of all combos of dfs we want to graph and corr
zonedfs={}
typedfs={}
for item in df['InstanceType'].unique():
    typedfs.update({item: pd.DataFrame()})
for item in df['AvailabilityZone'].unique():
    zonedfs.update({item: pd.DataFrame()})

#Fill zonedfs with dataframes of every machine type's pricing in that zone
for name, group in depa:
    if zonedfs[name[0]].empty:
        zonedfs[name[0]] = group
        zonedfs[name[0]] = zonedfs[name[0]].drop('InstanceType', axis=1).drop(['AvailabilityZone'],axis=1)
        zonedfs[name[0]].rename(columns = {'SpotPrice':name[1]}, inplace = True)
    else:
        group1 = group.drop('InstanceType', axis=1).drop(['AvailabilityZone'],axis=1)
        group1.rename(columns = {'SpotPrice':name[1]}, inplace = True)
        zonedfs[name[0]] = zonedfs[name[0]].merge(group1,how='right')

#Fill typedfs with dataframes of that machine type's pricing across all zones
for name, group in depa:
    if typedfs[name[1]].empty:
        typedfs[name[1]] = group
        typedfs[name[1]] = typedfs[name[1]].drop('InstanceType', axis=1).drop(['AvailabilityZone'],axis=1)
        typedfs[name[1]].rename(columns = {'SpotPrice':name[0]}, inplace = True)
    else:
        group1 = group.drop('InstanceType', axis=1).drop(['AvailabilityZone'],axis=1)
        group1.rename(columns = {'SpotPrice':name[0]}, inplace = True)
        typedfs[name[1]] = typedfs[name[1]].merge(group1,how='right')
"""

Time Series Clustering (TSC)


In [ ]:
# generate per-availability-zone time series

df_us_west_one_a = df[df.AvailabilityZone == "us-west-1a"]
df_us_west_one_b = df[df.AvailabilityZone == "us-west-1b"]

df_us_east_one_a = df[df.AvailabilityZone == "us-east-1a"]
df_us_east_one_b = df[df.AvailabilityZone == "us-east-1b"]
df_us_east_one_c = df[df.AvailabilityZone == "us-east-1c"]
df_us_east_one_d = df[df.AvailabilityZone == "us-east-1d"]

df_ap_southeast_one_a = df[df.AvailabilityZone == "ap-southeast-1a"]
df_ap_southeast_one_b = df[df.AvailabilityZone == "ap-southeast-1b"]

df_ap_southeast_two_a = df[df.AvailabilityZone == "ap-southeast-2a"]
df_ap_southeast_two_b = df[df.AvailabilityZone == "ap-southeast-2b"]

#train = np.genfromtxt('datasets/train.csv', delimiter='\t')
#test = np.genfromtxt('datasets/test.csv', delimiter='\t')

#print(type(train))


def get_ts_data(inst_type):
    # build a dict of SpotPrice series for the given instance type,
    # one entry per availability-zone dataframe defined above
    type_dict = {}
    i = 0
    for dff in (df_us_west_one_a, df_us_west_one_b, df_ap_southeast_two_a, df_ap_southeast_two_b,
                df_us_east_one_a, df_us_east_one_b, df_us_east_one_c, df_us_east_one_d,
                df_ap_southeast_one_a, df_ap_southeast_one_b):

        df2 = dff[dff.InstanceType == inst_type]
        type_dict[i] = df2["SpotPrice"]
        i = i + 1
    return type_dict

c3 = get_ts_data("c3.large")
c3_x = get_ts_data("c3.xlarge")
c3_2x = get_ts_data("c3.2xlarge")
c3_4x = get_ts_data("c3.4xlarge")
c3_8x = get_ts_data("c3.8xlarge")

print(type(c3))

print(len(c3[0]))
ts1 = c3[0]
ts2 = c3[1]
ts3 = c3_8x[2]
ts1.plot()
ts2.plot()
ts3.plot()

plt.ylim(-2,10)
plt.legend(['ts1','ts2','ts3'])
plt.show()

In [ ]:
# take the first 12000 points of each (type, zone) series
# (assumes every per-zone series has at least that many samples)
variations = []
for d in (c3, c3_x, c3_2x, c3_4x, c3_8x):
    for i in range(10):
        variations.append(d[i][:12000])
print(len(variations[0]))

# note: ndarray.tofile writes a flat raw binary dump (despite the .csv name),
# so the 2-D shape has to be restored with reshape when reading the file back
prices = np.array(variations)
prices.tofile("prices.csv")

In [ ]:
# DTW
import time
from math import sqrt

# exact DTW distance, O(len(s1)*len(s2)) (note: shadowed by the windowed version below)
def DTWDistance(s1, s2):
    DTW={}
    
    for i in range(len(s1)):
        DTW[(i, -1)] = float('inf')
    for i in range(len(s2)):
        DTW[(-1, i)] = float('inf')
    DTW[(-1, -1)] = 0

    for i in range(len(s1)):
        for j in range(len(s2)):
            dist= (s1[i]-s2[j])**2
            DTW[(i, j)] = dist + min(DTW[(i-1, j)],DTW[(i, j-1)], DTW[(i-1, j-1)])
		
    return sqrt(DTW[len(s1)-1, len(s2)-1])


# windowed DTW (Sakoe-Chiba band of width w); this definition overrides the one above
def DTWDistance(s1, s2, w):
    DTW={}
    
    w = max(w, abs(len(s1)-len(s2)))
    
    for i in range(-1,len(s1)):
        for j in range(-1,len(s2)):
            DTW[(i, j)] = float('inf')
    DTW[(-1, -1)] = 0
  
    for i in range(len(s1)):
        for j in range(max(0, i-w), min(len(s2), i+w)):
            dist= (s1[i]-s2[j])**2
            DTW[(i, j)] = dist + min(DTW[(i-1, j)],DTW[(i, j-1)], DTW[(i-1, j-1)])
		
    return sqrt(DTW[len(s1)-1, len(s2)-1])

# LB_Keogh lower bound, used to cheaply prune candidates before running the full DTW
def LB_Keogh(s1, s2, r):
    LB_sum=0
    for ind,i in enumerate(s1):
        
        lower_bound=min(s2[(ind-r if ind-r>=0 else 0):(ind+r)])
        upper_bound=max(s2[(ind-r if ind-r>=0 else 0):(ind+r)])
        
        if i>upper_bound:
            LB_sum=LB_sum+(i-upper_bound)**2
        elif i<lower_bound:
            LB_sum=LB_sum+(i-lower_bound)**2
    
    return sqrt(LB_sum)

start_time = time.time()
# timing scratchpad for the distance functions (calls kept commented out)
#print(LB_Keogh(ts1, ts3, 20))
#print(LB_Keogh(ts1[:-1],ts2[:-1],5))
#print(time.time() - start_time , "sec")

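As a quick sanity check of these functions (toy values, not spot data), we can compare the cheap LB_Keogh bound with the full windowed DTW distance on two short series; for same-length series the bound should not exceed the DTW distance:

In [ ]:
# toy sanity check: LB_Keogh is a cheap pruning bound, DTWDistance(s1, s2, w) is
# the exact windowed distance that is only computed when the bound looks promising
toy_a = [1.0, 1.1, 1.0, 1.2, 1.1, 1.0]
toy_b = [1.0, 1.0, 1.3, 1.2, 1.1, 1.1]
print("LB_Keogh:", LB_Keogh(toy_a, toy_b, 2))
print("DTW     :", DTWDistance(toy_a, toy_b, 2))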
In [ ]:
# hierarchical clustering (placeholder)

In [ ]:
# knn
import numpy as np
from sklearn.metrics import classification_report

def knn(train, test, w):
    # 1-NN classification: for each test series, find the closest training series
    # (LB_Keogh prunes candidates cheaply before the exact windowed DTW is run)
    # and predict that neighbour's label, stored in the last column of each row
    preds = []
    for ind, i in enumerate(test):
        min_dist = float('inf')
        closest_seq = []
        for j in train:
            if LB_Keogh(i[:-1], j[:-1], 5) < min_dist:
                dist = DTWDistance(i[:-1], j[:-1], w)
                if dist < min_dist:
                    min_dist = dist
                    closest_seq = j
        preds.append(closest_seq[-1])
    return classification_report(test[:, -1], preds)

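Note that knn expects the last entry of every row to be a class label (classification_report compares test[:, -1] with the predicted labels), whereas prices holds raw price series only, which is why the knn call further down stays commented out. Below is a minimal sketch, assuming prices has the (50, 12000) shape built above, of how labelled rows could look; the instance-type index used as the label here is purely illustrative:

In [ ]:
# hypothetical labelling: append an instance-type index (0-4) as the final column,
# matching the type-major order in which `variations` was built (10 zones per type)
labels = np.repeat(np.arange(5), 10).reshape(-1, 1)
labeled_prices = np.hstack([prices, labels])
print(labeled_prices.shape)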
In [ ]:
train = np.genfromtxt('datasets/train.csv', delimiter='\t')
test = np.genfromtxt('datasets/test.csv', delimiter='\t')

print(train)
print(len(train))
print(train[0])
print(len(train[0]))

In [ ]:
# split the 50 price series into train and test halves
train = prices[:25]
test = prices[25:]
len(train)
train.tofile("train.csv")
test.tofile("test.csv")
print(train)
print(len(train))
print(train[0])
print(len(train[0]))

In [ ]:
start_time = time.time()
#train = np.genfromtxt('train.csv', delimiter='\t')
#test = np.genfromtxt('test.csv', delimiter='\t')
# np.fromfile returns a flat float64 array, so restore the (series, 12000) shape
train = np.fromfile("train.csv").reshape(-1, 12000)
test = np.fromfile("test.csv").reshape(-1, 12000)
train2 = np.array(train)
test2 = np.array(test)
#print (knn(train, test, 4))

print(time.time() - start_time , "sec")

In [ ]:
import random
import numpy as np

def k_means_clust(data, num_clust, num_iter, w=5):
    # k-means over time series, using LB_Keogh to prune candidates and the
    # windowed DTW distance when assigning series to centroids
    centroids = random.sample(data, num_clust)
    counter = 0
    for n in range(num_iter):
        counter += 1
        print(counter)
        assignments = {}
        # assign data points to clusters
        for ind, i in enumerate(data):
            min_dist = float('inf')
            closest_clust = None
            for c_ind, j in enumerate(centroids):
                if LB_Keogh(i, j, 5) < min_dist:
                    cur_dist = DTWDistance(i, j, w)
                    if cur_dist < min_dist:
                        min_dist = cur_dist
                        closest_clust = c_ind
            if closest_clust in assignments:
                assignments[closest_clust].append(ind)
            else:
                assignments[closest_clust] = [ind]

        # recalculate centroids of clusters
        for key in assignments:
            clust_sum = 0
            for k in assignments[key]:
                clust_sum = clust_sum + data[k]
            centroids[key] = [m / len(assignments[key]) for m in clust_sum]

    return centroids

train = np.genfromtxt('datasets/train.csv', delimiter='\t')
test = np.genfromtxt('datasets/test.csv', delimiter='\t')
data=np.vstack((train[:,:-1],test[:,:-1]))
d = np.vstack(prices)
d.tofile("d.csv")

In [ ]:
import matplotlib.pylab as plt


# prices.csv is a flat binary dump of the 50 series x 12000 points written above;
# restore the 2-D shape before clustering
f = np.fromfile("prices.csv")
d = f.reshape(-1, 12000)

centroids=k_means_clust(list(d),4,10,4)
for i in centroids:
    
    plt.plot(i)

plt.show()

In [ ]:
#for i in centroids:    
#    plt.plot(i)
#plt.show()

for i in centroids:
    print(i[0])
    plt.plot(i[0])
plt.show()

In [ ]:
"""
for key in typedfs:
    typedfs[key].index = typedfs[key].TimeStamp
    typedfs[key]       = typedfs[key].drop(['TimeStamp'],axis=1)
    #Normalize data
        #typedfs[key] = typedfs[key].apply(lambda row: np.log(row).diff(), axis=0 )
    typedfs[key] = typedfs[key].diff(axis=0)
    corrGraph(key, typedfs[key])
    
for key in zonedfs:
    zonedfs[key].index = zonedfs[key].TimeStamp
    zonedfs[key]       = zonedfs[key].drop(['TimeStamp'],axis=1)
    #Normalize data
        #zonedfs[key] = zonedfs[key].apply(lambda row: np.log(row).diff(), axis=0 )
    zonedfs[key] = zonedfs[key].diff(axis=0)
    corrGraph(key, zonedfs[key])
"""

In [ ]:
#  Some rows in the heatmaps render grey because the differenced values for
#  that series are all zero, i.e. the price never changed over that period.

#print(typedfs['c3.large']['sa-east-1b'].head(20))
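To see which series those are (a small sketch that assumes the typedfs dictionary from the commented-out block above has actually been built and differenced), we can list the columns whose differenced prices never deviate from zero:

In [ ]:
# assumes typedfs was built and differenced as in the commented-out cell above;
# a column whose absolute differences sum to zero never changed price at all
flat = [col for col in typedfs['c3.large'].columns
        if typedfs['c3.large'][col].abs().sum() == 0]
print(flat)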