Working with the static dataset, i.e. loading the previously grabbed spot-price data.
In [197]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline
import time
import dateutil.parser
In [3]:
# Make sure to use the filtered file here
filename = 'data/aws-recent-filtered.csv'
#filename = './data/subset.csv'
In [4]:
start = time.time()
# parse the data file and extract the results
df = pd.read_csv(filename)
print(time.time()-start)
df.head(3)
Out[4]:
In [5]:
# Drop NaNs and stray header rows that were concatenated into the data
df = df.dropna()
df = df[df.AvailabilityZone != "AvailabilityZone"]
# Parse the timestamps and use them as the index
df['TimeStamp'] = pd.to_datetime(df.TimeStamp)
df.index = df.TimeStamp
df = df.drop('TimeStamp', axis=1)
df = df.sort_index()
df.head(3)
Out[5]:
In [6]:
print(len(df))
print(df['InstanceType'].unique())
print(df['AvailabilityZone'].unique())
print(df['SpotPrice'].unique())
For each machine type there exists a region that is more favorable to use: its market volatility is very low and its prices tend to stay cheaper than those of the other regions.
By proving this hypothesis, users will be able to find the best region to bid in, as long as latency is not an issue for them.
Data science tools & techniques: we can use clustering and classification methods; a first sketch of the clustering idea follows.
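A minimal sketch of that clustering idea, assuming the per-combination summary frame dfA (with 'Mean' and 'STD' columns) that is constructed later in this notebook:
In [ ]:
# Minimal clustering sketch (assumes the dfA summary frame built further down,
# with one row per zone/instance-type combination)
from sklearn.cluster import KMeans

feats = dfA[['Mean', 'STD']].dropna()
cluster_labels = KMeans(n_clusters=3, random_state=0).fit_predict(feats)
# Combinations falling in the low-mean, low-volatility cluster are the
# candidate "favorable regions" for the hypothesis above.
print(feats.assign(cluster=cluster_labels).groupby('cluster').mean())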
In [7]:
def corrGraph(title, df):
    # Plot the lower triangle of the correlation matrix as a heatmap
    corr_df = df.corr()
    mask = np.zeros_like(corr_df)
    mask[np.triu_indices_from(mask)] = True
    seaborn.heatmap(corr_df, cmap='RdYlGn_r', vmax=1.0, vmin=-1.0, mask=mask, linewidths=2.5)
    plt.yticks(rotation=0)
    plt.title(title)
    plt.xticks(rotation=90)
    plt.show()
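For example, a quick smoke test of corrGraph on random data (hypothetical, just to show the call):
In [ ]:
# Hypothetical smoke test: corrGraph on a small random frame
demo = pd.DataFrame(np.random.randn(100, 4), columns=list('abcd'))
corrGraph('demo correlations', demo)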
In [13]:
# Restrict to a roughly two-month window and show its extent
df = df.truncate(before='2016-10-13 00:00:00', after='2016-12-11 00:00:00')
print(df.index.min())
print(df.index.max())
print(df.index.max() - df.index.min())
df.head(3)
Out[13]:
In [68]:
# Group the observations by (AvailabilityZone, InstanceType) pair
depa = df.groupby(['AvailabilityZone', 'InstanceType'])
In [75]:
# Initialize a dictionary of empty DataFrames for every zone and every instance type
zonedfs = {}
typedfs = {}
for item in df['InstanceType'].unique():
    typedfs.update({item: pd.DataFrame()})
for item in df['AvailabilityZone'].unique():
    zonedfs.update({item: pd.DataFrame()})

# Fill zonedfs: one DataFrame per zone, with a price column per instance type
for name, group in depa:
    # Build the frame by merging column by column; the first group seeds it
    if zonedfs[name[0]].empty:
        # Resample hourly, taking the average price of each hour
        zonedfs[name[0]] = group
        zonedfs[name[0]] = zonedfs[name[0]].resample('H').mean()
        zonedfs[name[0]] = zonedfs[name[0]].fillna(method="ffill")
        # Reset the index so that we can merge on TimeStamp
        zonedfs[name[0]] = zonedfs[name[0]].reset_index()
        zonedfs[name[0]].rename(columns={'SpotPrice': name[1]}, inplace=True)
    else:
        # Resample hourly, taking the average price of each hour
        group1 = group.resample('H').mean()
        group1 = group1.fillna(method="ffill")
        group1.rename(columns={'SpotPrice': name[1]}, inplace=True)
        # Reset the index so that we can merge on TimeStamp
        group1 = group1.reset_index()
        zonedfs[name[0]] = zonedfs[name[0]].merge(group1, how='right')

# Fill typedfs: one DataFrame per instance type, with a price column per zone
for name, group in depa:
    if typedfs[name[1]].empty:
        typedfs[name[1]] = group
        typedfs[name[1]] = typedfs[name[1]].resample('H').mean()
        typedfs[name[1]] = typedfs[name[1]].fillna(method="ffill")
        typedfs[name[1]].rename(columns={'SpotPrice': name[0]}, inplace=True)
        typedfs[name[1]] = typedfs[name[1]].reset_index()
    else:
        group1 = group.resample('H').mean()
        group1 = group1.fillna(method="ffill")
        group1.rename(columns={'SpotPrice': name[0]}, inplace=True)
        group1 = group1.reset_index()
        typedfs[name[1]] = typedfs[name[1]].merge(group1, how='right')
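The two loops above can be expressed more compactly with groupby/resample/unstack. A sketch of an equivalent reshaping, assuming df is still indexed by TimeStamp with SpotPrice, AvailabilityZone, and InstanceType columns as above:
In [ ]:
# Sketch of an equivalent, more concise reshaping (same assumptions as above)
hourly = (df.groupby(['AvailabilityZone', 'InstanceType'])['SpotPrice']
            .resample('H').mean()
            .groupby(level=['AvailabilityZone', 'InstanceType']).ffill())
zonedfs_alt = {z: hourly.xs(z, level='AvailabilityZone').unstack('InstanceType')
               for z in df['AvailabilityZone'].unique()}
typedfs_alt = {t: hourly.xs(t, level='InstanceType').unstack('AvailabilityZone')
               for t in df['InstanceType'].unique()}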
In [76]:
# Reindex time by timestamp
for key in typedfs:
    typedfs[key].index = typedfs[key].TimeStamp
    typedfs[key] = typedfs[key].drop(['TimeStamp'], axis=1)
for key in zonedfs:
    zonedfs[key].index = zonedfs[key].TimeStamp
    zonedfs[key] = zonedfs[key].drop(['TimeStamp'], axis=1)
In [96]:
typedfs['r4.large'].head(3)
Out[96]:
In [82]:
# Normalize the data: first differences of each price series
tN = typedfs.copy()
zN = zonedfs.copy()
for key in tN:
    #tN[key] = tN[key].apply(lambda row: np.log(row).diff(), axis=0)
    tN[key] = tN[key].diff(axis=0)
    corrGraph(key, tN[key])
for key in zN:
    zN[key] = zN[key].diff(axis=0)
    corrGraph(key, zN[key])
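The commented-out line above hints at an alternative normalization; here is a sketch using log differences (log returns) instead of raw first differences, which makes series at different price levels more comparable (it assumes strictly positive prices):
In [ ]:
# Alternative normalization sketch: log differences instead of raw differences
# (assumes strictly positive prices; zeros would produce -inf)
tLog = {key: np.log(frame).diff() for key, frame in typedfs.items()}
for key in tLog:
    corrGraph(key + ' (log returns)', tLog[key])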
In [87]:
# Compute summary statistics per (InstanceType, AvailabilityZone) pair
tavg = typedfs
dfAVG = []
dfA = pd.DataFrame()
# Columns: InstanceType, AvailabilityZone, Mean, Median, Max, Min, STD
for key in tavg:
    for column in tavg[key]:
        dfAVG.append({'InstanceType': key, 'AvailabilityZone': column,
                      'Mean': tavg[key][column].mean(), 'Median': tavg[key][column].median(),
                      'Max': tavg[key][column].max(), 'Min': tavg[key][column].min(),
                      'STD': tavg[key][column].std()})
dfA = pd.DataFrame(dfAVG)
print(dfA.head(3))
print(len(dfA))
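The same statistics table can be produced without the nested loop; a compact equivalent sketch using DataFrame.agg:
In [ ]:
# Compact equivalent of the stats loop using DataFrame.agg
dfA_alt = pd.concat(
    {key: frame.agg(['mean', 'median', 'max', 'min', 'std']).T
     for key, frame in typedfs.items()},
    names=['InstanceType', 'AvailabilityZone']).reset_index()
print(dfA_alt.head(3))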
In [89]:
dfPricing = pd.read_csv("data/pricing-data.csv")
In [95]:
dfPricing.drop('Unnamed: 0', axis=1,inplace=True)
dfPricing.head(3)
Out[95]:
In [107]:
az = ['eu-west-1b', 'eu-west-1c', 'eu-west-1a', 'us-west-2c', 'ap-northeast-2c',
      'ap-northeast-2a', 'us-west-1b', 'us-west-1c', 'us-west-2b', 'us-west-2a',
      'ap-southeast-2c', 'eu-central-1b', 'ap-southeast-2b', 'ap-southeast-2a',
      'eu-central-1a', 'us-east-1a', 'us-east-1e', 'sa-east-1c', 'sa-east-1a',
      'ap-southeast-1a', 'ap-southeast-1b', 'us-east-1d', 'ap-northeast-1c',
      'ap-northeast-1a', 'us-east-1c']
dfPricingAdj = pd.DataFrame()
In [126]:
# Expand dfPricing's region names into full Availability Zones (a, b, c, ...)
count = 0
for item in az:
    azM = item[:-1]  # region name = AZ without its trailing letter
    if count == 0:
        dfPricingAdj = dfPricing[dfPricing.AvailabilityZone == azM].copy()
        dfPricingAdj.replace(to_replace=azM, value=item, inplace=True)
        count = count + 1
    else:
        dfTemp = dfPricing[dfPricing.AvailabilityZone == azM].copy()
        dfTemp.replace(to_replace=azM, value=item, inplace=True)
        dfPricingAdj = dfPricingAdj.append(dfTemp)
dfPricingAdj = dfPricingAdj.reset_index().drop('index', axis=1)
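A vectorized alternative to the loop above (a sketch; it assumes dfPricing carries the region name in its AvailabilityZone column, as the loop implies):
In [ ]:
# Sketch: derive the region from each AZ and merge pricing on it directly
regions = pd.DataFrame({'AvailabilityZone': az})
regions['Region'] = regions['AvailabilityZone'].str[:-1]
dfPricingAdj_alt = regions.merge(
    dfPricing.rename(columns={'AvailabilityZone': 'Region'}),
    on='Region', how='inner').drop('Region', axis=1)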
In [128]:
#Merge the prices with the stats
dfA = dfA.merge(dfPricingAdj,how='right')
dfA.head(3)
Out[128]:
In [130]:
dfA.to_csv("data/instance_price_stats.csv")
In [221]:
dfA = pd.read_csv("data/instance_price_stats.csv")
dfA.drop('Unnamed: 0', axis=1,inplace=True)
In [222]:
# Analysis: subtract the spot Mean & Max from the published prices.
# Keep only the cheapest option of each reserved term.
dfA = dfA.drop(['yrTerm1Standard.partialUpfront', 'yrTerm1Standard.noUpfront',
                'yrTerm3Standard.partialUpfront', 'yrTerm3Convertible.noUpfront',
                'yrTerm3Convertible.partialUpfront', 'yrTerm3Convertible.allUpfront'], axis=1)
#dfA.head(2)
In [223]:
dfMean = dfA.copy()
dfMax = dfA.copy()
dfMean = dfMean.drop(['Max', 'Min', 'STD', 'Median'], axis=1)
dfMax = dfMax.drop(['Mean', 'Min', 'STD', 'Median'], axis=1)
In [224]:
dfMean.head(1)
Out[224]:
In [225]:
dfMax.head(1)
Out[225]:
In [226]:
def npCount(dfin):
    # Count positive/negative price differences for each pricing option
    NPcounter = {'odp': 0, 'odn': 0, '1p': 0, '1n': 0, '3p': 0, '3n': 0}
    for row in dfin['OnDemand']:
        if row > 0:
            NPcounter['odp'] = NPcounter['odp'] + 1
        else:
            NPcounter['odn'] = NPcounter['odn'] + 1
    for row in dfin['yrTerm1Standard.allUpfront']:
        if row > 0:
            NPcounter['1p'] = NPcounter['1p'] + 1
        else:
            NPcounter['1n'] = NPcounter['1n'] + 1
    for row in dfin['yrTerm3Standard.allUpfront']:
        if row > 0:
            NPcounter['3p'] = NPcounter['3p'] + 1
        else:
            NPcounter['3n'] = NPcounter['3n'] + 1
    return NPcounter
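The counting can also be done without explicit loops; a vectorized sketch:
In [ ]:
# Vectorized equivalent of npCount: count positive and non-positive differences
def npCountVec(dfin):
    cols = ['OnDemand', 'yrTerm1Standard.allUpfront', 'yrTerm3Standard.allUpfront']
    pos = (dfin[cols] > 0).sum()   # fixed price above the spot statistic
    neg = (dfin[cols] <= 0).sum()  # fixed price at or below the spot statistic
    return pos, neg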
In [227]:
# Positive values mean the fixed price exceeds the average spot price
dfMean['OnDemand'] = dfMean['OnDemand'].sub(dfMean['Mean'])
dfMean['yrTerm1Standard.allUpfront'] = dfMean['yrTerm1Standard.allUpfront'].sub(dfMean['Mean'])
dfMean['yrTerm3Standard.allUpfront'] = dfMean['yrTerm3Standard.allUpfront'].sub(dfMean['Mean'])
dfMean.head(1)
Out[227]:
In [228]:
print("Median OnDemand "+ str(dfMean['OnDemand'].median()))
print("Median 1 Yr Upfront "+ str(dfMean['yrTerm1Standard.allUpfront'].median()))
print("Median 3 Yr Upfront "+ str(dfMean['yrTerm3Standard.allUpfront'].median()))
print("AVG OnDemand "+ str(dfMean['OnDemand'].mean()))
print("AVG 1 Yr Upfront "+ str(dfMean['yrTerm1Standard.allUpfront'].mean()))
print("AVG 3 Yr Upfront "+ str(dfMean['yrTerm3Standard.allUpfront'].mean()))
print("Min OnDemand "+ str(dfMean['OnDemand'].min()))
print("Min 1 Yr Upfront "+ str(dfMean['yrTerm1Standard.allUpfront'].min()))
print("Min 3 Yr Upfront "+ str(dfMean['yrTerm3Standard.allUpfront'].min()))
print("Max OnDemand "+ str(dfMean['OnDemand'].max()))
print("Max 1 Yr Upfront "+ str(dfMean['yrTerm1Standard.allUpfront'].max()))
print("Max 3 Yr Upfront "+ str(dfMean['yrTerm3Standard.allUpfront'].max()))
countRes = npCount(dfMean)
print("Positive OnDemand "+ str(countRes['odp']))
print("Negative OnDemand "+ str(countRes['odn']))
print("Positive 1 yr "+ str(countRes['1p']))
print("Negative 1 yr "+ str(countRes['1n']))
print("Positive 3 yr "+ str(countRes['3p']))
print("Negative 3 yr "+ str(countRes['3n']))
In [229]:
# Positive values mean the fixed price exceeds even the spot-price maximum
dfMax['OnDemand'] = dfMax['OnDemand'].sub(dfMax['Max'])
dfMax['yrTerm1Standard.allUpfront'] = dfMax['yrTerm1Standard.allUpfront'].sub(dfMax['Max'])
dfMax['yrTerm3Standard.allUpfront'] = dfMax['yrTerm3Standard.allUpfront'].sub(dfMax['Max'])
dfMax.head(1)
Out[229]:
In [230]:
print("MEDIAN OnDemand "+ str(dfMax['OnDemand'].median()))
print("MEDIAN 1 Yr Upfront "+ str(dfMax['yrTerm1Standard.allUpfront'].median()))
print("MEDIAN 3 Yr Upfront "+ str(dfMax['yrTerm3Standard.allUpfront'].median()))
print("AVG OnDemand "+ str(dfMax['OnDemand'].mean()))
print("AVG 1 Yr Upfront "+ str(dfMax['yrTerm1Standard.allUpfront'].mean()))
print("AVG 3 Yr Upfront "+ str(dfMax['yrTerm3Standard.allUpfront'].mean()))
print("MIN OnDemand "+ str(dfMax['OnDemand'].min()))
print("MIN 1 Yr Upfront "+ str(dfMax['yrTerm1Standard.allUpfront'].min()))
print("MIN 3 Yr Upfront "+ str(dfMax['yrTerm3Standard.allUpfront'].min()))
print("MAX OnDemand "+ str(dfMax['OnDemand'].max()))
print("MAX 1 Yr Upfront "+ str(dfMax['yrTerm1Standard.allUpfront'].max()))
print("MAX 3 Yr Upfront "+ str(dfMax['yrTerm3Standard.allUpfront'].max()))
countRes = npCount(dfMax)
print("Positive OnDemand "+ str(countRes['odp']))
print("Negative OnDemand "+ str(countRes['odn']))
print("Positive 1 yr "+ str(countRes['1p']))
print("Negative 1 yr "+ str(countRes['1n']))
print("Positive 3 yr "+ str(countRes['3p']))
print("Negative 3 yr "+ str(countRes['3n']))
In [ ]: