# In [26]:
import urllib
from bs4 import BeautifulSoup
import urllib3
import requests
import time
import csv
import ast
import pandas as pd
from pandas.stats.api import ols
import statsmodels.formula.api as sm
# In [41]:
# Scrape configuration.
# Example of a complete request URL:
#   http://www.ec2instances.info/?region=ap-southeast-2&reserved_term=yrTerm3Standard.allUpfront
urlBase = "http://www.ec2instances.info/?region="

# Reserved-instance term identifiers.
term = [
    "yrTerm3Standard.allUpfront",
    "yrTerm1Standard.partialUpfront",
    "yrTerm1Standard.allUpfront",
    "yrTerm3Standard.partialUpfront",
]

# Instance types to look up; each name is used as a table-row id on the page.
# One line per instance family, in lookup order.
instance_types = [
    "t1.micro",
    "t2.nano", "t2.micro", "t2.small", "t2.medium", "t2.large", "t2.xlarge", "t2.2xlarge",
    "m1.small", "m1.medium", "m1.large", "m1.xlarge",
    "m3.medium", "m3.large", "m3.xlarge", "m3.2xlarge",
    "m4.large", "m4.xlarge", "m4.2xlarge", "m4.4xlarge", "m4.10xlarge", "m4.16xlarge",
    "m2.xlarge", "m2.2xlarge", "m2.4xlarge",
    "cr1.8xlarge",
    "r3.large", "r3.xlarge", "r3.2xlarge", "r3.4xlarge", "r3.8xlarge",
    "r4.large", "r4.xlarge", "r4.2xlarge", "r4.4xlarge", "r4.8xlarge", "r4.16xlarge",
    "x1.16xlarge", "x1.32xlarge",
    "i2.xlarge", "i2.2xlarge", "i2.4xlarge", "i2.8xlarge",
    "hi1.4xlarge",
    "hs1.8xlarge",
    "c1.medium", "c1.xlarge",
    "c3.large", "c3.xlarge", "c3.2xlarge", "c3.4xlarge", "c3.8xlarge",
    "c4.large", "c4.xlarge", "c4.2xlarge", "c4.4xlarge", "c4.8xlarge",
    "cc1.4xlarge",
    "cc2.8xlarge",
    "g2.2xlarge", "g2.8xlarge",
    "cg1.4xlarge",
    "p2.xlarge", "p2.8xlarge", "p2.16xlarge",
    "d2.xlarge", "d2.2xlarge", "d2.4xlarge", "d2.8xlarge",
    "f1.2xlarge", "f1.16xlarge",
]

# Region codes used as keys into the per-region pricing maps.
regions = [
    "us-east-1",
    "us-west-2",
    "us-west-1",
    "eu-west-1",
    "eu-central-1",
    "ap-southeast-1",
    "ap-northeast-1",
    "ap-northeast-2",
    "ap-southeast-2",
    "sa-east-1",
]
# In [54]:
# Scrape Linux on-demand and reserved pricing from the ec2instances.info table
# and collect one flat record per (instance type, region) in `dataHolder`.
#
# Each record holds 'OnDemand', 'InstanceType', 'AvailabilityZone' (which
# stores the region code) plus one key per reserved term found for that region.
dataHolder = []

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed into an
# (empty) result set without any warning.
response = requests.get(
    "http://www.ec2instances.info/?region=ap-southeast-2&reserved_term=yrTerm3Standard.allUpfront",
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

for intype in instance_types:
    # Locate this type's table row and its two Linux pricing cells.
    try:
        row = soup.find('tr', {'id': intype})
        reserved_cell = row.find('td', {'class': "cost-reserved cost-reserved-linux"})
        demand_cell = row.find('td', {'class': "cost-ondemand cost-ondemand-linux"})
        reserved_raw = reserved_cell.get('data-pricing')
        demand_raw = demand_cell.get('data-pricing')
    except AttributeError:
        # find() returned None: the instance type has no pricing data on the
        # page, so there is nothing to record for it.
        continue

    # The data-pricing attributes are parsed as Python literals.
    # NOTE(review): if the site ever emits JSON-only tokens (null/true/false)
    # literal_eval will raise ValueError; json.loads may be the better parser
    # -- confirm against the live markup.
    on_demand_by_region = ast.literal_eval(demand_raw)
    reserved_by_region = ast.literal_eval(reserved_raw)

    for region in regions:
        try:
            on_demand = on_demand_by_region[region]
            if on_demand == "N/A":
                continue
            reserved_terms = reserved_by_region[region]
        except KeyError:
            # This region is not offered for the instance type.
            continue

        # Keep only complete rows: a region with any empty reserved price is
        # dropped entirely.
        # NOTE(review): confirm that dropping (rather than keeping a partial
        # row) is the intended behaviour.
        if not all(reserved_terms.values()):
            continue

        record = {
            'OnDemand': on_demand,
            'InstanceType': intype,
            'AvailabilityZone': region,
        }
        # Merged in one step rather than looping with a variable named `term`,
        # which would overwrite the module-level `term` list.
        record.update(reserved_terms)
        dataHolder.append(record)
# In [55]:
# Turn the collected pricing records into a table (one row per record) and
# preview the first 20 rows.
df = pd.DataFrame(data=dataHolder)
df.head(n=20)
# Out[55]:
# In [56]:
# Save the pricing table as CSV; the path is relative, so the file lands in
# the current working directory.
df.to_csv(path_or_buf='pricing-data.csv')