Grab Actual Pricing

The code below scrapes http://www.ec2instances.info/ for the actual on-demand and reserved prices of each EC2 instance type in each region.


In [26]:
import urllib
from bs4 import BeautifulSoup
import urllib3
import requests
import time
import csv
import ast
import pandas as pd
from pandas.stats.api import ols
import statsmodels.formula.api as sm

In [41]:
# Scraper configuration.
# Example of a full page URL:
#   http://www.ec2instances.info/?region=ap-southeast-2&reserved_term=yrTerm3Standard.allUpfront
urlBase = "http://www.ec2instances.info/?region="

# Reserved-instance term identifiers.
term = [
    "yrTerm3Standard.allUpfront",
    "yrTerm1Standard.partialUpfront",
    "yrTerm1Standard.allUpfront",
    "yrTerm3Standard.partialUpfront",
]

# Instance types to look up, one family per line. The order is significant:
# it determines the order in which rows are collected.
instance_types = [
    't1.micro',
    't2.nano', 't2.micro', 't2.small', 't2.medium', 't2.large', 't2.xlarge', 't2.2xlarge',
    'm1.small', 'm1.medium', 'm1.large', 'm1.xlarge',
    'm3.medium', 'm3.large', 'm3.xlarge', 'm3.2xlarge',
    'm4.large', 'm4.xlarge', 'm4.2xlarge', 'm4.4xlarge', 'm4.10xlarge', 'm4.16xlarge',
    'm2.xlarge', 'm2.2xlarge', 'm2.4xlarge',
    'cr1.8xlarge',
    'r3.large', 'r3.xlarge', 'r3.2xlarge', 'r3.4xlarge', 'r3.8xlarge',
    'r4.large', 'r4.xlarge', 'r4.2xlarge', 'r4.4xlarge', 'r4.8xlarge', 'r4.16xlarge',
    'x1.16xlarge', 'x1.32xlarge',
    'i2.xlarge', 'i2.2xlarge', 'i2.4xlarge', 'i2.8xlarge',
    'hi1.4xlarge',
    'hs1.8xlarge',
    'c1.medium', 'c1.xlarge',
    'c3.large', 'c3.xlarge', 'c3.2xlarge', 'c3.4xlarge', 'c3.8xlarge',
    'c4.large', 'c4.xlarge', 'c4.2xlarge', 'c4.4xlarge', 'c4.8xlarge',
    'cc1.4xlarge',
    'cc2.8xlarge',
    'g2.2xlarge', 'g2.8xlarge',
    'cg1.4xlarge',
    'p2.xlarge', 'p2.8xlarge', 'p2.16xlarge',
    'd2.xlarge', 'd2.2xlarge', 'd2.4xlarge', 'd2.8xlarge',
    'f1.2xlarge', 'f1.16xlarge',
]

# Regions to extract pricing for.
regions = [
    'us-east-1',
    'us-west-2',
    'us-west-1',
    'eu-west-1',
    'eu-central-1',
    'ap-southeast-1',
    'ap-northeast-1',
    'ap-northeast-2',
    'ap-southeast-2',
    'sa-east-1',
]

In [54]:
# Scrape the pricing page once and flatten it into one record per
# (instance type, region) pair, collected in `dataHolder`.
#
# For each instance type the page is searched for a <tr id="<type>"> row.
# Its reserved and on-demand Linux cells carry a `data-pricing` attribute,
# which is parsed as a Python literal and indexed by region; the reserved
# entry for a region is in turn a mapping of term name -> price.
dataHolder = []
response = requests.get(
    "http://www.ec2instances.info/?region=ap-southeast-2&reserved_term=yrTerm3Standard.allUpfront",
    timeout=30,  # without a timeout a stalled connection hangs the kernel forever
)
# Fail loudly on an HTTP error instead of parsing an error page into zero rows.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

for intype in instance_types:
    # Skip instance types that have no row on the page.
    trhold = soup.find('tr', {'id': intype})
    if trhold is None:
        continue

    # Skip rows that lack either pricing cell.
    resCell = trhold.find('td', {'class': "cost-reserved cost-reserved-linux"})
    demandCell = trhold.find('td', {'class': "cost-ondemand cost-ondemand-linux"})
    if resCell is None or demandCell is None:
        continue

    # Skip cells without the attribute; literal_eval(None) would raise ValueError.
    resCost = resCell.get('data-pricing')
    demandCost = demandCell.get('data-pricing')
    if resCost is None or demandCost is None:
        continue

    # literal_eval only evaluates literals, so it is safe on scraped text.
    dC = ast.literal_eval(demandCost)
    rC = ast.literal_eval(resCost)

    for region in regions:
        # The region must be priced both on demand and reserved.
        if region not in dC or region not in rC:
            continue
        if dC[region] == "N/A":
            continue

        reserved = rC[region]
        # Drop the whole record if any reserved term has an empty/falsy price.
        if not all(reserved.values()):
            continue

        dh = {
            'OnDemand': dC[region],
            'InstanceType': intype,
            # NOTE: this column holds a region name, not an availability zone;
            # the key is kept as-is because it becomes the CSV column header.
            'AvailabilityZone': region,
        }
        # One column per reserved term. No loop variable named `term` is used,
        # so the `term` list from the configuration cell is not overwritten.
        dh.update(reserved)
        dataHolder.append(dh)

In [55]:
# Turn the collected records into a table: one row per record, one column per key.
df = pd.DataFrame(data=dataHolder)
# Preview the first 20 rows as this cell's output.
df.head(n=20)


Out[55]:
AvailabilityZone InstanceType OnDemand yrTerm1Standard.allUpfront yrTerm1Standard.noUpfront yrTerm1Standard.partialUpfront yrTerm3Convertible.allUpfront yrTerm3Convertible.noUpfront yrTerm3Convertible.partialUpfront yrTerm3Standard.allUpfront yrTerm3Standard.partialUpfront
0 us-east-1 t1.micro 0.02 0.012 0.014 0.012 NaN NaN NaN 0.008 0.009
1 us-west-2 t1.micro 0.02 0.012 0.014 0.012 NaN NaN NaN 0.008 0.009
2 us-west-1 t1.micro 0.025 0.015 0.017 0.015 NaN NaN NaN 0.011 0.012
3 eu-west-1 t1.micro 0.02 0.015 0.016 0.015 NaN NaN NaN 0.011 0.012
4 ap-southeast-1 t1.micro 0.02 0.015 0.016 0.015 NaN NaN NaN 0.011 0.012
5 ap-northeast-1 t1.micro 0.026 0.016 0.018 0.016 NaN NaN NaN 0.012 0.013
6 ap-southeast-2 t1.micro 0.02 0.015 0.016 0.015 NaN NaN NaN 0.011 0.012
7 sa-east-1 t1.micro 0.027 0.016 0.019 0.017 NaN NaN NaN 0.011 0.012
8 us-east-1 t2.nano 0.0059 0.004 0.005 0.004 0.004 0.004 0.004 0.003 0.003
9 us-west-2 t2.nano 0.0059 0.004 0.005 0.004 0.004 0.004 0.004 0.003 0.003
10 us-west-1 t2.nano 0.0077 0.005 0.006 0.005 0.005 0.005 0.005 0.004 0.004
11 eu-west-1 t2.nano 0.0063 0.004 0.005 0.004 0.004 0.005 0.004 0.003 0.003
12 eu-central-1 t2.nano 0.0068 0.005 0.006 0.005 0.004 0.005 0.004 0.003 0.003
13 ap-southeast-1 t2.nano 0.0075 0.006 0.007 0.006 0.005 0.006 0.005 0.004 0.004
14 ap-northeast-1 t2.nano 0.008 0.006 0.007 0.006 0.005 0.006 0.005 0.004 0.004
15 ap-northeast-2 t2.nano 0.008 0.006 0.007 0.006 0.005 0.006 0.005 0.004 0.004
16 ap-southeast-2 t2.nano 0.008 0.006 0.007 0.006 0.005 0.006 0.006 0.004 0.005
17 sa-east-1 t2.nano 0.0101 0.006 0.007 0.006 0.005 0.006 0.005 0.004 0.005
18 us-east-1 t2.micro 0.012 0.008 0.009 0.008 0.007 0.008 0.007 0.005 0.005
19 us-west-2 t2.micro 0.012 0.008 0.009 0.008 0.007 0.008 0.007 0.005 0.005

In [56]:
# Persist the pricing table to a CSV file in the current working directory.
df.to_csv(path_or_buf='pricing-data.csv')