Make Use of Beautiful Soup to automate fetching data

  • Historic Annual PM2.5, PM10 Plot
    • INDIA - National
    • Delhi NCR
    • Delhi Location wise
    • Gurgaon

In [10]:
from bs4 import BeautifulSoup
import mechanize
import re
import time
import random

In [11]:
url = 'http://www.cpcb.gov.in/CAAQM/frmUserAvgReportCriteria.aspx'

In [12]:
random.seed()

In [13]:
def getOptions(param, form):
    return re.findall("{}=\[(.+)\]".format(param), str(form))

def getStations(ddlState, ddlCity):
    br = mechanize.Browser()
    br.set_handle_robots(False)
    res = br.open(url)
    forms = mechanize.ParseResponse(res, backwards_compat=False)
    form = forms[0]
    # State
    form["ddlState"] = ["{}".format(ddlState),]
    form, html = updateForm(br, form)
    # City
    form["ddlCity"] = ["{}".format(ddlCity),]
    form, html = updateForm(br, form)
    
    options = getOptions("ddlStation", form)
    options = options[0].split(',')[1:]
    options = map(lambda x: x.strip(), options)
    stations = options
    return stations

In [14]:
#     Example Usage: 
#     soup = BeautifulSoup(html)
#     getMap(soup, "ddlState")
def getMap(soup, ty):
    select_node = soup.findAll('select', attrs={'name': ty})
    if not select_node:
        select_node = soup.findAll('select', attrs={'id': ty})
#     print select_node
    option_map = {}
    if select_node:
        for option in select_node[0].findAll('option'):
            option_map[option['value']] = option.text
    return option_map

In [15]:
maxSleepTime = 2.0
def randSleepTime():
    return (1.0 + random.random() * maxSleepTime)

In [34]:
def getPM25Id(html):
    return re.findall("<option value=(.+)>(.*)PM2\.5(.*)</option>", str(html))

def getData(state, city):
    # Delhi, Delhi
#     state, city = "6", "85"
    stations = getStations(state, city)
    soups = []
    #
    random.shuffle(stations)
    for station in stations:
        try:
            br = mechanize.Browser()
            br.addheaders = [('user-agent', 
            '   Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3'),
            ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]

            br.set_handle_robots(False)
            res = br.open(url)

            forms = mechanize.ParseResponse(res, backwards_compat=False)
            form = forms[0]

            form["ddlState"] = [state,]
            form, html = updateForm(br, form)

            form["ddlCity"] = [city,]
            form, html = updateForm(br, form)

            form["ddlStation"] = ["{}".format(station),]
            form, html = updateForm(br, form)

    #         print html
            parsed_pm25 = getPM25Id(html)
            if parsed_pm25:
                PM25Id = getPM25Id(html)[0][0][1:-1]
                print "PM2.5 Id =", PM25Id
            else:
                print "No PM2.5 for {}, {}, {}".format(state, city, station)
                continue

            form["lstBoxChannelLeft"] = ["{}".format(PM25Id),]
            form, html = updateForm(br, form)

            form["ddlCriteria"] = ["0",]
            form, html = updateForm(br, form)

            form["txtYear"] = "2000"
            form["txtYearTo"] = "2017"
            form, html = updateForm(br, form)

            form["btnSubmit"] = "True"
            form, html = updateForm(br, form)

            br.select_form("form1")

            time.sleep(randSleepTime())
            res = br.submit(name='btnSubmit')
            html = res.read()
            soup = BeautifulSoup(html, 'html.parser')
            soups.append([(state, city, station), soup])
    #         if len(soups) >= 2:
    #             break
            time.sleep(randSleepTime())
            br.close()
            print "Finished Crawling for {}, {}, {}".format(state, city, station)

        except:
            print "Exception raised for {}, {}, {}".format(state, city, station)
            time.sleep(5)
            br.close()
            continue
            
    return soups

In [35]:
def updateForm(br, form):
    time.sleep(randSleepTime())
    res = br.open(form.click())
    html = res.get_data()
    forms = mechanize.ParseResponse(res, backwards_compat=False)
    form = forms[0]
    return form, html

In [36]:
def getValsHtml(table):
    data = []
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele]) # Get rid of empty values
    vals = re.findall("\d+\.\d+", data[0][0])[:-1]
    data = []
    for val in vals:
        yr = val[:4]
        data.append([yr, val[4:]])
    return data

def cleanupData(data):
    clean_data = []
    for elem in data:
        if elem[0] in map(str, range(2000, 2018)):
            clean_data.append(elem)
    return clean_data

In [37]:
f = open('city-ids.txt', 'r')
cities = f.readlines()
cities = map(lambda elem: elem.split(","), cities)[1:]
cities = map(lambda elem: [elem[0], elem[1], elem[2], elem[3][:-1]], cities)
cities = filter(lambda elem: elem[2].isdigit(), cities)

In [38]:
for elem in cities:
    stateId, stateName, cityId, cityName = elem
    f = open('data/{}_{}.txt'.format(stateName, cityName), 'w')
    try:
        soups = getData(stateId, cityId)
        for s in soups:
            state, city, station = s[0]
            f.write("state:{},city:{},station:{}\n".format(state, city, station))
            html_data = s[1].find(id="gvReportStation")
            if html_data:
                data = getValsHtml(html_data)
                data = cleanupData(data)
                for elem in data:
                    f.write(','.join(str(x) for x in elem) + "\n")
            else:
                f.write("None\n")

            f.write("\n")
        f.close()
        print "---"
        print "Finished Crawling city {}, {}".format(stateName, cityName)
        time.sleep(7) # Sleep for 7 seconds before continuing for next city
    except:
        print "Error occurred at {}, {}, {}, {}".format(stateId, stateName, cityId, cityName)
        break


PM2.5 Id = 1373
Finished Crawling for 25, 546, 8
PM2.5 Id = 1374
Finished Crawling for 25, 546, 9
PM2.5 Id = 1375
Finished Crawling for 25, 546, 33
---
Finished Crawling city Tamil Nadu, Chennai
No PM2.5 for 22, 194, 103
---
Finished Crawling city Punjab, Ludhiana
PM2.5 Id = 1886
Finished Crawling for 22, 188, 102
---
Finished Crawling city Punjab, Amritsar
PM2.5 Id = 1893
Finished Crawling for 22, 553, 106
---
Finished Crawling city Punjab, Mandi Gobindgarh
PM2.5 Id = 1536
Finished Crawling for 23, 212, 85
---
Finished Crawling city Rajasthan, Jodhpur
PM2.5 Id = 1500
Finished Crawling for 23, 223, 83
---
Finished Crawling city Rajasthan, Jaipur
PM2.5 Id = 1632
Finished Crawling for 28, 278, 38
---
Finished Crawling city Uttar Pradesh, Kanpur
PM2.5 Id = 1383
Finished Crawling for 28, 253, 39
PM2.5 Id = 1086
Finished Crawling for 28, 253, 21
---
Finished Crawling city Uttar Pradesh, Agra
PM2.5 Id = 1379
Finished Crawling for 28, 270, 40
---
Finished Crawling city Uttar Pradesh, Varanasi
PM2.5 Id = 1369
Finished Crawling for 28, 256, 12
PM2.5 Id = 1368
Finished Crawling for 28, 256, 11
PM2.5 Id = 1367
Finished Crawling for 28, 256, 10
---
Finished Crawling city Uttar Pradesh, Lucknow
No PM2.5 for 29, 300, 35
PM2.5 Id = 1599
Finished Crawling for 29, 300, 31
PM2.5 Id = 1638
Finished Crawling for 29, 300, 92
---
Finished Crawling city West Bengal, Kolkata
PM2.5 Id = 1734
Finished Crawling for 29, 548, 88
---
Finished Crawling city West Bengal, Howrah
PM2.5 Id = 1590
Finished Crawling for 29, 549, 82
---
Finished Crawling city West Bengal, Haldia
No PM2.5 for 29, 552, 98
---
Finished Crawling city West Bengal, Durgapur
PM2.5 Id = 1702
Finished Crawling for 1, 9, 95
---
Finished Crawling city Andhra Pradesh, Visakhapatnam
PM2.5 Id = 1721
Finished Crawling for 1, 21, 96
---
Finished Crawling city Andhra Pradesh, Tirupati
PM2.5 Id = 1465
Finished Crawling for 4, 54, 81
---
Finished Crawling city Bihar, Muzaffarpur
PM2.5 Id = 1610
Finished Crawling for 4, 75, 90
---
Finished Crawling city Bihar, Gaya
PM2.5 Id = 1334
Finished Crawling for 4, 70, 65
---
Finished Crawling city Bihar, Patna
PM2.5 Id = 1378
Finished Crawling for 6, 85, 5
PM2.5 Id = 508
Finished Crawling for 6, 85, 3
PM2.5 Id = 7
Finished Crawling for 6, 85, 47
PM2.5 Id = 509
Finished Crawling for 6, 85, 4
PM2.5 Id = 1376
Finished Crawling for 6, 85, 7
PM2.5 Id = 513
Finished Crawling for 6, 85, 45
PM2.5 Id = 507
Finished Crawling for 6, 85, 2
No PM2.5 for 6, 85, 51
No PM2.5 for 6, 85, 55
PM2.5 Id = 514
Finished Crawling for 6, 85, 46
PM2.5 Id = 1377
Finished Crawling for 6, 85, 6
PM2.5 Id = 506
Finished Crawling for 6, 85, 1
PM2.5 Id = 1290
Finished Crawling for 6, 85, 52
PM2.5 Id = 1316
Exception raised for 6, 85, 53
No PM2.5 for 6, 85, 58
PM2.5 Id = 1341
Finished Crawling for 6, 85, 54
---
Finished Crawling city Delhi, Delhi
PM2.5 Id = 1589
Finished Crawling for 9, 364, 86
---
Finished Crawling city Haryana, Gurgaon
PM2.5 Id = 1384
Finished Crawling for 9, 365, 34
---
Finished Crawling city Haryana, Faridabad
PM2.5 Id = 1588
Finished Crawling for 9, 348, 84
---
Finished Crawling city Haryana, Panchkula
PM2.5 Id = 1629
Finished Crawling for 9, 360, 91
---
Finished Crawling city Haryana, Rohtak
PM2.5 Id = 1187
Finished Crawling for 8, 337, 41
---
Finished Crawling city Gujarat, Ahmedabad
PM2.5 Id = 1372
Finished Crawling for 13, 136, 16
No PM2.5 for 13, 136, 24
No PM2.5 for 13, 136, 25
PM2.5 Id = 1371
Finished Crawling for 13, 136, 15
PM2.5 Id = 1370
Finished Crawling for 13, 136, 14
---
Finished Crawling city Karnataka, Bengaluru
PM2.5 Id = 1751
Finished Crawling for 16, 308, 97
---
Finished Crawling city Maharashtra, Aurangabad
PM2.5 Id = 1779
Finished Crawling for 16, 309, 99
---
Finished Crawling city Maharashtra, Thane
PM2.5 Id = 1401
Finished Crawling for 16, 312, 30
---
Finished Crawling city Maharashtra, Pune
PM2.5 Id = 999
Finished Crawling for 16, 310, 29
PM2.5 Id = 1391
Finished Crawling for 16, 310, 27
PM2.5 Id = 1357
Finished Crawling for 16, 310, 28
---
Finished Crawling city Maharashtra, Mumbai
PM2.5 Id = 1566
Finished Crawling for 16, 314, 87
---
Finished Crawling city Maharashtra, Solapur
PM2.5 Id = 1676
Finished Crawling for 16, 307, 94
---
Finished Crawling city Maharashtra, Nashik
PM2.5 Id = 1654
Finished Crawling for 16, 327, 93
---
Finished Crawling city Maharashtra, Nagpur
PM2.5 Id = 1796
Finished Crawling for 16, 329, 100
PM2.5 Id = 1427
Finished Crawling for 16, 329, 79
---
Finished Crawling city Maharashtra, Chandrapur
PM2.5 Id = 1813
Finished Crawling for 30, 7, 101
PM2.5 Id = 1380
Finished Crawling for 30, 7, 36
PM2.5 Id = 1443
Finished Crawling for 30, 7, 80
PM2.5 Id = 1844
Finished Crawling for 30, 7, 104
PM2.5 Id = 1863
Finished Crawling for 30, 7, 105
---
Finished Crawling city Telangana, Hyderabad

In [32]:
states = getStatesMap()

In [105]:
soups = getData()


PM2.5 Id = 1341
PM2.5 Id = 508
PM2.5 Id = 1378
PM2.5 Id = 1290
PM2.5 Id = 1377
PM2.5 Id = 7
PM2.5 Id = 513
PM2.5 Id = 1376
PM2.5 Id = 507
PM2.5 Id = 514
No PM2.5 for 6, 85, 58
PM2.5 Id = 1316
Exception raised for 6, 85, 53
No PM2.5 for 6, 85, 51
PM2.5 Id = 509
PM2.5 Id = 506
No PM2.5 for 6, 85, 55

In [107]:
f = open('delhi-pm25-data.txt', 'w')
for s in soups:
    state, city, station = s[0]
    f.write("state:{},city:{},station:{}\n".format(state, city, station))
    html_data = s[1].find(id="gvReportStation")
    if html_data:
        data = getValsHtml(html_data)
        data = cleanupData(data)
        for elem in data:
            f.write(','.join(str(x) for x in elem) + "\n")
    else:
        f.write("None\n")
    
    f.write("\n")
f.close()

In [79]:
for soup in soups:
    table = soup[1].find(id="gvReportStation")
    if table:
        data = getValsHtml(table)
        data = cleanupData(data)
        print data
    else:
        print "No Data available for this station"


No Data available for this station
[[u'2016', u'216.32'], [u'2017', u'123.35']]

In [10]:
print getStations("6", "85")


['1', '2', '3', '4', '5', '6', '7', '45', '46', '47', '51', '52', '53', '54', '55', '58']