In [1]:
from bs4 import BeautifulSoup
import mechanize
import re
import time
import random

In [2]:
# CPCB (Central Pollution Control Board, India) CAAQM "average report" form
# page; every scrape below opens this URL and drives its ASP.NET dropdowns.
url = 'http://www.cpcb.gov.in/CAAQM/frmUserAvgReportCriteria.aspx'

In [3]:
# Seed the PRNG from system entropy/time so the request-sleep jitter
# (randSleepTime below) differs between runs.
random.seed()

In [4]:
def getOptions(param, form):
    """Extract the bracketed selection(s) for ``param`` from a printed form.

    mechanize renders a form's controls as ``name=[value]``; this pulls out
    the text between the brackets for the control named ``param``.

    ``param`` is passed through ``re.escape`` so control names containing
    regex metacharacters (e.g. a literal '.') are matched literally instead
    of being interpreted as a pattern.  The pattern itself is a raw string
    so ``\\[`` is an explicit escape rather than an accidental one.
    """
    return re.findall(r"{}=\[(.+)\]".format(re.escape(param)), str(form))

In [5]:
maxSleepTime = 2.0  # extra jitter span (seconds) added on top of the 1s base


def randSleepTime():
    """Return a uniform random delay in [1.0, 1.0 + maxSleepTime) seconds.

    Used between form postbacks to avoid hammering the server with
    perfectly regular requests.  NOTE(review): despite the name, the
    effective maximum sleep is 1.0 + maxSleepTime, not maxSleepTime.
    """
    jitter = random.random() * maxSleepTime
    return jitter + 1.0

In [9]:
def getMap(soup, ty):
    """Build a {value: label} dict from the <select> element named ``ty``.

    Falls back to matching on the element's DOM id when no <select> with
    that name exists.  Returns an empty dict when neither is found.

    Example:
        soup = BeautifulSoup(html)
        getMap(soup, "ddlState")
    """
    matches = soup.findAll('select', attrs={'name': ty})
    if not matches:
        # Some dropdowns on the page are addressable only by id.
        matches = soup.findAll('select', attrs={'id': ty})
    if not matches:
        return {}
    return dict((opt['value'], opt.text) for opt in matches[0].findAll('option'))

In [10]:
def getStatesMap():
    """Fetch the report page and persist the state dropdown to state-ids.txt.

    Writes one ``stateId,state`` CSV row per <option> of the ddlState
    dropdown and returns the {stateId: stateName} mapping.
    """
    br = mechanize.Browser()
    br.addheaders = [('user-agent', 
    '   Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3'),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]

    br.set_handle_robots(False)  # the form page is disallowed by robots.txt
    res = br.open(url)

    html = res.read()
    soup = BeautifulSoup(html, 'html.parser')
    states = getMap(soup, "ddlState")
    # 'with' guarantees the file is closed even if a write raises;
    # items() (not iteritems()) also keeps this working on Python 3.
    with open('state-ids.txt', 'w') as out:
        out.write("stateId,state\n")
        for stateId, state in states.items():
            out.write("{},{}\n".format(stateId, state))
    br.close()  # release the connection, as the other fetchers do
    return states

def getCitiesMap(stateId, stateName):
    """Select ``stateId`` on the form and append that state's cities to
    city-ids.txt (``stateId,state,cityId,city`` rows, append mode).

    Returns the {cityId: cityName} mapping scraped from ddlCity.
    """
    br = mechanize.Browser()
    br.addheaders = [('user-agent', 
    '   Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3'),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]

    br.set_handle_robots(False)
    res = br.open(url)
    forms = mechanize.ParseResponse(res, backwards_compat=False)
    form = forms[0]

    form["ddlState"] = [stateId,]
    # BUG FIX: the original called undefined f(br, form), which raised
    # NameError at runtime (and 'f' is reused below as a file handle).
    # updateForm is the postback helper used by getStationsMap.
    form, html = updateForm(br, form)

    soup = BeautifulSoup(html, 'html.parser')
    cities = getMap(soup, "ddlCity")
    # 'with' closes the file even on a write error; items() works on
    # both Python 2 and Python 3.
    with open('city-ids.txt', 'a') as out:
        for cityId, city in cities.items():
            out.write("{},{},{},{}\n".format(stateId, stateName, cityId, city))
    br.close()
    return cities

def getStationsMap(stateId, stateName, cityId, cityName):
    """Drill down state -> city on the form and append that city's
    monitoring stations to station-ids.txt
    (``stateId,state,cityId,city,stationId,station`` rows, append mode).

    Returns the {stationId: stationName} mapping scraped from ddlStation.
    """
    br = mechanize.Browser()
    br.addheaders = [('user-agent', 
    '   Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3'),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]

    br.set_handle_robots(False)
    res = br.open(url)
    forms = mechanize.ParseResponse(res, backwards_compat=False)
    form = forms[0]

    # Each dropdown selection triggers a server postback that repopulates
    # the next dropdown, so the form is resubmitted once per level.
    form["ddlState"] = [stateId,]
    form, html = updateForm(br, form)

    form["ddlCity"] = [cityId,]
    form, html = updateForm(br, form)

    soup = BeautifulSoup(html, 'html.parser')
    stations = getMap(soup, "ddlStation")

    # 'with' closes the file even on a write error; items() works on
    # both Python 2 and Python 3.
    with open('station-ids.txt', 'a') as out:
        for stationId, stationName in stations.items():
            out.write("{},{},{},{},{},{}\n".format(stateId, stateName, cityId, cityName, stationId, stationName))
    br.close()
    return stations

def updateForm(br, form):
    """Submit the form's current selections and re-parse the response.

    Sleeps a random interval first (politeness/jitter), posts the form via
    the browser, and returns ``(form, html)`` where ``form`` is the first
    form of the refreshed page and ``html`` is the raw response body.
    """
    time.sleep(randSleepTime())
    response = br.open(form.click())
    page_html = response.get_data()
    refreshed_forms = mechanize.ParseResponse(response, backwards_compat=False)
    return refreshed_forms[0], page_html

In [7]:
# Load the city list scraped by getCitiesMap.  Fixes vs. the original:
#  - 'with' closes the file promptly instead of leaking the handle;
#  - explicit loops avoid Python 3's lazy map() (map(...)[1:] is not
#    subscriptable there);
#  - rstrip('\n') instead of [:-1]: the final line may lack a trailing
#    newline, in which case [:-1] chops a real character off the city name;
#  - short/blank lines are skipped instead of raising IndexError.
with open('city-ids.txt', 'r') as cityFile:
    cityLines = cityFile.readlines()

cities = []
for line in cityLines[1:]:  # skip the "stateId,state,cityId,city" header
    fields = line.split(",")
    if len(fields) < 4:
        continue  # malformed or blank line
    row = [fields[0], fields[1], fields[2], fields[3].rstrip('\n')]
    if row[2].isdigit():  # keep only rows with a numeric cityId
        cities.append(row)

In [11]:
# Write the station CSV header once (truncating any previous run), then
# append one batch of station rows per (state, city) pair via getStationsMap.
# 'with' ensures the header file is closed before the appends begin.
with open('station-ids.txt', 'w') as stationFile:
    stationFile.write("stateId,state,cityId,city,stationId,station\n")
for elem in cities:
    stateId, stateName, cityId, cityName = elem
    getStationsMap(stateId, stateName, cityId, cityName)

In [ ]: