In [1]:
from bs4 import BeautifulSoup
import mechanize
import re
import time
import random
In [2]:
# Entry page for CPCB CAAQM averaged-report queries; every scraper below
# starts by opening this URL.
url = 'http://www.cpcb.gov.in/CAAQM/frmUserAvgReportCriteria.aspx'
In [3]:
# Seed the PRNG from a system source (no explicit seed) so the
# request-throttling sleeps vary between runs.
random.seed()
In [4]:
def getOptions(param, form):
    """Extract the bracketed option text rendered for a form control.

    mechanize's form repr prints controls as e.g. ``ddlState=[*--Select--]``;
    this returns the list of strings found between the brackets for the
    control named ``param`` (empty list if the control is absent).
    """
    # re.escape guards against regex metacharacters in the control name;
    # the non-greedy (.+?) stops at the FIRST closing bracket so that a
    # line holding several controls does not get swallowed into one match.
    pattern = r"{}=\[(.+?)\]".format(re.escape(param))
    return re.findall(pattern, str(form))
In [5]:
# Upper bound (seconds) on the random jitter added to the base 1 s delay.
maxSleepTime = 2.0

def randSleepTime():
    """Return a random delay in seconds, uniform over [1.0, 1.0 + maxSleepTime)."""
    jitter = random.random() * maxSleepTime
    return 1.0 + jitter
In [9]:
# Example usage:
#   soup = BeautifulSoup(html)
#   getMap(soup, "ddlState")
def getMap(soup, ty):
    """Map option values -> display text for the <select> element named
    (or, failing that, id'd) `ty`.

    Returns an empty dict when no matching <select> is found.
    """
    # Some controls are addressable by name, others only by id; try both.
    matches = soup.findAll('select', attrs={'name': ty})
    if not matches:
        matches = soup.findAll('select', attrs={'id': ty})
    if not matches:
        return {}
    options = matches[0].findAll('option')
    return dict((opt['value'], opt.text) for opt in options)
In [10]:
def getStatesMap():
    """Fetch the CAAQM criteria page and return {stateId: stateName}.

    Side effect: (re)writes 'state-ids.txt' as CSV with a header row.
    """
    br = mechanize.Browser()
    # Spoof a desktop-browser user agent and accept header; the site may
    # serve different content to unidentified clients.
    br.addheaders = [('user-agent',
                      ' Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3'),
                     ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]
    br.set_handle_robots(False)  # ignore robots.txt, matching the other fetchers
    res = br.open(url)
    html = res.read()
    soup = BeautifulSoup(html, 'html.parser')
    states = getMap(soup, "ddlState")
    # `with` closes the file even if a write fails; .items() replaces the
    # Python-2-only .iteritems() without changing behavior on Python 2.
    with open('state-ids.txt', 'w') as out:
        out.write("stateId,state\n")
        for stateId, state in states.items():
            out.write("{},{}\n".format(stateId, state))
    br.close()  # release the browser connection, as the sibling fetchers do
    return states
def getCitiesMap(stateId, stateName):
    """Select `stateId` in the report form and return {cityId: cityName}.

    Side effect: appends one "stateId,stateName,cityId,cityName" row per
    city to 'city-ids.txt'.
    """
    br = mechanize.Browser()
    br.addheaders = [('user-agent',
                      ' Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3'),
                     ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]
    br.set_handle_robots(False)
    res = br.open(url)
    forms = mechanize.ParseResponse(res, backwards_compat=False)
    form = forms[0]
    form["ddlState"] = [stateId,]
    # BUG FIX: the original called the undefined name `f(br, form)`; the
    # submit-and-reparse helper is updateForm (as used in getStationsMap).
    form, html = updateForm(br, form)
    soup = BeautifulSoup(html, 'html.parser')
    cities = getMap(soup, "ddlCity")
    # `with` closes the file even on error (the original also shadowed the
    # name `f` with this handle); .items() replaces Python-2-only .iteritems().
    with open('city-ids.txt', 'a') as out:
        for cityId, city in cities.items():
            out.write("{},{},{},{}\n".format(stateId, stateName, cityId, city))
    br.close()
    return cities
def getStationsMap(stateId, stateName, cityId, cityName):
    """Select state then city in the report form and return
    {stationId: stationName}.

    Side effect: appends one
    "stateId,state,cityId,city,stationId,station" row per station to
    'station-ids.txt'.
    """
    br = mechanize.Browser()
    br.addheaders = [('user-agent',
                      ' Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3'),
                     ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')]
    br.set_handle_robots(False)
    res = br.open(url)
    forms = mechanize.ParseResponse(res, backwards_compat=False)
    form = forms[0]
    # The city dropdown is populated server-side only after the state is
    # submitted, so two round trips are required.
    form["ddlState"] = [stateId,]
    form, html = updateForm(br, form)
    form["ddlCity"] = [cityId,]
    form, html = updateForm(br, form)
    soup = BeautifulSoup(html, 'html.parser')
    stations = getMap(soup, "ddlStation")
    # `with` closes the file even on error; .items() replaces Python-2-only
    # .iteritems() without changing behavior on Python 2.
    with open('station-ids.txt', 'a') as out:
        for stationId, stationName in stations.items():
            out.write("{},{},{},{},{},{}\n".format(stateId, stateName, cityId, cityName, stationId, stationName))
    br.close()
    return stations
def updateForm(br, form):
    """Submit `form` through browser `br` and re-parse the response.

    Sleeps a random interval first to throttle requests to the server.
    Returns (first form on the resulting page, raw response HTML).
    """
    time.sleep(randSleepTime())
    response = br.open(form.click())
    page = response.get_data()
    refreshed_forms = mechanize.ParseResponse(response, backwards_compat=False)
    return refreshed_forms[0], page
In [7]:
# Load previously scraped city ids as [stateId, stateName, cityId, cityName]
# rows: skip the first (header) line, strip the trailing newline from the
# last field, and drop rows whose cityId is not numeric (placeholder
# entries such as "--Select--").
# Rewritten with `with` so the handle is closed (the original leaked it)
# and without Python-2-only `map(...)[...]` subscripting; behavior is
# unchanged.
with open('city-ids.txt', 'r') as f:
    lines = f.readlines()
rows = [line.split(",") for line in lines][1:]
cities = [[row[0], row[1], row[2], row[3][:-1]] for row in rows]
cities = [row for row in cities if row[2].isdigit()]
In [11]:
# Rebuild station-ids.txt: truncate it and write the header once, then let
# getStationsMap append one row per station for every known (state, city)
# pair. `with` guarantees the header file is closed even if the write fails.
with open('station-ids.txt', 'w') as f:
    f.write("stateId,state,cityId,city,stationId,station\n")
for elem in cities:
    stateId, stateName, cityId, cityName = elem
    getStationsMap(stateId, stateName, cityId, cityName)
In [ ]: