In [1]:
import requests
res = requests.get('https://www.python.org/')
#print res.text
print res.status_code
print res.headers['content-type']
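requests can also raise an exception on HTTP error codes instead of the manual status_code check above; a minimal sketch re-fetching the same page:
In [ ]:
import requests
res = requests.get('https://www.python.org/')
# raise_for_status() raises requests.HTTPError for 4xx/5xx responses
res.raise_for_status()
print res.headers['content-type']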
In [2]:
import requests
payload = {
'StartStation':'977abb69-413a-4ccf-a109-0272c24fd490',
'EndStation':'60831846-f0e4-47f6-9b5b-46323ebdcef7',
'SearchDate':'2014/06/18',
'SearchTime':'10:30',
'SearchWay':'DepartureInMandarin'
}
res = requests.post('http://www.thsrc.com.tw/tw/TimeTable/SearchResult', data=payload)
print res.status_code
#print res.text
In [4]:
from bs4 import BeautifulSoup
html_sample = ' \
<html> \
<body> \
<h1 id="title">Hello World</h1> \
<a href="#" class="link">This is link1</a> \
<a href="# link2" class="link">This is link2</a> \
</body> \
</html>'
soup = BeautifulSoup(html_sample)
print soup.text
In [8]:
soup = BeautifulSoup(html_sample)
alink = soup.find('a')
print alink['href']
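If a tag might lack the attribute, Tag.get returns None instead of raising; a small sketch on the same soup:
In [ ]:
# a sketch: dictionary-style access (alink['href']) raises KeyError when the
# attribute is missing, while get() simply returns None
alink = soup.find('a')
print alink.get('href')
print alink.get('id')   # this <a> tag has no id attribute, so this prints None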
In [10]:
for alink in soup.findAll('a'):
    print alink
In [11]:
alink = soup.find('h1', {'id':'title'})
print alink
In [7]:
soup = BeautifulSoup(html_sample)
for link in soup.findAll('a', {'class': 'link'}):
    print link
In [24]:
alinks = soup.findAll('a', {'href': True})
#for link in alinks:
#    print link['href']
a = soup.select("a")
#print a[1].text
for i in a:
    print i.text
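The class filter used with findAll above can also be written as a CSS selector; a sketch on the same html_sample:
In [ ]:
# a sketch: 'a.link' selects <a> tags whose class is "link",
# matching findAll('a', {'class': 'link'}) from the earlier cell
for link in soup.select('a.link'):
    print link['href'], link.text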
In [4]:
# -*- coding: utf-8 -*-
import requests
payload = { 'method':'search', 'searchMethod':'true', 'searchTarget':'ATM',
'orgName':'', 'orgId':'', 'hid_1':'1',
'tenderName':'', 'tenderId':'', 'tenderStatus':'4,5,21,29',
'tenderWay':'', 'awardAnnounceStartDate':'103/04/29',
'awardAnnounceEndDate':'103/04/29', 'radProctrgCate':'3',
'proctrgCate':'3', 'tenderRange':'', 'minBudget':'',
'maxBudget':'', 'item':'','hid_2':'1',
'gottenVendorName':'', 'gottenVendorId':'', 'hid_3':'1',
'submitVendorName':'', 'submitVendorId':'', 'location':'',
'priorityCate':'', 'isReConstruct':'', 'btnQuery':'查詢' }
In [6]:
user_post = requests.post("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", data=payload)
response_text = user_post.text.encode('utf8')
print user_post
In [26]:
rs = requests.session()
rs_post = rs.post("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", data=payload)
rs_get = rs.get("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=2")
response = rs_get.text.encode('utf8')
#print response
In [28]:
from bs4 import BeautifulSoup
user_post = rs.post("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", data=payload)
response_text = user_post.text.encode('utf8')
soup = BeautifulSoup(response_text)
rec_number_element = soup.select('.T11b')[0]
#print int(rec_number_element.text)
rec_number = int(rec_number_element.text)
print rec_number
In [13]:
from math import ceil
#int(ceil(float(rec_number) / 100))
page_number = int(ceil(float(rec_number) / 100))
print page_number
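The same ceiling can be computed with integer arithmetic, assuming (as the code above does) 100 records per result page; a sketch:
In [ ]:
# a sketch: integer ceiling division without going through float/ceil
page_number = (rec_number + 100 - 1) // 100
print page_number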
In [10]:
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex={0}"
for page in range(1, 10 + 1):
    print page_format.format(page)
In [20]:
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d"
for page in range(1, 5 + 1):
    print page_format%(page)
In [29]:
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d"
for page in range(1, page_number + 1):
    bid_list = rs.get(page_format%(page))
    bid_response = bid_list.text.encode('utf8')
    #print bid_response
In [32]:
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d"
page = 1
bid_list = rs.get(page_format%(page))
bid_response = bid_list.text.encode('utf8')
bid_soup = BeautifulSoup(bid_response)
#print bid_response
bid_table = bid_soup.select('#print_area')[0]
#print bid_table
In [33]:
bid_rows = bid_table.select('tr')[1:]
#print bid_rows
In [34]:
bid_rows = bid_table.select('tr')[1:-1]
#print bid_rows
In [35]:
ary = [1,2,3,4,5,6]
print ary[1:-1]
#[2,3,4,5]
In [36]:
bid_rows = bid_table.select('tr')[1:-1]
for bid_row in bid_rows:
    links = [tag['href'] for tag in bid_row.select('a')][0]
    #print links
In [37]:
for bid_row in bid_rows:
    links = [tag['href']
             for tag in bid_row.select('a')][0]
    #print links
In [40]:
for bid_row in bid_rows:
    link = [tag['href']
            for tag in bid_row.findAll('a', {'href': True})][0]
    link_href = "http://web.pcc.gov.tw/tps" + link[2:]
    #print link_href
In [42]:
import urlparse
for bid_row in bid_rows:
    link = [tag['href']
            for tag in bid_row.findAll('a', {'href': True})][0]
    link_href = urlparse.urljoin("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", link)
    #print link_href
In [29]:
f = open("test.txt", 'w')
f.write("Hello World\n")
f.close()
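A with-block closes the file automatically, even if an exception is raised before close(); a sketch equivalent to the cell above:
In [ ]:
# a sketch: the context manager calls f.close() when the block exits
with open("test.txt", 'w') as f:
    f.write("Hello World\n")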
In [43]:
bid_file = open("bid_list.txt", 'w')
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d"
for page in range(1, page_number + 1):
    bid_list = rs.get(page_format%(page))
    bid_response = bid_list.text.encode('utf8')
    bid_soup = BeautifulSoup(bid_response)
    bid_table = bid_soup.select("#print_area")[0]
    bid_rows = bid_table.select('tr')[1:-1]
    for bid_row in bid_rows:
        link = [tag['href'] for tag in bid_row.select('a')][0]
        link_href = urlparse.urljoin("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", link)
        bid_file.write(link_href + "\n")
bid_file.close()
In [32]:
# -*- coding: utf-8 -*-
import requests
import urlparse
from bs4 import BeautifulSoup
from math import ceil
In [36]:
payload = { 'method':'search', 'searchMethod':'true', 'searchTarget':'ATM',
'orgName':'', 'orgId':'', 'hid_1':'1',
'tenderName':'', 'tenderId':'', 'tenderStatus':'4,5,21,29',
'tenderWay':'', 'awardAnnounceStartDate':'103/04/29',
'awardAnnounceEndDate':'103/04/29', 'radProctrgCate':'3',
'proctrgCate':'3', 'tenderRange':'', 'minBudget':'',
'maxBudget':'', 'item':'','hid_2':'1',
'gottenVendorName':'', 'gottenVendorId':'', 'hid_3':'1',
'submitVendorName':'', 'submitVendorId':'', 'location':'',
'priorityCate':'', 'isReConstruct':'', 'btnQuery':'查詢' }
rs = requests.session()
user_post = rs.post("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", data=payload)
response_text = user_post.text.encode('utf8')
In [37]:
soup = BeautifulSoup(response_text)
rec_number_element = soup.select(".T11b")[0]
rec_number = int(rec_number_element.text)
page_number = int(ceil(float(rec_number) / 100))
In [44]:
bid_file = open("bid_list.txt", 'w')
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d"
for page in range(1, page_number + 1):
    bid_list = rs.get(page_format%(page))
    bid_response = bid_list.text.encode('utf8')
    bid_soup = BeautifulSoup(bid_response)
    bid_table = bid_soup.select("#print_area")[0]
    bid_rows = bid_table.select('tr')[1:-1]
    for bid_row in bid_rows:
        link = [tag['href'] for tag in bid_row.select('a')][0]
        link_href = urlparse.urljoin("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", link)
        bid_file.write(link_href + "\n")
bid_file.close()
In [46]:
line_num = 0
f = open("bid_list.txt", "r")
for line in f.readlines():
    line_num = line_num + 1
print line_num
f.close()
In [47]:
soup = BeautifulSoup(response_text)
rec_number_element = soup.select(".T11b")[0]
rec_number = int(rec_number_element.text)
line_num = 0
f = open("bid_list.txt", "r")
for line in f.readlines():
    line_num = line_num + 1
print line_num
f.close()
# compare the count on the web page with the number of lines in the file
if rec_number == line_num:
    print "record number on web page is equal to number of lines in file"
In [49]:
request_get = requests.get("http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328")
response = request_get.text.encode('utf8')
#print response
In [50]:
request_get = requests.get("http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328")
response = request_get.text.encode('utf8')
soup = BeautifulSoup(response)
printarea = soup.select('#printArea')[0]
#print printarea
In [51]:
bid_detail = open("bid_detail.txt", 'w')
request_get = requests.get("http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328")
response = request_get.text.encode('utf8')
soup = BeautifulSoup(response)
printarea = soup.select('#printArea')[0]
# printarea is a Tag object, not a string, so this write() raises a TypeError;
# the next cell converts it with prettify() before writing
bid_detail.write(printarea)
bid_detail.close()
In [52]:
bid_detail = open("bid_detail.txt", 'w')
request_get = requests.get("http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328")
response = request_get.text.encode('utf8')
soup = BeautifulSoup(response)
printarea = soup.select('#printArea')[0]
bid_detail.write(printarea.prettify("utf-8"))
bid_detail.close()
In [67]:
case_dic = {}
f = open("bid_list.txt", "r")
for line in f.readlines():
    rec = line.strip()
    caseno = rec.split('tenderCaseNo=')[1]
    if caseno not in case_dic:
        case_dic[caseno] = 1
    else:
        print caseno
f.close()
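A set expresses the same duplicate check a bit more directly than a dict whose values are never used; a sketch over the same file:
In [ ]:
# a sketch: seen plays the role of case_dic above, without dummy values
seen = set()
f = open("bid_list.txt", "r")
for line in f.readlines():
    caseno = line.strip().split('tenderCaseNo=')[1]
    if caseno in seen:
        print caseno    # duplicate case number
    seen.add(caseno)
f.close()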
In [53]:
link = "http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328"
In [54]:
rear_substring = link.split("pkAtmMain=")[1]
param = rear_substring.split("&tenderCaseNo=")
pkAtmMain = param[0]
caseno = param[1]
print pkAtmMain, caseno
In [55]:
import re
m = re.match(r"(?P<FirstName>\w+) (?P<LastName>\w+)", "David Chiu")
print m.group("FirstName"), m.group("LastName")
In [56]:
import re
link = "http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328"
m = re.match(r"([^ ]+)pkAtmMain=(?P<pkAtmMain>.+)&tenderCaseNo=(?P<tenderCaseNo>.+)", link)
print m.group('pkAtmMain'), m.group('tenderCaseNo')
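The two query parameters can also be pulled out with urlparse instead of a hand-written regular expression; a sketch on the same link:
In [ ]:
# a sketch: parse_qs maps each query-string key to a list of values
import urlparse
params = urlparse.parse_qs(urlparse.urlparse(link).query)
print params['pkAtmMain'][0], params['tenderCaseNo'][0]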
In [58]:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re, requests
bid_list = open('bid_list.txt', 'r')
for line in bid_list.readlines():
    pagelink = line.strip()
    m = re.match(r"([^ ]+)pkAtmMain=(?P<pkAtmMain>.+)&tenderCaseNo=(?P<tenderCaseNo>.+)", pagelink)
    filename = "%s_%s"%(m.group('pkAtmMain'), m.group('tenderCaseNo'))
    request_get = requests.get(pagelink)
    response = request_get.text.encode('utf8')
    soup = BeautifulSoup(response)
    printarea = soup.select('#printArea')[0]
    bid_detail = open("gov/%s.txt"%(filename), 'w')
    bid_detail.write(printarea.prettify("utf-8"))
    bid_detail.close()
bid_list.close()
In [ ]:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re, requests, time
import os
# create the output directory for the detail pages if it does not exist yet
if not os.path.exists('gov'):
    os.mkdir('gov')
bid_list = open('bid_list.txt', 'r')
for line in bid_list.readlines():
    pagelink = line.strip()
    m = re.match(r"([^ ]+)pkAtmMain=(?P<pkAtmMain>.+)&tenderCaseNo=(?P<tenderCaseNo>.+)", pagelink)
    filename = "%s_%s"%(m.group('pkAtmMain'), m.group('tenderCaseNo'))
    request_get = requests.get(pagelink)
    response = request_get.text.encode('utf8')
    soup = BeautifulSoup(response)
    printarea = soup.select('#printArea')[0]
    bid_detail = open("gov/%s.txt"%(filename), 'w')
    bid_detail.write(printarea.prettify("utf-8"))
    # pause between requests so the crawler does not hammer the server
    time.sleep(3)
    bid_detail.close()
bid_list.close()