Python ETL Data Crawling

使用requests.get


In [1]:
# Simple GET request demo (Python 2 print syntax): fetch python.org
# and show the HTTP status code and the Content-Type response header.
import requests
res = requests.get('https://www.python.org/')
#print res.text
print res.status_code 
print res.headers['content-type']


200
text/html; charset=utf-8

使用requests.post


In [2]:
# POST request demo: query the Taiwan High Speed Rail timetable.
# The station values are the site's internal GUIDs for start/end stations.
import requests
payload = {
'StartStation':'977abb69-413a-4ccf-a109-0272c24fd490',
'EndStation':'60831846-f0e4-47f6-9b5b-46323ebdcef7',
'SearchDate':'2014/06/18',
'SearchTime':'10:30',
'SearchWay':'DepartureInMandarin'
}
res = requests.post('http://www.thsrc.com.tw/tw/TimeTable/SearchResult', data=payload)
print res.status_code
#print res.text


200

將網頁讀進BeautifulSoup 中


In [4]:
# Build a small in-line HTML sample and parse it with BeautifulSoup.
# NOTE(review): no parser argument is passed; newer bs4 versions emit a
# warning and pick a default parser — behavior may differ across installs.
from bs4 import BeautifulSoup 
html_sample = ' \
<html> \
 <body> \
 <h1 id="title">Hello World</h1> \
 <a href="#" class="link">This is link1</a> \
 <a href="# link2" class="link">This is link2</a> \
 </body> \
 </html>'

soup = BeautifulSoup(html_sample)
# .text concatenates the text of all elements in document order.
print soup.text


 Hello World This is link1 This is link2 

使用Find 找出(第一個)含有a tag 的元素


In [8]:
# find() returns only the FIRST matching element; here the first <a> tag.
soup = BeautifulSoup(html_sample) 
alink = soup.find('a') 
# Tag objects support dict-style access to attributes.
print alink['href']


#

找出所有含a tag 的HTML 元素


In [10]:
# findAll() (alias of find_all) returns every matching element, not just the first.
for alink in soup.findAll('a'): 
    print alink


<a class="link" href="#">This is link1</a>
<a class="link" href="# link2">This is link2</a>

使用Find 找出所有id為title的元素


In [11]:
# Second argument of find() filters by attributes — here: id="title".
alink = soup.find('h1', {'id':'title'}) 
print alink


<h1 id="title">Hello World</h1>

取得含有特定class的元素


In [7]:
# Filter elements by CSS class via the attribute dict: all <a class="link"> tags.
soup = BeautifulSoup(html_sample) 
for link in soup.findAll('a', {'class': 'link'}): 
    print link


<a class="link" href="#">This is link1</a>
<a class="link" href="# link2">This is link2</a>

取得所有a tag 內的連結


In [24]:
# Two ways to grab all anchors: findAll with {'href': True} matches only tags
# that actually carry an href attribute; select("a") uses a CSS selector.
# NOTE(review): `alinks` is computed but unused — the loop over it is commented out.
alinks = soup.findAll('a', {'href': True}) 
#for link in alinks: 
#    print link['href']
    
    
a = soup.select("a")
#print a[1].text
for i in a:
    print i.text


This is link1
This is link2

設定POST 的Form Data


In [4]:
# -*- coding: utf-8 -*- 
import requests

# Form data for the Taiwan government procurement (web.pcc.gov.tw) advanced
# search.  Dates use the ROC calendar (103/04/29 = 2014-04-29); 'btnQuery'
# carries the Chinese label of the submit button and must be sent as-is.
payload = { 'method':'search', 'searchMethod':'true', 'searchTarget':'ATM',
 'orgName':'', 'orgId':'', 'hid_1':'1',
 'tenderName':'', 'tenderId':'', 'tenderStatus':'4,5,21,29',
 'tenderWay':'', 'awardAnnounceStartDate':'103/04/29',
 'awardAnnounceEndDate':'103/04/29', 'radProctrgCate':'3', 
 'proctrgCate':'3', 'tenderRange':'', 'minBudget':'',
 'maxBudget':'', 'item':'','hid_2':'1',
 'gottenVendorName':'', 'gottenVendorId':'', 'hid_3':'1',
 'submitVendorName':'', 'submitVendorId':'', 'location':'',
 'priorityCate':'', 'isReConstruct':'', 'btnQuery':'查詢' }

送出Post 取得資料


In [6]:
# Submit the search form; printing the Response object shows its status code.
user_post = requests.post("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", data=payload) 
response_text = user_post.text.encode('utf8') 
print user_post


<Response [200]>

如何延續Session


In [26]:
# A Session keeps cookies across requests, so the GET for page 2 below is
# served in the context of the search submitted by the preceding POST.
rs = requests.session() 
rs_post = rs.post("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", data=payload) 
rs_get = rs.get("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=2") 
response = rs_get.text.encode('utf8')
#print response

取得資料筆數


In [28]:
from bs4 import BeautifulSoup

# Re-submit the search through the session and read the total record count,
# which the result page renders inside the first element with class "T11b".
user_post = rs.post("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", data=payload)
response_text = user_post.text.encode('utf8') 
soup = BeautifulSoup(response_text) 
rec_number_element = soup.select('.T11b')[0]
#print int(rec_number_element.text) 
rec_number = int(rec_number_element.text) 
print rec_number


265

將資料筆數轉換成頁數


In [13]:
from math import ceil 
#int(ceil(float(rec_number) / 100))
# The site shows 100 records per page; round up to get the number of pages.
page_number = int(ceil(float(rec_number) / 100)) 
print page_number


3

字串格式化 - 使用format


In [10]:
# String formatting with str.format: {0} is replaced by the page index.
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex={0}" 
for page in range(1, 10 + 1): 
    print page_format.format(page)


http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=1
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=2
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=3
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=4
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=5
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=6
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=7
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=8
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=9
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=10

使用%


In [20]:
# Same idea with old-style % formatting: %d is replaced by the page index.
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d" 
for page in range(1, 5 + 1): 
    print page_format%(page)


http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=1
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=2
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=3
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=4
http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=5

取得每頁標案清單的內容


In [29]:
# Fetch every result page through the session; page_number comes from the
# record-count cell above.
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d" 
for page in range(1, page_number + 1): 
    bid_list = rs.get(page_format%(page)) 
    bid_response = bid_list.text.encode('utf8')
    #print bid_response

先拿一頁做試驗


In [32]:
# Experiment with a single page first: the result table lives in the element
# with id "print_area".
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d" 
page = 1 
bid_list = rs.get(page_format%(page)) 
bid_response = bid_list.text.encode('utf8') 
bid_soup = BeautifulSoup(bid_response) 
#print bid_response
bid_table = bid_soup.select('#print_area')[0]
#print bid_table

去掉標頭


In [33]:
# [1:] drops the first <tr> (the table header row).
bid_rows = bid_table.select('tr')[1:] 
#print bid_rows

去掉標頭跟頁次


In [34]:
# [1:-1] drops both the header row and the trailing pagination row.
bid_rows = bid_table.select('tr')[1:-1] 
#print bid_rows

索引範例


In [35]:
# Slicing refresher: [1:-1] keeps everything except the first and last items.
ary = [1,2,3,4,5,6] 
print ary[1:-1] 
#[2,3,4,5]


[2, 3, 4, 5]

抓出每一列所有的連結


In [36]:
# For each data row, collect the href of every anchor and keep only the first.
bid_rows = bid_table.select('tr')[1:-1] 
for bid_row in bid_rows: 
    links = [tag['href']for tag in bid_row.select('a')][0] 

    #print links

用[0]取第一個連結


In [37]:
# Same list comprehension as above, spread over two lines; [0] picks the
# first link in the row.
for bid_row in bid_rows: 

    links = [tag['href'] 
      for tag in bid_row.select('a')][0]
 
    #print links

取得實際連結


In [40]:
# Build an absolute URL by hand: the hrefs start with "..", so link[2:]
# strips that relative prefix before gluing the host/path in front.
for bid_row in bid_rows: 
    link = [tag['href'] 
      for tag in bid_row.findAll('a',{'href': True})][0] 

    link_href = "http://web.pcc.gov.tw/tps" + link[2:] 

    #print link_href

使用urljoin


In [42]:
# The cleaner alternative: urlparse.urljoin resolves the relative href
# against the page URL.  (Python 2 module; in Python 3 it is urllib.parse.)
import urlparse
for bid_row in bid_rows: 
    link = [tag['href'] 
      for tag in bid_row.findAll('a',{'href': True})][0] 

    link_href = urlparse.urljoin("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", link)

    #print link_href

複習一下如何使用Python做檔案寫入


In [29]:
# File-writing refresher.  Using a `with` block guarantees the handle is
# closed even if the write raises, instead of relying on a manual close().
with open("test.txt", 'w') as f:
    f.write("Hello World\n")

將每頁標案清單的連結存入檔案


In [43]:
# Walk every result page, extract the first (detail) link of each table row,
# resolve it to an absolute URL, and append it to bid_list.txt.
bid_file = open("bid_list.txt", 'w') 
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d" 
for page in range(1, page_number + 1): 
    bid_list = rs.get(page_format%(page)) 
    bid_response = bid_list.text.encode('utf8') 
    bid_soup = BeautifulSoup(bid_response) 
    bid_table = bid_soup.select("#print_area") [0]
    bid_rows = bid_table.select('tr')[1:-1] 
    for bid_row in bid_rows:         
        link = [tag['href'] for tag in bid_row.select('a')][0] 
        link_href = urlparse.urljoin("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", link) 
        bid_file.write(link_href + "\n") 
bid_file.close()

0. 引入該用的package


In [32]:
# -*- coding: utf-8 -*- 
# Step 0: import everything the consolidated crawler needs.
import requests
import urlparse 
from bs4 import BeautifulSoup 
from math import ceil

1. 設定查詢條件


In [36]:
# Step 1: define the search criteria (ROC-calendar dates; the 'btnQuery'
# value is the Chinese submit-button label) and post them through a session
# so the follow-up page requests share the same cookies.
payload = { 'method':'search', 'searchMethod':'true', 'searchTarget':'ATM',
 'orgName':'', 'orgId':'', 'hid_1':'1',
 'tenderName':'', 'tenderId':'', 'tenderStatus':'4,5,21,29',
 'tenderWay':'', 'awardAnnounceStartDate':'103/04/29',
 'awardAnnounceEndDate':'103/04/29', 'radProctrgCate':'3', 
 'proctrgCate':'3', 'tenderRange':'', 'minBudget':'',
 'maxBudget':'', 'item':'','hid_2':'1',
 'gottenVendorName':'', 'gottenVendorId':'', 'hid_3':'1',
 'submitVendorName':'', 'submitVendorId':'', 'location':'',
 'priorityCate':'', 'isReConstruct':'', 'btnQuery':'查詢' }

rs = requests.session() 
user_post = rs.post("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", data=payload) 
response_text = user_post.text.encode('utf8')

2. 計算抓取頁數


In [37]:
# Step 2: read the total record count (class "T11b") and convert it to a
# page count at 100 records per page.
soup = BeautifulSoup(response_text) 
rec_number_element = soup.select(".T11b" )[0] 
rec_number = int(rec_number_element.text)
page_number = int(ceil(float(rec_number) / 100))

3. 依每頁抓取所有標案連結 & 4. 存入每頁標案連結


In [44]:
# Steps 3 & 4: fetch every result page, pull the detail link from each row,
# make it absolute, and persist all links to bid_list.txt.
bid_file = open("bid_list.txt", 'w') 
page_format = "http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance&searchTarget=ATM&method=search&isSpdt=&pageIndex=%d" 
for page in range(1, page_number + 1): 
    bid_list = rs.get(page_format%(page)) 
    bid_response = bid_list.text.encode('utf8') 
    bid_soup = BeautifulSoup(bid_response)
    bid_table = bid_soup.select("#print_area")[0] 
    bid_rows = bid_table.select('tr')[1:-1] 
    for bid_row in bid_rows: 
        link = [tag['href'] for tag in bid_row.select('a')][0] 
        link_href = urlparse.urljoin("http://web.pcc.gov.tw/tps/pss/tender.do?searchMode=common&searchType=advance", link) 
        bid_file.write(link_href + "\n") 
bid_file.close()

計算總共存了多少個連結


In [46]:
# Count how many links were written to bid_list.txt.
line_num = 0 
f = open("bid_list.txt", "r") 
for line in f.readlines(): 
    line_num = line_num + 1 
print line_num 
f.close()


265

判斷抓下來的資料筆數 是否與網頁上呈現的相同


In [47]:
# Sanity check: the number of saved lines should equal the record count the
# website reported for the search.
soup = BeautifulSoup(response_text) 
rec_number_element = soup.select(".T11b" )[0] 
rec_number = int(rec_number_element.text)

line_num = 0 
f = open("bid_list.txt", "r") 
for line in f.readlines(): 
    line_num = line_num + 1 
print line_num 
f.close() 

# compare with a simple if
if rec_number == line_num: 
    print "record number on web page is equal to number of lines in file"


265
record number on web page is equal to number of lines in file

使用requests.get 抓取標案細節


In [49]:
# Fetch one award-detail page directly with GET (no session needed here).
request_get = requests.get("http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328") 
response = request_get.text.encode('utf8') 
#print response

取得id為printArea 的區塊


In [50]:
# The interesting content of the detail page sits in the element with
# id "printArea" (note: different from the list page's "print_area").
request_get = requests.get("http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328") 
response = request_get.text.encode('utf8') 
soup = BeautifulSoup(response)
printarea = soup.select('#printArea') [0]
#print printarea

將標案資料寫進檔案中


In [51]:
# Deliberate failure demo: `printarea` is a bs4 Tag, not a string, so
# file.write() raises the TypeError shown in the output below.  The next
# cell fixes this with prettify().
bid_detail = open("bid_detail.txt", 'w') 
request_get = requests.get("http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328") 
response = request_get.text.encode('utf8') 
soup = BeautifulSoup(response)
printarea = soup.select('#printArea') [0]
bid_detail.write(printarea) 
bid_detail.close()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-51-5926ae9a0713> in <module>()
      4 soup = BeautifulSoup(response)
      5 printarea = soup.select('#printArea') [0]
----> 6 bid_detail.write(printarea)
      7 bid_detail.close()

TypeError: expected a character buffer object

使用prettify 將soup 抓到的tag 轉換成str


In [52]:
# Fixed version: prettify("utf-8") serializes the Tag to a byte string,
# which file.write() accepts.
bid_detail = open("bid_detail.txt", 'w') 
request_get = requests.get("http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328") 
response = request_get.text.encode('utf8') 
soup = BeautifulSoup(response)
printarea = soup.select('#printArea') [0] 
bid_detail.write(printarea.prettify("utf-8")) 
bid_detail.close()

怎麼找出案號是否會重複?


In [67]:
# Check whether tenderCaseNo values repeat across the saved links: record
# each case number in a dict and print any that was already seen.
case_dic = {} 
f = open("bid_list.txt", "r") 
for line in f.readlines(): 
    rec = line.strip() 
    caseno = rec.split('tenderCaseNo=')[1] 
    if caseno not in case_dic: 
        case_dic[caseno] = 1 
    else: 
        print caseno 
f.close()


1030401
10302
10302
103003
1030415

如何從url 取得這兩個值?


In [53]:
# Example detail URL; the goal is extracting pkAtmMain and tenderCaseNo from it.
link = "http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328"

使用split


In [54]:
# Extract the two query parameters with plain string splits:
# everything after "pkAtmMain=" is split again on "&tenderCaseNo=".
rear_substring = link.split("pkAtmMain=")[1] 
param = rear_substring.split("&tenderCaseNo=") 
pkAtmMain= param[0] 
caseno= param[1] 
print pkAtmMain,caseno


51239291 0607L1030328

re.match 範例


In [55]:
# re.match with named groups: (?P<name>...) lets you retrieve submatches by name.
import re 
m = re.match(r"(?P<FirstName>\w+) (?P<LastName>\w+)", "David Chiu") 
print m.group("FirstName"), m.group("LastName")


David Chiu

使用re.match


In [56]:
# Same extraction as the split version, now with one regex over the URL.
import re 
link = "http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51239291&tenderCaseNo=0607L1030328" 
m = re.match(r"([^ ]+)pkAtmMain=(?P<pkAtmMain>.+)&tenderCaseNo=(?P<tenderCaseNo>.+)", link) 
print m.group('pkAtmMain'), m.group('tenderCaseNo')


51239291 0607L1030328

讀取bid_list.txt 依序抓出標案內容


In [58]:
# -*- coding: utf-8 -*- 
from bs4 import BeautifulSoup 
import re, requests 

# Read each saved link, download the detail page, and write its "#printArea"
# block to gov/<pkAtmMain>_<tenderCaseNo>.txt.
# NOTE(review): assumes every line matches the regex and that gov/ already
# exists — the "final version" cell below addresses the directory.
bid_list = open('bid_list.txt', 'r') 
for line in bid_list.readlines(): 
    pagelink = line.strip() 
    m = re.match(r"([^ ]+)pkAtmMain=(?P<pkAtmMain>.+)&tenderCaseNo=(?P<tenderCaseNo>.+)", pagelink) 
    filename = "%s_%s"%(m.group('pkAtmMain'), m.group('tenderCaseNo')) 
    request_get = requests.get(pagelink) 
    response = request_get.text.encode('utf8') 
    soup = BeautifulSoup(response) 
    printarea = soup.select('#printArea')[0] 
    bid_detail = open("gov/%s.txt"%(filename), 'w') 
    bid_detail.write(printarea .prettify("utf-8")) 
    bid_detail.close() 
bid_list.close()

抓取標案細節完成版


In [ ]:
# -*- coding: utf-8 -*- 
from bs4 import BeautifulSoup 
import re, requests, time 
import os
# Make sure the output directory exists before any detail file is written.
if not os.path.exists('gov'):
    os.mkdir('gov')

# Final crawler: read every award-detail URL saved in bid_list.txt, download
# the page, and store its "#printArea" block as gov/<pkAtmMain>_<caseNo>.txt.
bid_list = open('bid_list.txt', 'r')
for line in bid_list.readlines():
    pagelink = line.strip()
    # Pull the two identifiers out of the query string for a unique filename.
    m = re.match(r"([^ ]+)pkAtmMain=(?P<pkAtmMain>.+)&tenderCaseNo=(?P<tenderCaseNo>.+)", pagelink)
    if m is None:
        # Skip blank or malformed lines instead of crashing on m.group().
        continue
    filename = "%s_%s" % (m.group('pkAtmMain'), m.group('tenderCaseNo'))
    request_get = requests.get(pagelink)
    response = request_get.text.encode('utf8')
    soup = BeautifulSoup(response)
    printarea = soup.select('#printArea')[0]
    bid_detail = open("gov/%s.txt" % (filename), 'w')
    bid_detail.write(printarea.prettify("utf-8"))
    bid_detail.close()
    # Throttle requests so the crawl is polite to the server.
    time.sleep(3)
bid_list.close()