練習

觀察 https://www.thsrc.com.tw/tw/TimeTable/SearchResult 並撰寫爬蟲程式
抓取一個禮拜後的高鐵時刻表
台北到台南下午兩點的班次
使用 requests + BeautifulSoup 實作
透過 pandas 輸出成 csv



In [1]:

    
import os
import requests
import pandas as pd
import datetime

from bs4 import BeautifulSoup

# Endpoint that receives the timetable search form; the search parameters
# are sent to it as POST form data and it answers with the result page.
url = 'https://www.thsrc.com.tw/tw/TimeTable/SearchResult'



In [2]:

    
# Travel date: seven days after the moment this cell is executed
# (naive local time, as returned by datetime.now()).
now = datetime.datetime.now()
after_one_week = now + datetime.timedelta(days=7)

# The search form takes the date as YYYY/MM/DD.
after_one_week_format = after_one_week.strftime('%Y/%m/%d')
message = 'The date after one week - {}'.format(after_one_week_format)
print(message)

# Payload of the timetable search form.
# NOTE(review): per the exercise statement the two station ids should be
# Taipei (start) and Tainan (end) -- confirm against the site's form values.
form_data = dict(
    StartStation='977abb69-413a-4ccf-a109-0272c24fd490',
    EndStation='9c5ac6ca-ec89-48f8-aab0-41b738cb1814',
    SearchDate=after_one_week_format,
    SearchTime='14:00',
    SearchWay='DepartureInMandarin',
    RestTime='',
    EarlyOrLater='',
)









    



The date after one week - 2018/02/28



In [3]:

    
# Submit the search form and parse the returned HTML page.
# requests applies no timeout by default, so give it one explicitly;
# otherwise an unresponsive server would block this cell forever.
resp = requests.post(url, data=form_data, timeout=30)
# Stop on HTTP 4xx/5xx rather than parsing an error page as a timetable.
resp.raise_for_status()
# Decode the body as UTF-8 regardless of what requests guessed from headers.
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')



In [4]:

    
def _cell_text(row, css_class, default=''):
    """Return the text of the first <td class=css_class> in row, or default."""
    cell = row.find('td', class_=css_class)
    return cell.text if cell is not None else default


def _first_string(row, css_class):
    """Return the first stripped text fragment of the matching <td>, or ''."""
    cell = row.find('td', class_=css_class)
    if cell is None:
        return ''
    fragments = list(cell.stripped_strings)
    return fragments[0] if fragments else ''


def _parse_row(row):
    """Convert one timetable <tr> into a list of five cell values.

    The order matches the header row: train number, departure time,
    arrival time, travel time, early-bird discount.
    """
    return [
        # A missing train number stays None; the other fields fall back to ''.
        _cell_text(row, 'column1', default=None),
        _cell_text(row, 'column3'),
        _cell_text(row, 'column4'),
        _cell_text(row, 'column2'),
        _first_string(row, 'Width1'),
    ]


# Locate the first <table> of the page and its direct <tr> children.
table = soup.table
if table is None:
    raise ValueError('No <table> element found in the response page')
table_rows = table.find_all('tr', recursive=False)
if len(table_rows) < 2:
    raise ValueError('Timetable has no header row to read column names from')

# The second row carries the column names and everything after it is data;
# the first row is skipped (presumably a title/banner row -- confirm).
colname = list(table_rows[1].stripped_strings)
# Build a fresh list instead of overwriting the list being iterated.
rows = [_parse_row(table_row) for table_row in table_rows[2:]]

df = pd.DataFrame(rows, columns=colname)
df



In [5]:

    
# Output directory, resolved relative to the current working directory.
results = os.path.abspath('../results')
# exist_ok=True creates the directory when missing and is a no-op when it
# already exists, without the race of a separate exists() check.
os.makedirs(results, exist_ok=True)

# One file per queried travel date, e.g. thsrc_YYYYMMDD.csv.
filename = os.path.join(results, 'thsrc_{}.csv'.format(after_one_week.strftime('%Y%m%d')))
# index=False leaves the DataFrame's integer index out of the file.
df.to_csv(filename, index=False)
print('Save csv to {}'.format(filename))









    



Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180228.csv

	車次	出發時間	抵達時間	行車時間	早鳥
0	0833	14:11	16:11	02:00	8折起
1	0651	14:46	16:32	01:46
2	0837	15:11	17:11	02:00	8折起
3	0657	15:46	17:32	01:46
4	0841	16:11	18:11	02:00	65折起
5	0661	16:21	18:06	01:45	8折起
6	0663	16:46	18:32	01:46
7	0845	17:11	19:11	02:00	65折起
8	0667	17:21	19:06	01:45	8折起
9	0669	17:46	19:32	01:46