練習


In [1]:
import os
import requests
import pandas as pd
import datetime

from bs4 import BeautifulSoup

# THSRC timetable search endpoint; the search form is POSTed to this URL.
url = 'https://www.thsrc.com.tw/tw/TimeTable/SearchResult'

In [2]:
# Search date: exactly seven days from the moment this cell runs (local time).
now = datetime.datetime.now()
one_week = datetime.timedelta(days=7)
after_one_week = now + one_week
after_one_week_format = after_one_week.strftime('%Y/%m/%d')

# Payload for the timetable search form.
# NOTE(review): the station values are opaque identifiers taken from the site's
# form -- confirm which stations they map to before changing them.
form_data = dict(
    StartStation='977abb69-413a-4ccf-a109-0272c24fd490',
    EndStation='9c5ac6ca-ec89-48f8-aab0-41b738cb1814',
    SearchDate=after_one_week_format,
    SearchTime='14:00',
    SearchWay='DepartureInMandarin',
    RestTime='',
    EarlyOrLater='',
)

print('The date after one week - {}'.format(after_one_week_format))


The date after one week - 2018/02/28

In [3]:
# Submit the search form. Without a timeout, requests waits indefinitely for
# the server, which would hang the notebook if the site never answers.
resp = requests.post(url, data=form_data, timeout=30)
# Fail loudly on HTTP 4xx/5xx instead of parsing an error page as a timetable.
resp.raise_for_status()
# Force UTF-8 decoding before reading resp.text.
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')

In [4]:
# `soup.table` is None when the response contains no <table> at all (for
# example an error page or a changed layout); report that explicitly instead
# of failing with "'NoneType' object has no attribute 'find_all'".
timetable = soup.table
if timetable is None:
    raise ValueError('No <table> found in the response; the request may have '
                     'failed or the page layout may have changed.')

# Only direct <tr> children of the table are considered.
table_rows = timetable.find_all('tr', recursive=False)
if len(table_rows) < 2:
    raise ValueError('Timetable has {} row(s); expected a header row at '
                     'index 1.'.format(len(table_rows)))

# The row at index 1 supplies the column titles; every row after it is data.
# NOTE(review): the row at index 0 is skipped -- presumably a caption/banner
# row; confirm against the live page.
colname = list(table_rows[1].stripped_strings)


def _cell_text(row, css_class, default=''):
    """Return the text of the first <td class=css_class> in `row`, or `default` if absent."""
    cell = row.find('td', class_=css_class)
    return cell.text if cell else default


def _first_stripped_string(row, css_class):
    """Return the first non-blank string inside <td class=css_class>, or '' if none."""
    cell = row.find('td', class_=css_class)
    strings = list(cell.stripped_strings) if cell else []
    return strings[0] if strings else ''


# Build the records in one pass rather than overwriting the list of <tr> tags
# element by element while iterating over it.
rows = [
    [
        # A missing train-number cell stays None (the other cells fall back
        # to ''), matching the values this cell has always produced.
        _cell_text(row, 'column1', default=None),   # train number
        _cell_text(row, 'column3'),                 # departure time
        _cell_text(row, 'column4'),                 # arrival time
        _cell_text(row, 'column2'),                 # travel duration
        _first_stripped_string(row, 'Width1'),      # early-bird discount, '' if none
    ]
    for row in table_rows[2:]
]

df = pd.DataFrame(rows, columns=colname)
df


Out[4]:
車次 出發時間 抵達時間 行車時間 早鳥
0 0833 14:11 16:11 02:00 8折起
1 0651 14:46 16:32 01:46
2 0837 15:11 17:11 02:00 8折起
3 0657 15:46 17:32 01:46
4 0841 16:11 18:11 02:00 65折起
5 0661 16:21 18:06 01:45 8折起
6 0663 16:46 18:32 01:46
7 0845 17:11 19:11 02:00 65折起
8 0667 17:21 19:06 01:45 8折起
9 0669 17:46 19:32 01:46

In [5]:
# Output directory, resolved relative to the current working directory.
results = os.path.abspath('../results')
# exist_ok=True creates the directory only when needed and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(results, exist_ok=True)

# Name the file after the searched date, e.g. thsrc_YYYYMMDD.csv.
date_stamp = after_one_week.strftime('%Y%m%d')
filename = os.path.join(results, 'thsrc_{}.csv'.format(date_stamp))
df.to_csv(filename, index=False)
print('Save csv to {}'.format(filename))


Save csv to /home/afun/github/Python-Crawling-Tutorial/results/thsrc_20180228.csv