In [1]:
import re
import requests
import pandas as pd

In [2]:
# Build the lookup table from station name to the station code used in queries.

url = 'https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.8993'
# NOTE(review): verify=False turns off TLS certificate verification (this is what
# triggers InsecureRequestWarning). Kept as in the original; confirm whether the
# host still needs it before relying on this data.
response = requests.get(url, verify=False, timeout=10)
# Stop on an HTTP error rather than running the regex over an error page,
# which could leave the table empty without any visible failure.
response.raise_for_status()
# Raw string: '\|' is not a valid Python string escape, so the non-raw literal
# produces an invalid-escape warning on newer interpreters. The re module
# interprets \uXXXX itself, so the compiled pattern is the same:
# a run of CJK characters, a literal '|', then a run of capital letters.
stations = dict(re.findall(r'([\u4e00-\u9fa5]+)\|([A-Z]+)', response.text))


/home/jeff/anaconda3/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py:838: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/security.html
  InsecureRequestWarning)

In [3]:
# Fetch the raw query result for a departure city, an arrival city and a date.

def get_raw_data(from_station, to_station, date, stations):
    """Query the 12306 left-ticket endpoint and return the decoded JSON body.

    Args:
        from_station: Departure station name; must be a key of ``stations``.
        to_station: Arrival station name; must be a key of ``stations``.
        date: Travel date, inserted into the URL exactly as given
            (the notebook passes a 'YYYY-MM-DD' string).
        stations: Mapping from station name to the code placed in the URL.

    Returns:
        The parsed JSON response, as returned by ``Response.json()``.

    Raises:
        KeyError: If either station name is missing from ``stations``
            (raised before any network access).
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Resolve both names first so an unknown station fails before any request is sent.
    from_code = stations[from_station]
    to_code = stations[to_station]
    # Build the query URL.
    url = ('https://kyfw.12306.cn/otn/leftTicket/queryA?leftTicketDTO.train_date={}&leftTicketDTO.from_station={}&leftTicketDTO.to_station={}&purpose_codes=ADULT').format(
        date, from_code, to_code
    )
    # NOTE(review): verify=False disables TLS certificate verification; kept as in
    # the original, confirm whether it is still required.
    r = requests.get(url, verify=False, timeout=10)
    # Surface HTTP errors directly instead of as a JSON decoding failure.
    r.raise_for_status()
    return r.json()

# Example query parameters: station names must be keys of `stations`.
from_station = '哈尔滨'
to_station = '南京'
date = '2017-01-20'
# Keep only the 'data' entry of the response; the following cells index it
# as a sequence of records, each holding a 'queryLeftNewDTO' entry.
result = get_raw_data(from_station, to_station, date, stations)['data']


/home/jeff/anaconda3/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py:838: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/security.html
  InsecureRequestWarning)

In [4]:
# Deprecated: replaced by a different approach (a later cell redefines this function).

def reshape_train_data(train):
    """Turn one train record into a dict keyed by Chinese display labels.

    Args:
        train: A mapping read only through ``.get``; in this notebook it is the
            'queryLeftNewDTO' entry of one query result.

    Returns:
        A dict with 15 entries: train code, boarding/alighting stations,
        full route, departure/arrival times, sale time, and one entry per
        seat class. Single fields that are absent come back as whatever
        ``train.get`` yields for them (``None`` for a plain dict).

    Raises:
        TypeError: If one of the station or time fields is absent from a
            plain dict, because ``None`` cannot be concatenated with '->'.
    """
    def _arrow(left_field, right_field):
        # Join two fields of the record as 'left->right'.
        return train.get(left_field) + '->' + train.get(right_field)

    result = {
        '车次': train.get('station_train_code'),
        '车站': _arrow('from_station_name', 'to_station_name'),
        '全程': _arrow('start_station_name', 'end_station_name'),
        '时间': _arrow('start_time', 'arrive_time'),
        '发售时间': train.get('sale_time'),
    }

    # Seat classes: display label paired with the record field that holds it.
    seat_fields = (
        ('商务座', 'swz_num'),
        ('特等座', 'tz_num'),
        ('一等座', 'zy_num'),
        ('二等座', 'ze_num'),
        ('高级软卧', 'gr_num'),
        ('软卧', 'rw_num'),
        ('硬卧', 'yw_num'),
        ('软座', 'rz_num'),
        ('硬座', 'yz_num'),
        ('无座', 'wz_num'),
    )
    for label, field in seat_fields:
        result[label] = train.get(field)
    return result
# Try the function on one sample record: the entry at index 1 of the query result.
train1 = result[1]['queryLeftNewDTO']
# Show which labels the reshaped record contains.
reshape_train_data(train1).keys()


Out[4]:
dict_keys(['车次', '高级软卧', '车站', '发售时间', '特等座', '时间', '一等座', '软卧', '商务座', '软座', '无座', '二等座', '全程', '硬卧', '硬座'])

In [5]:
def reshape_train_data(train):
    """Flatten one train record into a 15-item list.

    Args:
        train: A mapping read only through ``.get``; in this notebook it is the
            'queryLeftNewDTO' entry of one query result.

    Returns:
        A list in this fixed order: train code, 'from->to' stations,
        'start->end' stations of the full route, 'start->arrive' times,
        sale time, then the ten seat-class fields (swz, tz, zy, ze, gr, rw,
        yw, rz, yz, wz). Single fields that are absent come back as whatever
        ``train.get`` yields for them (``None`` for a plain dict).

    Raises:
        TypeError: If one of the station or time fields is absent from a
            plain dict, because ``None`` cannot be concatenated with '->'.
    """
    def _arrow(left_field, right_field):
        # Join two fields of the record as 'left->right'.
        return train.get(left_field) + '->' + train.get(right_field)

    row = [
        train.get('station_train_code'),
        _arrow('from_station_name', 'to_station_name'),
        _arrow('start_station_name', 'end_station_name'),
        _arrow('start_time', 'arrive_time'),
        train.get('sale_time'),
    ]

    # Seat-class fields, appended in display order.
    seat_fields = (
        'swz_num', 'tz_num', 'zy_num', 'ze_num', 'gr_num',
        'rw_num', 'yw_num', 'rz_num', 'yz_num', 'wz_num',
    )
    row.extend(train.get(field) for field in seat_fields)
    return row

# Try the list-based version on the same sample record (index 1 of the query result).
train1 = result[1]['queryLeftNewDTO']
# Display the flattened row.
reshape_train_data(train1)


Out[5]:
['Z174',
 '哈尔滨->南京',
 '哈尔滨->上海',
 '12:45->09:57',
 '1030',
 '--',
 '--',
 '--',
 '--',
 '--',
 '无',
 '无',
 '--',
 '无',
 '有']

In [6]:
# Assemble the query result into a DataFrame, one row per train.

# Column labels, in the same order as the list returned by reshape_train_data.
column_names = ['车次', '车站', '全程', '时间', '发售时间', '商务座', '特等座', '一等座', '二等座', '高级软卧', '软卧', '硬卧', '软座', '硬座', '无座']

# Key each flattened row by its train code; a repeated code overwrites the earlier row.
result_dict = {}
for entry in result:
    train_temp = entry['queryLeftNewDTO']
    result_dict[train_temp['station_train_code']] = reshape_train_data(train_temp)

# Build the frame directly from the rows, indexed by train code.
# Passing the labels to the constructor (instead of assigning `.columns` after
# a transpose) also gives a correctly labelled empty frame when the query
# returned no trains, where a label assignment would fail with a length mismatch.
result_table = pd.DataFrame(
    list(result_dict.values()),
    index=list(result_dict.keys()),
    columns=column_names,
)

In [7]:
# Display the assembled table (rows are indexed by train code).
result_table


Out[7]:
车次 车站 全程 时间 发售时间 商务座 特等座 一等座 二等座 高级软卧 软卧 硬卧 软座 硬座 无座
G1202 G1202 哈尔滨西->南京南 哈尔滨西->上海虹桥 08:20->19:22 1000 -- -- -- -- -- -- --
K554 K554 哈尔滨->南京 牡丹江->温州 13:59->19:30 1030 -- -- -- -- -- --
Z174 Z174 哈尔滨->南京 哈尔滨->上海 12:45->09:57 1030 -- -- -- -- -- --
Z178 Z178 哈尔滨->南京 哈尔滨->杭州 22:30->23:41 1030 -- -- -- -- -- --
Z4518 Z4518 哈尔滨->南京 哈尔滨->南京 12:59->12:26 1030 -- -- -- -- -- --