In [1]:
import tqdm
import tarfile
import pandas
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL
import gzip
from urllib.request import urlopen
%matplotlib inline
matplotlib.style.use('ggplot')
#matplotlib.style.use('bmh')
# progress bar
tqdm.tqdm.pandas()
# 檔案名稱格式
filename_format="M06A_{year:04d}{month:02d}{day:02d}.tar.gz".format
xz_filename_format="xz/M06A_{year:04d}{month:02d}{day:02d}.tar.xz".format
csv_format = "M06A/{year:04d}{month:02d}{day:02d}/{hour:02d}/TDCS_M06A_{year:04d}{month:02d}{day:02d}_{hour:02d}0000.csv".format
In [2]:
# 打開剛才下載的檔案試試
data_config ={"year":2016, "month":12, "day":18}
tar = tarfile.open(filename_format(**data_config), 'r')
In [3]:
# 如果沒有下載,可以試試看 xz 檔案
#data_dconfig ={"year":2016, "month":11, "day":18}
#tar = tarfile.open(xz_filename_format(**data_config), 'r')
In [4]:
# 設定欄位名稱
M06A_fields = ['VehicleType',
'DetectionTime_O','GantryID_O',
'DetectionTime_D','GantryID_D ',
'TripLength', 'TripEnd', 'TripInformation']
# 打開裡面 10 點鐘的資料
csv = tar.extractfile(csv_format(hour=10, **data_config))
# 讀進資料
data = pandas.read_csv(csv, names=M06A_fields)
# 檢查異常的資料
print("異常資料數:", data[data.TripEnd == 'N'].shape[0])
# 去除異常資料
data = data[data.TripEnd == 'Y']
# 焦點放在 TripInformation 和 VehicleType
data = data[['VehicleType', "TripInformation"]]
# 看前五筆
data.head(5)
Out[4]:
In [5]:
import datetime
# 用來解析時間格式
def strptime(x):
return datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
In [6]:
def parse_tripinfo(tripinfo):
split1 = tripinfo.split("; ")
split2 = (x.split('+') for x in split1)
return [(strptime(t), node) for t,node in split2]
# 試試看
data.head(10).TripInformation.apply(parse_tripinfo)
Out[6]:
In [7]:
# 新增一欄
data['Trip'] = data.TripInformation.progress_apply(parse_tripinfo)
In [8]:
trip = data['Trip'][0]
trip
Out[8]:
In [9]:
for (t1,n1), (t2,n2) in zip(trip[:-1], trip[1:]):
# 去除換路線的
if n1[:3] != n2[:3] or n1[-1]!=n2[-1]:
continue
# 去除額外路線的
if n1[3]=='R' or n2[3]=='R':
continue
# 從站名取得公里數
km1 = int(n1[3:-1])/10
km2 = int(n2[3:-1])/10
hr_delta = (t2-t1).total_seconds()/60/60
speed = abs(km2-km1)/hr_delta
print(speed)
In [10]:
def compute_speed(trip):
rtn = []
for (t1,n1), (t2,n2) in zip(trip[:-1], trip[1:]):
# 去除換路線的
if n1[:3] != n2[:3] or n1[-1]!=n2[-1]:
continue
# 去除額外路線的
if n1[3]=='R' or n2[3]=='R':
continue
# 從站名取得公里數
km1 = int(n1[3:-1])/10
km2 = int(n2[3:-1])/10
hr_delta = (t2-t1).total_seconds()/60/60
speed = abs(km2-km1)/hr_delta
rtn.append(speed)
return np.array(rtn)
In [11]:
data.head(10).Trip.apply(compute_speed)
Out[11]:
In [12]:
data['Speed'] = data.Trip.progress_apply(compute_speed)
In [13]:
# 只留下有速度的旅程
valid_idx = data.Speed.apply(len).astype('bool')
valid_data = data[valid_idx].copy()
del valid_data['TripInformation']
valid_data.head()
Out[13]:
這樣就能計算旅程中的最高速度
In [14]:
valid_data['MaxSpeed']=valid_data.Speed.apply(max)
In [15]:
valid_data.sort_values('MaxSpeed', ascending=False)
Out[15]:
In [ ]:
查詢超過最高速限 110 十公里以上的旅程
In [16]:
valid_data.query("MaxSpeed > 120").shape
Out[16]:
In [17]:
# 看看統計圖表
valid_data.MaxSpeed.hist(bins=np.arange(0,200,10))
Out[17]:
In [18]:
# 不同車種的速度
valid_data.boxplot(by='VehicleType', showmeans=True, showfliers=False);
參考 http://matplotlib.org/api/pyplot_api.html 用不同方法畫畫看 如混何應用看看
# 畫很多 histogram 小圖
valid_data.hist(by='VehicleType', column='MaxSpeed', bins=np.arange(0,200,10));
# 另一種很多 histogram 小圖
valid_data.groupby('VehicleType').hist()
# histogram 疊合在一起
for vt in [31,32,41,42,5]:
valid_data[valid_data.VehicleType==vt].MaxSpeed.hist(bins=np.arange(0,200,10), alpha=0.5)
# 空心的畫法,圖示說明,透明度
for vt in [41,42,5]:
valid_data[valid_data.VehicleType==vt].MaxSpeed.hist(bins=np.arange(0,200,10),
label=str(vt), alpha=0.5, histtype='step', linewidth=2)
plt.legend();
# 不同車種的速度
gp = valid_data.groupby('VehicleType')
plt.gca().get_xaxis().set_visible(False)
gp.mean().plot.bar(ax=plt.gca(), yerr=gp.std(), table=True);
In [19]:
vt_types = [31,32,41,42, 5]
vt_labels = ["小客車", "小貨車", "大客車","大貨車", "聯結車"]
plt.hist([valid_data.MaxSpeed[valid_data.VehicleType==vt] for vt in vt_types],
label=vt_labels, bins=np.arange(50,150,10), normed=1)
for t in plt.legend().texts:
t.set_fontname('Droid Sans Fallback')
#用 matplotlib.font_manager.fontManager.ttflist 看你有什麼字型
In [20]:
num_total = data.shape[0]
num_valid = valid_data.shape[0]
num_overspeed = valid_data.query("MaxSpeed > 120").shape[0]
num_total, num_valid, num_overspeed
Out[20]:
In [21]:
num_overspeed / num_total
Out[21]:
In [22]:
num_overspeed / num_valid
Out[22]:
In [23]:
stat_url = "http://www.hpb.gov.tw/files/11-1000-138.php"
國道違規統計 = pandas.read_html(stat_url, attrs={"bordercolor":"#cccccc"}, header=1)[0]
國道違規統計
Out[23]:
In [24]:
國道違規統計[國道違規統計["違規 項目"]=="超速"]
Out[24]:
In [25]:
# 平均每天
國道違規統計[國道違規統計["違規 項目"]=="超速"].applymap(lambda x:int(x)/365 if isinstance(x,int) else x)
Out[25]:
In [26]:
csvs = (tar.extractfile(csv_format(hour=hr, **data_config)) for hr in tqdm.trange(24))
data = pandas.concat([pandas.read_csv(csv, names=M06A_fields) for csv in csvs])
print("資料大小", data.shape)
# 檢查異常的資料
print("異常資料數:", data[data.TripEnd == 'N'].shape[0])
# 去除異常資料
data = data[data.TripEnd == 'Y']
# 把焦點放在 TripInformation 和 VehicleType
data = data[['VehicleType', "TripInformation"]]
In [27]:
# 很慢? 可以參考 04-Speed-Limit.ipynb 中 parse_tripinfo 的加速
data['Trip'] = data.TripInformation.progress_apply(parse_tripinfo)
data['Speed'] = data.Trip.progress_apply(compute_speed)
In [28]:
valid_idx = data.Speed.apply(len).astype('bool')
valid_data = data[valid_idx].copy()
del valid_data['TripInformation']
valid_data['MaxSpeed']=valid_data.Speed.apply(max)
In [29]:
valid_data.MaxSpeed.hist(bins=np.arange(0,200,10));
In [30]:
valid_data.sort_values("MaxSpeed", ascending=False)
Out[30]:
In [31]:
num_total = data.shape[0]
num_valid = valid_data.shape[0]
num_overspeed = valid_data.query("MaxSpeed > 120").shape[0]
num_total, num_valid, num_overspeed
Out[31]:
In [32]:
國道違規統計[國道違規統計["違規 項目"]=="超速"]
Out[32]: