視覺化

先照之前的,讀取資料


In [1]:
import tqdm
import tarfile
import pandas
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL
import gzip
from urllib.request import urlopen
%matplotlib inline
matplotlib.style.use('seaborn-bright')
# progress bar
tqdm.tqdm.pandas()

# 檔案名稱格式
filename_format="../Week03/M06A_{year:04d}{month:02d}{day:02d}.tar.gz".format
xz_filename_format="xz/M06A_{year:04d}{month:02d}{day:02d}.tar.xz".format
csv_format = "M06A/{year:04d}{month:02d}{day:02d}/{hour:02d}/TDCS_M06A_{year:04d}{month:02d}{day:02d}_{hour:02d}0000.csv".format

# 設定欄位名稱
M06A_fields = ['VehicleType',
               'DetectionTime_O','GantryID_O',
               'DetectionTime_D','GantryID_D ',
               'TripLength', 'TripEnd', 'TripInformation']
import datetime
# 用來解析時間格式
def strptime(x):
    return datetime.datetime(int(x[:4]), int(x[5:7]), int(x[8:10]), 
                             int(x[11:13]), int(x[14:16]), int(x[17:19]) )

def parse_tripinfo(tripinfo):
    split1 = tripinfo.split("; ")
    return [(strptime(t), t[20:]) for t in split1]

In [2]:
## 讀取檔案
node_data=pandas.read_json('../Week03/node_data.json')
speedlimit = pandas.read_json('../Week03/speedlimit.json')

依照速限來找更多的超速


In [3]:
# 這個改了一點, SpeedInfo 這個 named tuple 同時紀錄了區段的起始時間和兩端點
from collections import namedtuple
SpeedInfo = namedtuple("SpeedInfo", ["speed", "time","loc1", "loc2"])

def compute_speed(trip):
    rtn = []
    for (t1,n1), (t2,n2) in zip(trip[:-1], trip[1:]):
        if n1[:3] != n2[:3] or n1[-1]!=n2[-1]:
            continue
        if n1[3]=='R' or n2[3]=='R': 
            continue
        km1 = int(n1[3:-1])/10
        km2 = int(n2[3:-1])/10
        hr_delta = (t2-t1).total_seconds()/60/60
        speed = abs(km2-km1)/hr_delta
        rtn.append(SpeedInfo(speed, t1, n1, n2))
    return rtn

In [4]:
# 打開剛才下載的檔案試試
data_config ={"year":2016, "month":12, "day":18}
tar = tarfile.open(filename_format(**data_config), 'r')

# 如果沒有下載,可以試試看 xz 檔案
#data_dconfig ={"year":2016, "month":11, "day":18}
#tar = tarfile.open(xz_filename_format(**data_config), 'r')


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-4-6b5c76ef8cc8> in <module>()
      1 # 打開剛才下載的檔案試試
      2 data_config ={"year":2016, "month":12, "day":18}
----> 3 tar = tarfile.open(filename_format(**data_config), 'r')
      4 
      5 # 如果沒有下載,可以試試看 xz 檔案

/usr/lib/python3.5/tarfile.py in open(cls, name, mode, fileobj, bufsize, **kwargs)
   1557                     saved_pos = fileobj.tell()
   1558                 try:
-> 1559                     return func(name, "r", fileobj, **kwargs)
   1560                 except (ReadError, CompressionError) as e:
   1561                     if fileobj is not None:

/usr/lib/python3.5/tarfile.py in gzopen(cls, name, mode, fileobj, compresslevel, **kwargs)
   1622 
   1623         try:
-> 1624             fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
   1625         except OSError:
   1626             if fileobj is not None and mode == 'r':

/usr/lib/python3.5/gzip.py in __init__(self, filename, mode, compresslevel, fileobj, mtime)
    161             mode += 'b'
    162         if fileobj is None:
--> 163             fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
    164         if filename is None:
    165             filename = getattr(fileobj, 'name', '')

FileNotFoundError: [Errno 2] No such file or directory: 'M06A_20161218.tar.gz'

In [ ]:
csvs = (tar.extractfile(csv_format(hour=hr, **data_config)) for hr in tqdm.trange(24))
data = pandas.concat([pandas.read_csv(csv, names=M06A_fields) for csv in csvs],  ignore_index=True)
print("資料大小", data.shape)
# 檢查異常的資料
print("異常資料數:", data[data.TripEnd == 'N'].shape[0])
# 去除異常資料
data = data[data.TripEnd == 'Y']
# 只保留 TripInformation 和 VehicleType
data = data[['VehicleType', "TripInformation"]]

data['Trip'] = data.TripInformation.progress_apply(parse_tripinfo)
data['Speed'] = data.Trip.progress_apply(compute_speed)

In [ ]:
speedlimit_lookup = node_data.速限.to_dict()

def is_speeding(info):
    speedlimit = max(speedlimit_lookup[info.loc1], speedlimit_lookup[info.loc2])
    return info.speed > speedlimit + 10

# 存下所有超速的判斷
data['超速紀錄'] = data.Speed.progress_apply(lambda l:[is_speeding(x) for x in l])

In [ ]:
data.head(10)

用 dict 直接統計

以 S01F 為例


In [ ]:
node_data

In [ ]:
S01F_list = list(sorted(x for x in node_data.編號 if x.startswith('01F') and x.endswith('S')))

In [ ]:
S01F = {s:i for i,s in enumerate(S01F_list)}

In [ ]:
len(S01F)

In [ ]:
speedlimit_lookup = node_data.速限.to_dict()
start_time = datetime.datetime(data_config['year'],data_config['month'],data_config['day'])
end_time = datetime.datetime(data_config['year'],data_config['month'],data_config['day']+1)
total_statistics=np.zeros((24*4, len(S01F)), dtype=int)
speeding_statistics=np.zeros((24*4, len(S01F)), dtype=int)

def count_speeding(info):
    if not info.loc1.startswith('01F'):
        return
    if not info.loc1.endswith('S'):
        return
    if info.loc1 > info.loc2:
        return

    idx = int((info.time-start_time).seconds/(15*60))
    idx2 = S01F[info.loc1]
    total_statistics[idx][idx2] += 1
    speedlimit = max(speedlimit_lookup[info.loc1], speedlimit_lookup[info.loc2])
    if info.speed > speedlimit + 10:
        speeding_statistics[idx][idx2] += 1

# 存下所有超速的判斷
data.Speed.apply(lambda l:[count_speeding(x) for x in l]);

In [ ]:
plt.imshow(speeding_statistics/(total_statistics+0.001), aspect=0.3);

In [ ]:
# 下面的中文方式,可能需要 matplotlib 2.0.0 以上
matplotlib.__version__

In [ ]:
# 如果你知道自己系統中的中文字形,可以直接用省略下面的步驟 
# matplotlib.font_manager.fontManager.ttffiles # 列出你的字體
# matplotlib.font_manager.fontManager.ttflist # 列出你的字體

# plt.rcParams['font.family'] = 'AR PL Mingti2L Big5' # works on Ubuntu with `apt-get install fonts-arphic-bsmi00lp`

In [ ]:
# 從 00Download.ipynb 來的
import os.path
from urllib.request import urlopen
def download_req(req, filename):
    total = int(req.getheader("Content-Length"))
    tqdm_conf = dict(total=total, desc=filename, unit='B', unit_scale=True)
    with tqdm.tqdm(**tqdm_conf) as pbar:
        with open(filename,'wb') as f:
            for data in iter(lambda: req.read(8192), b""):  
                pbar.update(f.write(data))

# 字體下載
font_filename = 'NotoSansCJKtc-hinted.zip'
font_url = "https://noto-website-2.storage.googleapis.com/pkgs/"+font_filename
# 改變這行才能真正下載
if not (os.path.isfile(font_filename) and os.path.getsize(font_filename)==121247052):
    with urlopen(font_url) as req:
        download_req(req, "NotoSansCJKtc-hinted.zip")
# Extract Font files
import zipfile
with zipfile.ZipFile(font_filename) as zf:
    for f in zf.namelist():
        if f.endswith('.otf'):
            print("extract", f)
            zf.extract(f)

fp = matplotlib.font_manager.FontProperties(fname = 'NotoSansCJKtc-Regular.otf')
matplotlib.font_manager.fontManager.ttffiles.append(fp.get_file())
font_entry = matplotlib.font_manager.FontEntry(fp.get_file(), name=fp.get_name(),
                                               style=fp.get_style(), variant=fp.get_variant(),
                                              weight=fp.get_weight(), stretch=fp.get_stretch(), size=fp.get_size())

matplotlib.font_manager.fontManager.ttflist.append(font_entry)
plt.rcParams['font.family'] = fp.get_name()

In [ ]:
plt.figure(figsize=(30,40));
plt.imshow((speeding_statistics/(total_statistics+0.001)), aspect=1, 
           interpolation='none',
           extent=[0,72,24,0]
          )
ax = plt.gca()
ax.set_yticks(range(0,24,2));
plt.xticks(range(73), [node_data['交流道(起)'][S01F_list[i]] for i in range(73)], 
          rotation='vertical');
plt.grid(True)

In [ ]:
plt.figure(figsize=(15,20));
plt.imshow((speeding_statistics/(total_statistics+0.001)).T, aspect=1, 
           interpolation='none',
           extent=[0,24,72,0])
ax = plt.gca()
ax.set_xticks(range(0,26,2));
plt.yticks(range(73), [node_data['交流道(起)'][S01F_list[i]] for i in range(73)], 
          #fontproperties=fp, #or fontname='AR PL Mingti2L Big5' #'Droid Sans Fallback');
          );
plt.grid(True)

In [ ]:
plt.plot(speeding_statistics.sum(axis=1)/(total_statistics.sum(axis=1)+0.00001) );
plt.xticks(range(0,24*4,4), range(24));
plt.grid(True);
plt.xlim(0,24*4-1);
plt.ylim(0, 0.35);

In [ ]:
matplotlib.style.use('seaborn-deep')

In [ ]:
plt.figure(figsize=(30,20))
plt.plot(speeding_statistics.sum(axis=0)/(total_statistics.sum(axis=0)+0.00001),
        linewidth=5);
plt.xticks(range(73), [node_data['交流道(起)'][S01F_list[i]] for i in range(73)], 
          rotation=90, fontsize=18);
plt.grid(True);
plt.xlim(0,72);
plt.ylim(0, 0.35);

用 pandas 來算


In [ ]:
# 將 data.Speed 展開成 DataFrame
from itertools import chain
speed_data = pandas.DataFrame.from_records(chain(*data.Speed), columns=SpeedInfo._fields )
speed_data.head(10)

In [ ]:
# 計算是否超速(完全跟前面一樣,雖然現在 info 是 pandas.Series)
def is_speeding(info):
    speedlimit = max(speedlimit_lookup[info.loc1], speedlimit_lookup[info.loc2])
    return info.speed > speedlimit + 10

# 存下所有超速的判斷
# 這樣會很慢,而且撞到 tdqm 的雷
# speed_data['超速'] = speed_data.progress_apply(is_speeding, axis=1)

In [ ]:
# 用 pandas 的方法做
speed_loc1 = speed_data.loc1.progress_apply(speedlimit_lookup.get)
speed_loc2 = speed_data.loc1.progress_apply(speedlimit_lookup.get)
speed_max = pandas.DataFrame({'1':speed_loc1, '2:':speed_loc2}).max(axis=1)
speed_data['超速'] = speed_data.speed > speed_max +10

In [ ]:
ts = speed_data.time.progress_apply(lambda t:t.hour*4+t.minute//15)

In [ ]:
groups = speed_data[['speed','超速']].groupby([speed_data.loc1, ts])

In [ ]:
statistics = groups.mean()
statistics

In [ ]:
statistics.loc['01F0005S']['超速'].plot()
plt.xticks(range(0,24*4,4), range(24));

Q

利用 speed_datastatistics 來畫其他圖表

利用 bokeh


In [ ]:
from ipywidgets import interact
import numpy as np

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
output_notebook()

In [ ]:
from bokeh.charts import Bar, show
from bokeh.models import FixedTicker
from bokeh.models.formatters import TickFormatter, String, List

class FixedTickFormatter(TickFormatter):
    """
    Class used to allow custom axis tick labels on a bokeh chart
    Extends bokeh.model.formatters.TickFormatte
    """

    JS_CODE =  """
        import {Model} from "model"
        import * as p from "core/properties"
        export class FixedTickFormatter extends Model
          type: 'FixedTickFormatter'
          
          doFormat: (ticks) ->
            labels = @labels
            return (labels[tick] ? "" for tick in ticks)
          @define {
            labels: [ p.Any ]
          }
    """

    labels = List(String, help="""
    A mapping of integer ticks values to their labels.
    """)

    __implementation__ = JS_CODE

In [ ]:
x = range(73)
y = speeding_statistics.sum(axis=0)/(total_statistics.sum(axis=0)+0.00001)
p = figure(title="超速比例", plot_height=500, 
           plot_width=800, x_range=(0,72), y_range=(0,y.max()*1.05))
r = p.line(x, y, color="#2222aa", line_width=3)
#node_data['交流道(起)'][S01F_list[i]
labels = [node_data['交流道(起)'][S01F_list[i]] for i in range(73)]
p.xaxis[0].formatter = FixedTickFormatter(labels=labels)
p.xaxis[0].ticker = FixedTicker(ticks=list(range(73)))
p.xaxis.major_label_orientation = 3.14/2

In [ ]:
from bokeh.plotting import ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool
x = range(73)
y = speeding_statistics.sum(axis=0)/(total_statistics.sum(axis=0)+0.00001)
labels = [node_data['交流道(起)'][S01F_list[i]] for i in range(73)]
datasource = ColumnDataSource(
    data = {
        'x': x,
        'y': y,
        'labels': labels,
        'ypercent': ["{:4.2f}%".format(100*i) for i in y]
    }
)
TOOLS = [BoxSelectTool(), HoverTool( tooltips=[("站", "@labels"), ("比例", "@ypercent")])]

p = figure(title="超速比例", plot_height=500, 
           plot_width=800, x_range=(0,72), y_range=(0,y.max()*1.05), tools=TOOLS)
r = p.line(x='x', y='y', color="#2222aa", line_width=3, source=datasource)
#node_data['交流道(起)'][S01F_list[i]
p.xaxis[0].formatter = FixedTickFormatter(labels=labels)
p.xaxis[0].ticker = FixedTicker(ticks=list(range(73)))
p.xaxis.major_label_orientation = 3.14/2

In [ ]:
show(p, notebook_handle=True);

In [ ]:
highway_name = speed_data.loc1.progress_apply(lambda x:x[:3])

In [ ]:
# 依照高速公路和時間區隔來分組平均
mean_highway_time = speed_data[['speed','超速']].groupby([highway_name, ts]).mean()
# 依照高速公路來分組平均
mean_highway = speed_data[['speed','超速']].groupby(highway_name).mean()

In [ ]:
mean_highway_time.index.get_level_values('loc1').unique()

In [ ]:
x = np.arange(0,24,0.25)
p = figure(title="超速比例", plot_height=720, 
           plot_width=960, x_range=(0,24), y_range=(0,0.5))
groups = mean_highway_time.groupby(mean_highway_time.index.get_level_values('loc1'))
for hw in groups.groups:    
    y = groups.get_group(hw)['超速']
    r = p.line(x, y, legend=hw, color=tuple(np.random.randint(0,255, size=3)), line_width=3, alpha=0.7)

show(p)

In [ ]:
x = np.arange(0,24,0.25)
p = figure(title="速率", plot_height=720, 
           plot_width=960, x_range=(0,24), y_range=(0,150))
groups = mean_highway_time.groupby(mean_highway_time.index.get_level_values('loc1'))
for hw in groups.groups:    
    y = groups.get_group(hw)['speed']
    r = p.line(x, y, legend=hw, color=tuple(np.random.randint(0,255, size=3)), line_width=3, alpha=0.7)
show(p)

In [ ]:


In [ ]:
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.tile_providers import STAMEN_TONER
from pyproj import Proj, transform
def trans(lng_lat):
    from_proj = Proj(init="epsg:4326")
    to_proj = Proj(init="epsg:3857")
    return transform(from_proj, to_proj, *lng_lat)
fig = figure(tools='pan, wheel_zoom', x_range=(13410000, 13590000), y_range=(2610000, 2790000))
fig.axis.visible = False
fig.add_tile(STAMEN_TONER)
lng_lat = np.mgrid[120:122:0.1, 23:25:0.1].reshape(2,-1).T
xy = np.array(list(map(trans, lng_lat)))
dots = fig.circle(xy[:,0], xy[:,1])
map_handler = show(fig, notebook_handle=True)

In [ ]:


In [ ]:
import time
import math
for i in range(100):
    dots.glyph.fill_color=['red', 'green', 'blue'][(i//5)%3]
    shift = [math.cos(i/20), math.sin(i/20)]
    xy = np.array(list(map(trans, lng_lat + shift)))
    dots.data_source.data['x'] = xy[:,0]
    dots.data_source.data['y'] = xy[:,1]
    #push_notebook(handle=map_handler)
    push_notebook()
    time.sleep(0.05)

In [ ]:
from bqplot import pyplot as plt
from bqplot import topo_load
from bqplot.interacts import panzoom

In [ ]:
mean_highway.index

In [ ]:
x = np.arange(len(mean_highway))
y = mean_highway['超速']
plt.figure()
bar = plt.bar(list(mean_highway.index), y)
plt.show()

In [ ]:
# 互動式圖表
x = np.arange(0,24,0.25)
fig= plt.figure()
groups = mean_highway_time.groupby(mean_highway_time.index.get_level_values('loc1'))
def toggle_alpha(a,b):
    a.opacities=[1.2-a.opacities[0]]
colors=['blue', 'red', 'purple', 'black', 'green']
for i,hw in enumerate(groups.groups):    
    y = groups.get_group(hw)['超速']
    pp = plt.plot(x,y, colors=[colors[i]], 
             display_legend=True, line_width=3, opacities=[0.2], labels=[hw])
    pp.on_legend_click(toggle_alpha)
plt.show()

In [ ]: