In [16]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from bs4 import BeautifulSoup
import pandas as pd
from os import listdir
from os.path import isfile, join
import xml.etree.ElementTree as ET
import codecs
import numpy as np
from dateutil import parser
import scipy as sp
In [3]:
# Transportation Mode (TM) Data
FOLDER_DATA_TM = 'data/gpx/'
FOLDER_DATA_BUS = 'data/gpx/bus/'
FOLDER_DATA_WALK = 'data/gpx/wk/'
FOLDER_DATA_CAR = 'data/gpx/car/'
Read in data from the three folders. Each data file contains a set of trkpt (tracking point) elements, and each tracking point contains latitude, longitude, elevation and timestamp information.
The sampling rate varies from file to file, and varies within a file as well, so the time between consecutive trkpts is not necessarily the same. I found that for some traces (each file contains one trace), the sampling rate near the end of the trace is sometimes quite low, because people sometimes stop moving.
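For reference, a single trkpt looks roughly like the made-up snippet below; BeautifulSoup can pull out the coordinates and the timestamp, which is exactly what the loading loop further down does.
# A made-up trkpt for illustration; a real file contains many of these per trace.
sample = """
<trkpt lat="40.0150" lon="-105.2705">
  <ele>1624.0</ele>
  <time>2011-06-18T10:56:02Z</time>
</trkpt>
"""
pt = BeautifulSoup(sample).find("trkpt")
print pt["lat"], pt["lon"], pt.time.string   # 40.0150 -105.2705 2011-06-18T10:56:02Z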
In [4]:
"""
READ IN ALL DATA FILE PATHS
"""
files = dict()
files["bus"] = [f for f in listdir(FOLDER_DATA_BUS) if isfile(join(FOLDER_DATA_BUS, f))]
files["walk"] = [f for f in listdir(FOLDER_DATA_WALK) if isfile(join(FOLDER_DATA_WALK, f))]
files["car"] = [f for f in listdir(FOLDER_DATA_CAR) if isfile(join(FOLDER_DATA_CAR, f))]
In [4]:
# tm = transportation mode
df = pd.DataFrame(columns=("trace_id", "ts", "lat", "lon", "tm"))
In [5]:
for tm in ["bus", "walk", "car"]:
    if tm == "walk":
        folder_root = FOLDER_DATA_WALK
    elif tm == "car":
        folder_root = FOLDER_DATA_CAR
    elif tm == "bus":
        folder_root = FOLDER_DATA_BUS
    for f_name in files[tm]:
        f = codecs.open(folder_root + f_name, 'r', encoding='utf8')
        print f_name
        try:
            # Read in tracking points, each containing latitude, longitude and a timestamp
            soup = BeautifulSoup(f)
            trkpts = soup.find_all('trkpt')
            n, d = df.shape
            for trkpt in trkpts:
                trace_id = f_name.lower().replace(" ", "_")
                # Append each tracking point as a new row at the next free integer index
                df.loc[n] = [trace_id, trkpt.time.string, trkpt["lat"], trkpt["lon"], tm]
                n = n + 1
        except:
            # print trace_id
            print f_name
        f.close()
In [8]:
df.to_csv(FOLDER_DATA_TM+"raw_tm_dataframe~.csv")
In [414]:
df = pd.read_csv(FOLDER_DATA_TM+"raw_tm_dataframe~.csv")
In [420]:
len(df)
Out[420]:
There are three major units in this transportation dataset.
Traces and tracking points were introduced above, so here I will only talk about segments.
In this dataset, an individual manually recorded each trace; each trace has a fairly clear start and end and contains only one type of transportation mode. (Transitions between transportation modes are therefore ignored, which is actually not a great idea.)
There are multiple ways to group the tracking points in each trace into segments in order to compute the required features. For example, we could group the points by fixed time intervals, by dynamic time intervals based on the user's speed, or by distance or by number of points.
The code below segments the data by time, with the duration set to 120 seconds. The segmentation is done by calculating the time lapse from the beginning of a trace to a specific tracking point; the segment index then follows directly from that time lapse.
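As a quick illustration of the mapping from time lapse to segment index (the trace id and numbers below are made up): a point 250 seconds into a trace lands in segment 2.
SEGMENTATION_DURATION = 120
time_lapse = 250.0
seg_id = int(time_lapse / SEGMENTATION_DURATION)    # 2
seg_name = "some_trace.gpx__" + str(seg_id)         # "some_trace.gpx__2"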
In [7]:
"""
CREATE SEGMENTATION
"""
SEGMENTATION_DURATION = 120
def get_time_lapse(group):
    trace_id = group.iloc[0]["trace_id"]
    starttime = parser.parse(group.iloc[0]["ts"])
    group["time_lapse"] = group["ts"].apply(lambda x: (parser.parse(x) - starttime).total_seconds())
    group["seg_name"] = group["time_lapse"].apply(lambda x: trace_id + "__" + str(int(x/SEGMENTATION_DURATION)))
    group["seg_id"] = group["time_lapse"].apply(lambda x: int(x/SEGMENTATION_DURATION))
    return group
df = df.groupby("trace_id").apply(get_time_lapse)
In [9]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    # 6367 km is the radius of the Earth
    km = 6367 * c
    return km
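A quick sanity check with made-up coordinates, two points in the same city roughly a kilometre apart:
print haversine(-122.4194, 37.7749, -122.4313, 37.7739)   # ~1.05 (km)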
In [10]:
"""
Get distance and velocity
"""
for index, row in df.iterrows():
    try:
        cur_trace = df.ix[index, "trace_id"]
        next_trace = df.ix[index+1, "trace_id"]
        if cur_trace == next_trace:
            # distance
            lat1, lon1 = df.ix[index, ["lat", "lon"]]
            lat2, lon2 = df.ix[index+1, ["lat", "lon"]]
            dist = haversine(float(lon1), float(lat1), float(lon2), float(lat2)) * 1000
            df.ix[index, "dist_m"] = dist
            # duration
            t1 = df.ix[index, "time_lapse"]
            t2 = df.ix[index+1, "time_lapse"]
            dur = t2 - t1
            # speed m/s
            df.ix[index, "speed"] = dist/dur
        else:
            df.ix[index, "dist_m"] = None
            df.ix[index, "speed"] = None
    except:
        df.ix[index, "dist_m"] = None
        df.ix[index, "speed"] = None
Since computing acceleration requires the speed of the following data point, acceleration has to be extracted in a separate pass after all speeds have been computed above.
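The acceleration at a point is simply the finite difference of speed over the time gap to the next point; a toy example with made-up values:
v1, v2 = 3.0, 5.0      # speed (m/s) at two consecutive points
t1, t2 = 10.0, 14.0    # time lapse (s) at those points
acce = (v2 - v1) / (t2 - t1)   # 0.5 m/s^2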
In [11]:
"""
Get acceleration
"""
for index, row in df.iterrows():
    try:
        cur_trace = df.ix[index, "trace_id"]
        next_trace = df.ix[index+1, "trace_id"]
        if cur_trace == next_trace:
            # acceleration = change in speed over change in time
            v1 = df.ix[index, "speed"]
            v2 = df.ix[index+1, "speed"]
            t1 = df.ix[index, "time_lapse"]
            t2 = df.ix[index+1, "time_lapse"]
            dur = t2 - t1
            acce = (v2 - v1)/dur
            df.ix[index, "acce"] = acce
        else:
            df.ix[index, "acce"] = None
    except:
        df.ix[index, "acce"] = None
In [12]:
df.to_csv(FOLDER_DATA_TM+"tm_dataframe~.csv")
In [13]:
df.dropna(subset=["speed", "dist_m", "acce"], inplace=True)
Pandas treats the speed, distance and acceleration columns as objects; however, we want them to be floats.
In [14]:
df[["speed", "dist_m", "acce"]] = df[["speed", "dist_m", "acce"]].astype(float)
df_seg = df.groupby(["trace_id", "seg_id"])
Now that we have the distance between consecutive tracking points, as well as speed and acceleration, we can compute features for segments (i.e., groups of tracking points).
These features include the maximum speed within a segment, the mean and variance of speed, and the corresponding features for acceleration.
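As a toy illustration of the "mean of the top-3 speeds" feature used below (numbers made up):
speeds = [2.1, 9.5, 3.0, 8.2, 7.7]
print sum(sorted(speeds, reverse=True)[:3]) / 3.0   # (9.5 + 8.2 + 7.7) / 3 ~ 8.47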
In [17]:
"""
SPEED RELATED FEATURES
"""
def get_top3_speed(data):
    data.sort(ascending=False)
    return data.iloc[0:3].mean()

df_seg = df.groupby(["trace_id", "seg_id"])["speed"].agg({"max_speed": np.max,
                                                          "avg_speed": np.mean,
                                                          "avg_top3_speed": get_top3_speed,
                                                          "var_speed": np.var})
df_seg
Out[17]:
In [18]:
def get_top3_acce(data):
    data.sort(ascending=False)
    return data.iloc[0:3].mean()

df_seg_acce = df.groupby(["trace_id", "seg_id"])["acce"].agg({"avg_acce": np.mean,
                                                              "top_acce": np.max,
                                                              "avg_top3_acce": get_top3_acce,
                                                              "var_acce": np.var})
df_seg_acce
Out[18]:
In [19]:
def get_tm(data):
    return data.iloc[0]

df_seg_tm = df.groupby(["trace_id", "seg_id"])["tm"].agg({"tm": get_tm})
In [20]:
df_seg = df_seg.join(df_seg_acce)
df_seg = df_seg.join(df_seg_tm)
In [21]:
df_seg.dropna(inplace=True)
In [22]:
df_seg["tm"].value_counts()
Out[22]:
In [71]:
N_BINS_MAX_SPEED = 6
for col in df_seg.columns:
    try:
        field_name = "d_" + col
        df_seg[field_name] = pd.qcut(df_seg[col], N_BINS_MAX_SPEED, labels=[x+1 for x in range(N_BINS_MAX_SPEED)])
    except:
        continue
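pd.qcut assigns each value to one of N quantile-based bins, so every bin receives roughly the same number of segments. A toy example with made-up values and three bins:
toy = pd.Series([0.5, 1.2, 2.0, 3.5, 7.0, 12.0])
print pd.qcut(toy, 3, labels=[1, 2, 3])
# lowest third -> 1, middle third -> 2, highest third -> 3 (two values per bin here)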
In [72]:
def get_d_tm(tm):
    tm_to_d = {
        "walk": 1,
        "car": 2,
        "bus": 3
    }
    return tm_to_d[tm]

df_seg["d_tm"] = df_seg["tm"].apply(get_d_tm)
In [396]:
df_seg
Out[396]:
In [74]:
from sklearn_pandas import DataFrameMapper
In [75]:
mapper = DataFrameMapper([
("d_tm", None),
("d_avg_speed", None),
("d_var_speed", None),
('d_max_speed', None),
('d_avg_top3_speed', None),
('d_top_acce', None),
('d_var_acce', None),
('d_avg_top3_acce', None),
('d_avg_acce', None)
])
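The None transformer in each tuple means the discretized column is passed through unchanged, and fit_transform returns a plain numpy array with the columns stacked in the order listed above. A minimal check, assuming the cells above have run:
X_sample = mapper.fit_transform(df_seg.head())
print X_sample.shape   # expected: (5, 9)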
In [76]:
n_feature = len(mapper.features)
In [77]:
import random
In [397]:
"""
RANDOM SAMPLING
"""
rows = random.sample(df_seg.index.levels[0].values, len(df_seg.index.levels[0])/2)
df_train = df_seg.ix[rows]
# Originally I used df_seg.drop(rows); however, for some reason drop did not
# actually drop the rows with those index values.
rows_test = set(df_seg.index.levels[0].values) - set(rows)
df_test = df_seg.ix[rows_test]
In [398]:
df_train["tm"].value_counts()
Out[398]:
In [399]:
df_test["tm"].value_counts()
Out[399]:
In [81]:
from scipy import io
In [82]:
for dataset_type in ["training", "testing"]:
    if dataset_type == "training":
        df_temp = df_train.copy()
    else:
        df_temp = df_test.copy()
    output_matrix = np.zeros(shape=(n_feature, 0))
    for index, trace_id in enumerate(df_temp.index.levels[0].values):
        trace_matrix = mapper.fit_transform(df_temp.ix[trace_id])
        # I don't know why some indices from the training set still show up in the testing set
        n, d = trace_matrix.shape
        if n == 0:
            print dataset_type, trace_id
        if index == len(df_temp.index.levels[0].values) - 1:
            output_matrix = np.concatenate((output_matrix, trace_matrix.T), axis=1)
        else:
            trace_matrix = np.concatenate((trace_matrix.T, np.zeros(shape=(n_feature, 1))), axis=1)
            output_matrix = np.concatenate((output_matrix, trace_matrix), axis=1)
    sp.io.savemat(dataset_type, mdict={'X': output_matrix}, appendmat=True)
In [83]:
df_train.to_csv("")
Out[83]:
In [317]:
df_temp.ix["2011-06-18_1056_dr.gpx"]
Out[317]:
In [319]:
df_seg.ix["2011-06-18_1056_dr.gpx"]
Out[319]:
In [404]:
df_train.to_csv("transportation_train.csv", index=False)
In [405]:
df_test.to_csv("transportation_test.csv", index=False)
In [409]:
len(df_train)
Out[409]:
In [307]:
output_matrix
Out[307]:
In [68]:
X = sp.io.loadmat("training.mat")
In [403]:
df_
Out[403]: