For each driver we have 200 trips, each stored in a separate file. A trip is described as a sequence of x and y coordinates of the car, recorded once per second. Some of these trips were not actually made by this driver. The goal is to predict the probability that a particular trip belongs to the driver.
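For example, a single trip should load as a two-column frame of coordinates, one row per second (the directory layout below matches the one assumed later in this notebook):

import os
import pandas as pd

#illustrative: load the first trip of driver 1 and inspect it
trip = pd.read_csv(os.path.join("data", "train", "drivers", "1", "1.csv"))
print(trip.head())       # two columns, x and y, one row per second
print(len(trip.index))   # trip duration in seconds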
In [33]:
#import libraries
from __future__ import division
import pandas as pd
import os
import sqlite3
import math
import seaborn as sns
import matplotlib.pyplot as plt
import random
import numpy as np
from numpy.random import randn
In [20]:
%matplotlib inline
random.seed(666)
In [4]:
def derivative(x_list, y_list):
    '''
    Helper function that finds the derivative of y_list with respect to x_list,
    using the five-point central-difference stencil with unit step, so the
    first and last two points are dropped from the returned axis.
    '''
    result = []
    for x in range(2, len(x_list) - 2):
        result += [(-y_list[x + 2] + 8 * y_list[x + 1] - 8 * y_list[x - 1] + y_list[x - 2]) / 12]
    return x_list[2:-2], result
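A quick sanity check of the stencil on a known function: the derivative of y = t**2 should come back as 2*t on the trimmed time axis.

#sanity check (illustrative only): d(t^2)/dt should be 2*t
t = range(10)
y = [ti ** 2 for ti in t]
t_trimmed, dy = derivative(t, y)
print(t_trimmed)  # [2, 3, 4, 5, 6, 7]
print(dy)         # [4.0, 6.0, 8.0, 10.0, 12.0, 14.0]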
In [5]:
def telematics(trip):
    '''
    @trip - dataframe with a trip
    Returns a one-row dataframe of summary features for the trip.
    '''
    x_list = trip['x'].values
    y_list = trip['y'].values
    trip_time = len(trip.index)
    path = 0
    not_moving = 0
    t_list = range(trip_time)
    #velocity and acceleration components via numerical differentiation
    tx, v_x = derivative(t_list, x_list)
    tx, v_y = derivative(t_list, y_list)
    ttx, a_x = derivative(tx, v_x)
    ttx, a_y = derivative(tx, v_y)
    #instantaneous speed and acceleration magnitudes
    list_v = [math.sqrt(vx ** 2 + vy ** 2) for vx, vy in zip(v_x, v_y)]
    list_a = [math.sqrt(ax ** 2 + ay ** 2) for ax, ay in zip(a_x, a_y)]
    #total path length and number of seconds spent standing still
    for t in range(1, trip_time):
        dL = math.sqrt((y_list[t] - y_list[t - 1]) ** 2 + (x_list[t] - x_list[t - 1]) ** 2)
        path += dL
        if dL == 0:
            not_moving += 1
    average_speed = path / trip_time
    result = pd.DataFrame()
    result['pathlength'] = [path]
    result['triptime'] = [trip_time]
    result['average_speed'] = [average_speed]
    result['not_moving'] = [not_moving]
    result['percent_not_moving'] = [not_moving / trip_time]
    result['average_momentarily_speed'] = [np.mean(list_v)]
    result['max_speed'] = [max(list_v)]
    result['min_speed'] = [min(list_v)]
    result['average_acceleration'] = [np.mean(list_a)]
    result['max_acceleration'] = [max(list_a)]
    result['min_acceleration'] = [min(list_a)]
    return result
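A small synthetic check of the extractor: a car moving at a constant one unit per second along the x axis should give a path length close to the trip duration, a top speed near 1, and near-zero accelerations.

#illustrative check on a synthetic constant-speed trip
fake_trip = pd.DataFrame({'x': np.arange(60, dtype=float), 'y': np.zeros(60)})
features = telematics(fake_trip)
print(features[['pathlength', 'triptime', 'average_speed', 'max_speed']])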
In [14]:
#Let's get the trips for one driver and look at which parameters may be relevant
driver_path = os.path.join("data", "train", "drivers", "1")
#I will assume 200 trips for every driver
trips = range(1, 201)
def trip_list(trip):
    return pd.read_csv(os.path.join(driver_path, str(trip) + ".csv"))
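The 200-trips assumption can be checked directly against the directory listing:

#count the trip files for this driver to confirm the assumption above
n_trips = len([f for f in os.listdir(driver_path) if f.endswith(".csv")])
print(n_trips)  # expected to be 200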
In [15]:
driver_data = map(telematics, map(trip_list, trips))
In [16]:
driver_data = pd.concat(driver_data)
In [17]:
driver_data.info()
In [39]:
plt.hist(driver_data["average_momentarily_speed"], 10);
In [59]:
sns.kdeplot(driver_data["max_speed"], shade=True)
In [43]:
with sns.axes_style("white"):
sns.jointplot("pathlength", "triptime", driver_data, kind="kde");
In [61]:
with sns.axes_style("white"):
sns.jointplot("max_acceleration", "triptime", driver_data, kind="kde");
In [45]:
with sns.axes_style("white"):
sns.jointplot("average_acceleration", "triptime", driver_data, kind="kde");
I rather like the plots of triptime vs. pathlength. The two should be correlated, but I can see some clearly separated islands, which will let me try out clustering algorithms.
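As a rough first pass at that idea, a density-based clustering of the two (standardized) features would show whether those islands come out as separate clusters; the eps and min_samples values below are just the library defaults, not tuned for this data.

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

#illustrative only: cluster trips on the two correlated features
X = StandardScaler().fit_transform(driver_data[["triptime", "pathlength"]].values)
labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(X)
print(np.unique(labels))  # -1 marks points DBSCAN treats as noise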
In [46]:
from sklearn.svm import OneClassSVM
In [75]:
cv = OneClassSVM()
In [102]:
df = driver_data[["triptime", "pathlength", "max_speed", "max_acceleration", "average_acceleration"]]
In [103]:
fit = cv.fit(df.values)
In [104]:
prediction = cv.predict(df.values)
In [105]:
#map the {-1, +1} SVM labels to {0, 1}
print (prediction + 1) / 2
In [106]:
#fraction of the 200 trips that the one-class SVM labels as inliers
print sum(prediction + 1) / 400
The percentage of trips not made by the driver should be small, so 3, 4 and 5 are worth checking.
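If that refers to a few percent of outliers, one way to act on it is through the nu parameter of OneClassSVM, which is an upper bound on the fraction of training points treated as outliers; a sketch of such a sweep (values 0.03-0.05 are only an assumption about the note above):

#illustrative sweep over small nu values
for nu in [0.03, 0.04, 0.05]:
    model = OneClassSVM(nu=nu)
    model.fit(df.values)
    labels = model.predict(df.values)
    print((nu, np.mean(labels == -1)))  # fraction of trips flagged as outliers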
In [ ]: