From the initial data I have prepared datasets with the following variables:

  • pathlength
  • triptime
  • average_speed
  • not_moving
  • percent_not_moving
  • average_momentarily_speed
  • min_speed
  • max_speed
  • average_acceleration
  • max_acceleration
  • min_acceleration

In [ ]:
#initialize libraries
from __future__ import division
import os
import pandas as pd
from sklearn.cluster import KMeans
from multiprocessing import Pool
import random
import time
from sklearn.svm import OneClassSVM

In [ ]:
# Shared pool of three worker processes used for loading and classifying drivers.
# NOTE(review): the pool is created before get_df/classify are defined; with the
# fork start method workers may not see functions defined afterwards -- confirm
# the cell execution order, or create the pool after the definitions.
p = Pool(processes=3)

In [ ]:
def classify(driver_data, model=None):
    """
    Mark each trip of one driver as belonging (1) or not belonging (0) to him.

    It is assumed that most of the trips belong to the same driver, so the
    larger of the two groups produced by the model is always reported as 1.

    :param driver_data: data frame with one row per trip; it must contain the
        "driver_trip" and "min_acceleration" columns, which are both excluded
        from the features handed to the model
    :param model: optional estimator exposing ``fit(X)`` and ``predict(X)``,
        where ``predict`` returns +1 for inliers and -1 for outliers; a fresh
        ``OneClassSVM()`` is used when omitted
    :return: data frame with columns "driver_trip" and "prob", where "prob"
        is 1 for trips attributed to this driver and 0 otherwise
    """
    labels = []

    if len(driver_data) > 0:
        features = driver_data.drop(["driver_trip",
                                     "min_acceleration"], axis=1)

        # The original source also carried commented-out mixture.GMM / KMeans
        # variants; only the one-class SVM path is kept here.
        if model is None:
            model = OneClassSVM()

        # The estimator has to be fitted on this driver's trips before it can
        # predict anything about them.
        model.fit(features)

        # Convert the +1 / -1 output to 1 / 0 so "prob" is a clean indicator.
        labels = [1 if p > 0 else 0 for p in model.predict(features)]

        # Majority assumption: if fewer than half of the trips were accepted,
        # swap the groups. The share uses the real trip count instead of a
        # hard-coded 200 trips per driver.
        if sum(labels) / float(len(labels)) < 0.5:
            labels = [1 - label for label in labels]

    result = pd.DataFrame()
    result["driver_trip"] = driver_data["driver_trip"]
    result["prob"] = labels
    return result

In [ ]:
def get_df(driver):
    """
    Read one driver's prepared data set.

    :param driver: name of the file inside data/training (converted with str())
    :return: data frame parsed from that CSV file
    """
    csv_path = os.path.join("data", "training", str(driver))
    return pd.read_csv(csv_path)

# Load every per-driver file found in data/training using the worker pool.
data_to_work_with = p.map(get_df, os.listdir(os.path.join("data", "training")))

print("create list of data frames with predictions")
# One prediction frame per loaded driver; Pool.map keeps the input order.
result = p.map(classify, data_to_work_with)

In [ ]:
print("concatenate data")
# Stack the per-driver prediction frames into a single table.
final_result = pd.concat(result)
print("done with predictions")

In [ ]:

In [ ]:
# Write the combined predictions to data/submission_<current time.time() value>,
# leaving out the data frame index.
submission_path = os.path.join(
    "data", "submission_{timestamp}".format(timestamp=time.time()))
final_result.to_csv(submission_path, index=False)

In [ ]: