From the initial data I have prepared datasets with the following variables (a sketch of how such features could be extracted follows the list):

  • pathlength
  • triptime
  • average_speed
  • not_moving
  • percent_not_moving
  • average_momentarily_speed
  • min_speed
  • max_speed
  • average_acceleration
  • max_acceleration
  • min_acceleration

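A rough sketch of how such features could be computed, assuming each raw trip is a CSV with x and y coordinates sampled once per second (the raw format and the not-moving threshold below are assumptions, adjust them to the actual data):

In [ ]:
import numpy as np
import pandas as pd

def extract_features(trip, not_moving_threshold=1.0):
    # trip: data frame with "x" and "y" columns, one row per second (assumed format)
    dx = np.diff(trip["x"].values)
    dy = np.diff(trip["y"].values)
    step = np.sqrt(dx ** 2 + dy ** 2)  # distance covered in each second ~ momentary speed
    accel = np.diff(step)              # change of speed between consecutive seconds
    return {"pathlength": step.sum(),
            "triptime": len(trip),
            "average_speed": step.sum() / len(trip),
            "not_moving": (step < not_moving_threshold).sum(),
            "percent_not_moving": (step < not_moving_threshold).mean(),
            "average_momentarily_speed": step.mean(),
            "min_speed": step.min(),
            "max_speed": step.max(),
            "average_acceleration": accel.mean(),
            "max_acceleration": accel.max(),
            "min_acceleration": accel.min()}
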
In [ ]:
#initialize libraries
from __future__ import division
import os
import pandas as pd
from sklearn.cluster import KMeans
from multiprocessing import Pool
import random
import time
from sklearn.svm import OneClassSVM

In [ ]:
random.seed(666)

In [ ]:
def classify(driver_data):
    '''
    It is assumed that most of the trips belong to the same driver.

    :param driver_data: data frame with all trips of the driver we need to analyze
    :return: data frame with the probability that each trip belongs to this particular driver
    '''

    # gmm = mixture.GMM(n_components=2)
    # gmm = KMeans(n_clusters=2)
    cv = OneClassSVM()
    data = driver_data.drop(["driver_trip", 
                             "driver", 
                             "trip",       
                             "average_speed",
                             "not_moving",
                             "percent_not_moving",
                             "average_momentarily_speed",
                             "max_speed",
                             "min_speed",                             
                             "average_acceleration",
                             "max_acceleration",
                             "min_acceleration"], axis=1)
    cv.fit(data)
    # OneClassSVM labels inliers as +1 and outliers as -1; map them to 1/0
    predictions = [(p + 1) / 2 for p in cv.predict(data).tolist()]

    # If fewer than half of the trips come out as inliers, the labels are most likely
    # flipped, so invert them. Here I hardcode that we have 200 trips per driver.
    if sum(predictions) / 200 < 0.5:
        predictions = [1 - p for p in predictions]

    result = pd.DataFrame()
    result["driver_trip"] = driver_data["driver_trip"]    
    result["prob"] = predictions
    return result

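A quick sanity check of classify on synthetic data; the values below are made up only to exercise the function, real input comes from the prepared per-driver CSVs:

In [ ]:
import numpy as np

n_trips = 200  # same number of trips per driver as assumed in classify
toy = pd.DataFrame({"driver_trip": ["1_{n}".format(n=i + 1) for i in range(n_trips)],
                    "driver": [1] * n_trips,
                    "trip": range(1, n_trips + 1),
                    "pathlength": np.random.normal(10000, 2000, n_trips),
                    "triptime": np.random.normal(900, 100, n_trips),
                    "average_speed": np.random.normal(11, 2, n_trips),
                    "not_moving": np.random.randint(0, 100, n_trips),
                    "percent_not_moving": np.random.uniform(0, 0.2, n_trips),
                    "average_momentarily_speed": np.random.normal(12, 2, n_trips),
                    "min_speed": np.zeros(n_trips),
                    "max_speed": np.random.normal(30, 5, n_trips),
                    "average_acceleration": np.random.normal(0, 0.1, n_trips),
                    "max_acceleration": np.random.normal(3, 0.5, n_trips),
                    "min_acceleration": np.random.normal(-3, 0.5, n_trips)})
print classify(toy).head()
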
In [ ]:
def get_df(driver):
    return pd.read_csv(os.path.join("data", "training", str(driver)))


# Create the worker pool only after classify and get_df are defined, so that the
# forked workers can resolve these functions when the tasks are unpickled
p = Pool(3)

data_to_work_with = p.map(get_df, os.listdir(os.path.join("data", "training")))

print "create list of data frames with predictions"
result = p.map(classify, data_to_work_with)

In [ ]:
print "concatenate data"
final_result = pd.concat(result)
print "done with predictions"

In [ ]:
print final_result.info()

In [ ]:
final_result.to_csv(os.path.join("data", "submission_{timestamp}".format(timestamp=time.time())), index=False)
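
A short check of the submission before uploading, assuming every driver contributed exactly 200 trips and prob should end up as 0 or 1:

In [ ]:
n_drivers = len(os.listdir(os.path.join("data", "training")))
print "rows:", len(final_result), "expected:", n_drivers * 200
print "unique prob values:", sorted(final_result["prob"].unique())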
