In [4]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import pickle
from __future__ import division
import csv
from tools import *
import datetime

In [23]:
# Store the result of load_metro_dictionary(). The helper is not defined in
# this notebook; presumably it is exposed by `from tools import *` -- TODO confirm.
# NOTE(review): `dict_metro` is not referenced by any later cell visible here.
dict_metro = load_metro_dictionary()

In [28]:
# Import `os` explicitly: the imports cell never imports it, so this cell
# previously only worked if the name leaked in through `from tools import *`.
import os

# Input locations, resolved relative to the current working directory.
data_path = os.path.join(os.getcwd(), '..', 'data')
users_data_path = os.path.join(data_path, 'Users_data')

# One CSV per period (file names refer to "abril" and "septiembre" 2013).
first_period_path = os.path.join(users_data_path, 'etapas_2013_abril_allyearsids_10_100000.csv')
second_period_path = os.path.join(users_data_path, 'etapas_2013_septiembre_allyearsids_10_100000.csv')

In [29]:
# Read the first-period CSV, convert the `tiempo_subida` column to datetimes
# and order the rows by `id`, then by `tiempo_subida`.
frame = (
    pd.read_csv(first_period_path)
    .assign(tiempo_subida=lambda raw: pd.to_datetime(raw.tiempo_subida))
    .sort_values(by=['id', 'tiempo_subida'])
)
# Show the first rows as the cell output.
frame.head()


Out[29]:
tiempo_subida id x_subida y_subida tipo_transporte serviciosentidovariante tipo_dia nviaje netapa x_bajada y_bajada tiempo_bajada par_subida par_bajada zona_subida zona_bajada adulto
23 2013-04-14 06:45:44 1132106 348108.0 6289153.0 BUS T203 00R DOMINGO 1 1 346818.0 6299394.0 2013-04-14 07:07:02 T-22-205-SN-65 E-20-190-SN-40 328.0 307.0 0.0
22 2013-04-14 07:51:52 1132106 346751.0 6299389.0 BUS T502 00I DOMINGO 2 1 351363.0 6302549.0 2013-04-14 08:04:11 E-20-291-PO-20 T-15-135-PO-5 307.0 188.0 0.0
21 2013-04-14 19:56:47 1132106 351368.0 6302559.0 BUS T502 00R DOMINGO 3 1 346763.0 6299568.0 2013-04-14 20:09:11 T-15-135-OP-110 T-4-19-NS-100 188.0 55.0 0.0
20 2013-04-14 20:15:25 1132106 346713.0 6299427.0 BUS T203 00I DOMINGO 3 2 348095.0 6289148.0 2013-04-14 20:40:51 E-20-199-NS-2 T-24-205-NS-20 307.0 348.0 0.0
19 2013-04-15 21:04:59 1132106 348103.0 6289191.0 BUS T206 00R LABORAL 4 1 346844.0 6299320.0 2013-04-15 21:33:23 T-22-205-SN-65 T-20-190-SN-35 328.0 309.0 0.0

In [30]:
# Read the second-period CSV with the same preparation as the first period:
# parse `tiempo_subida` as datetimes, then sort by `id` and `tiempo_subida`.
df_id_period = (
    pd.read_csv(second_period_path)
    .assign(tiempo_subida=lambda raw: pd.to_datetime(raw.tiempo_subida))
    .sort_values(by=['id', 'tiempo_subida'])
)

In [31]:
# Reload `tpm_identification` using the Python 2 builtin `reload`, so that the
# current on-disk version of the module is used by the following cells.
# NOTE(review): the module is never imported explicitly in this notebook;
# presumably the name is exposed by `from tools import *` -- confirm.
reload(tpm_identification)


Out[31]:
<module 'tpm_identification' from 'tpm_identification.pyc'>

In [32]:
# Build the profiles from the first-period frame. The four columns are passed
# positionally: id, tiempo_subida, par_subida, par_bajada.
profiles = tpm_identification.get_spatiotemporal_profiles(
    frame['id'],
    frame['tiempo_subida'],
    frame['par_subida'],
    frame['par_bajada'],
)

In [33]:
# Build the sequences from the second-period frame. The four columns are passed
# positionally: id, tiempo_subida, par_subida, par_bajada.
sequences = tpm_identification.get_spatiotemporal_sequences(
    df_id_period['id'],
    df_id_period['tiempo_subida'],
    df_id_period['par_subida'],
    df_id_period['par_bajada'],
)

In [22]:
# Compute the identification matrix from `profiles` and `sequences` and report
# the elapsed wall-clock time in seconds.
start_time = time.time()
iden = tpm_identification.get_spatiotemporal_identification_matrix(
    profiles,
    sequences,
)
delta_time = time.time() - start_time
print(delta_time)

In [34]:
# Persist the identification matrix. The file is opened in binary mode ('wb'):
# pickle streams are byte data, and text mode is only tolerated by the ASCII
# protocol 0 on Python 2 (and not at all on Python 3).
# NOTE(review): `iden` must have been produced by the timing cell above in the
# current kernel session; the saved output of this cell shows a NameError
# because it had not been.
with open('iden_matrix_spatiotemporal.pickle', 'wb') as f:
    pickle.dump(iden, f)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-34-e1ce4f8818bf> in <module>()
      1 with open('iden_matrix_spatiotemporal.pickle','w') as f:
----> 2     pickle.dump(iden,f)

NameError: name 'iden' is not defined

In [60]:
# Build the profiles with the `_2` variant of the profile builder, from the
# same first-period columns: id, tiempo_subida, par_subida, par_bajada.
profiles_st = tpm_identification.get_spatiotemporal_profiles_2(
    frame['id'],
    frame['tiempo_subida'],
    frame['par_subida'],
    frame['par_bajada'],
)

In [61]:
# Compute the `_2` identification matrix from `profiles_st` and `sequences`
# and report the elapsed wall-clock time in seconds.
start_time = time.time()
iden_st_2 = tpm_identification.get_spatiotemporal_identification_matrix_2(
    profiles_st,
    sequences,
)
delta_time = time.time() - start_time
print(delta_time)


6095.81366611

In [62]:
# Persist the `_2` identification matrix. The file is opened in binary mode
# ('wb'): pickle streams are byte data, and text mode is only tolerated by the
# ASCII protocol 0 on Python 2 (and not at all on Python 3).
with open('iden_matrix_spatiotemporal_2.pickle', 'wb') as f:
    pickle.dump(iden_st_2, f)

In [65]:
# Evaluate `iden_st_2`: for every evaluated column take the row with the
# highest value; the column counts as correctly identified when that row index
# equals the column index.
iden_matrix = np.matrix(iden_st_2)
df_ident = pd.DataFrame(iden_matrix)

# Derive the number of evaluated columns from the matrix itself. The previous
# bound used len(profiles), but this matrix was computed from `profiles_st`,
# so the bound could disagree with the matrix dimensions.
limit = min(iden_matrix.shape)

# Column-wise best match over all rows, restricted to the first `limit` columns.
scores = np.asarray(iden_matrix)[:, :limit]
identified_indexs = list(scores.argmax(axis=0))
selected_indexs = list(scores.max(axis=0))

# Split the winning row indices into hits (row == column) and misses.
correct_indexs = [row for col, row in enumerate(identified_indexs) if row == col]
wrong_indexs = [row for col, row in enumerate(identified_indexs) if row != col]
n_identified = len(correct_indexs)

# Use a float literal so the percentage does not depend on
# `from __future__ import division`; guard against an empty matrix.
porcentaje_correcto = 100.0 * n_identified / limit if limit else 0.0
print(str(round(porcentaje_correcto, 2)) + "%")

In [68]:
# Reload `tpm_identification` again (Python 2 builtin `reload`) so the cells
# below use the current on-disk version of the module.
# NOTE(review): the module is never imported explicitly in this notebook;
# presumably the name is exposed by `from tools import *` -- confirm.
reload(tpm_identification)


Out[68]:
<module 'tpm_identification' from 'tpm_identification.py'>

In [69]:
# Compute the `_3` identification matrix from `profiles_st` and `sequences`
# and report the elapsed wall-clock time in seconds.
start_time = time.time()
iden_st_3 = tpm_identification.get_spatiotemporal_identification_matrix_3(
    profiles_st,
    sequences,
)
delta_time = time.time() - start_time
print(delta_time)


6426.14186883

In [70]:
# Persist the `_3` identification matrix. The file is opened in binary mode
# ('wb'): pickle streams are byte data, and text mode is only tolerated by the
# ASCII protocol 0 on Python 2 (and not at all on Python 3).
with open('iden_matrix_spatiotemporal_3.pickle', 'wb') as f:
    pickle.dump(iden_st_3, f)

In [71]:
# Evaluate `iden_st_3`: for every evaluated column take the row with the
# highest value; the column counts as correctly identified when that row index
# equals the column index.
iden_matrix = np.matrix(iden_st_3)
df_ident = pd.DataFrame(iden_matrix)

# Derive the number of evaluated columns from the matrix itself. The previous
# bound used len(profiles), but this matrix was computed from `profiles_st`,
# so the bound could disagree with the matrix dimensions.
limit = min(iden_matrix.shape)

# Column-wise best match over all rows, restricted to the first `limit` columns.
scores = np.asarray(iden_matrix)[:, :limit]
identified_indexs = list(scores.argmax(axis=0))
selected_indexs = list(scores.max(axis=0))

# Split the winning row indices into hits (row == column) and misses.
correct_indexs = [row for col, row in enumerate(identified_indexs) if row == col]
wrong_indexs = [row for col, row in enumerate(identified_indexs) if row != col]
n_identified = len(correct_indexs)

# Use a float literal so the percentage does not depend on
# `from __future__ import division`; guard against an empty matrix.
porcentaje_correcto = 100.0 * n_identified / limit if limit else 0.0
print(str(round(porcentaje_correcto, 2)) + "%")


24.2%

In [ ]: