In [7]:
from __future__ import division
%matplotlib inline
import matplotlib.pyplot as plt
import time
import datetime as dt
import pickle
import numpy as np
import pandas as pd
import borrador
from auxiliar_functions import *
import tfe
In [8]:
datos_abril = pd.read_csv('/home/cata/Documentos/Datois/etapas_2013_abril_allyearsids_10_100000.csv')
In [9]:
datos_abril = frame_config(datos_abril)
In [10]:
#datos_abril.drop(datos_abril.columns[[2,3,4,5,9,10,11,14,15,16]], axis=1, inplace=True)
datos_abril.head()
Out[10]:
In [11]:
datos_abril.info()
In [12]:
test_sequence_5 = datos_abril.query('id==1261466')
test_sequence_1187 = datos_abril.query('id==50264508')
test_sequence_0 = datos_abril.query('id==1132106')
test_sequence_9 = datos_abril.query('id==1355530')
test_sequence_2168 = datos_abril.query('id==59460077')
test_sequence_3696 = datos_abril.query('id==68660100')
test_sequence_4150= datos_abril.query('id==70321252')
In [13]:
test_sequence = test_sequence_0
[test_sequence_s ,test_sequence_fds] = tfe.split_sequence_by_weekdays(test_sequence)
In [14]:
shortest_activities_s = tfe.get_mean_shortest_activity_length(test_sequence_s)
longest_activities_s = tfe.get_mean_longest_activity_length(test_sequence_s)
shortest_activities_fds = tfe.get_mean_shortest_activity_length(test_sequence_fds)
longest_activities_fds = tfe.get_mean_longest_activity_length(test_sequence_fds)
In [15]:
print str(shortest_activities_s) + " , " + str(shortest_activities_fds)
print str(longest_activities_s) + " , " + str(longest_activities_fds)
In [16]:
latlong1 = (test_sequence_fds['lat_subida'].iloc[0], test_sequence_fds['long_subida'].iloc[0])
latlong2 = (test_sequence_fds['lat_bajada'].iloc[0], test_sequence_fds['long_bajada'].iloc[0])
In [17]:
tfe.get_traveled_distance(test_sequence)
Out[17]:
In [18]:
tfe.get_maximum_travel_distance(test_sequence)
Out[18]:
In [19]:
tfe.get_minimum_travel_distance(test_sequence)
Out[19]:
In [20]:
tfe.get_percentage_bus_exclusive_days(test_sequence)
Out[20]:
In [21]:
tfe.get_percentage_rail_exclusive_days(test_sequence)
Out[21]:
In [22]:
tfe.get_card_type(test_sequence)
Out[22]:
In [23]:
tfe.get_n_days_traveled(test_sequence)
Out[23]:
In [24]:
tfe.get_n_trips_per_day(test_sequence)
Out[24]:
In [25]:
tfe.get_mean_start_time_first_trip(test_sequence)
Out[25]:
In [26]:
tfe.get_mean_start_time_last_trip(test_sequence)
Out[26]:
In [27]:
tfe.get_percentage_different_last_origin(test_sequence)
Out[27]:
In [28]:
tfe.get_percentage_different_first_origin(test_sequence)
Out[28]:
In [29]:
tfe.get_n_different_locations(test_sequence)
Out[29]:
In [30]:
tfe.get_radius_of_gyration(test_sequence)
Out[30]:
In [31]:
tfe.get_radius_of_gyration(test_sequence_fds)
Out[31]:
In [32]:
tfe.get_radius_of_gyration(test_sequence_s)
Out[32]:
In [33]:
tfe.get_unc_entropy(test_sequence)
Out[33]:
In [34]:
tfe.get_random_entropy(test_sequence)
Out[34]:
In [35]:
tfe.get_frequence_regularity(test_sequence)
Out[35]:
In [36]:
tfe.get_entropy(test_sequence)
Out[36]:
In [37]:
chronology = tfe.get_chronology(test_sequence,latlong1[0],latlong1[1])
data_days_window = 1
window = data_days_window*24*60 # window length in minutes (data_days_window days)
In [38]:
tfe.get_regularity(chronology,window)
Out[38]:
In [39]:
tfe.get_latlong_points(test_sequence_0)
Out[39]:
In [40]:
tfe.get_clusters(test_sequence_0)
Out[40]:
In [41]:
tfe.get_ROIs(test_sequence_0,0.6)
Out[41]:
In [42]:
tfe.get_latlong_points(test_sequence_5)
Out[42]:
In [43]:
tfe.get_clusters(test_sequence_5)
Out[43]:
In [44]:
tfe.get_latlong_points(test_sequence_1187)
Out[44]:
In [45]:
tfe.get_clusters(test_sequence_1187)
Out[45]:
In [46]:
tfe.get_latlong_points(test_sequence_9)
Out[46]:
In [47]:
tfe.get_clusters(test_sequence_9)
Out[47]:
In [48]:
tfe.get_ROIs(test_sequence_9,0.6)
Out[48]:
In [49]:
tfe.get_latlong_points(test_sequence_2168)
Out[49]:
In [50]:
tfe.get_clusters(test_sequence_2168)
Out[50]:
In [51]:
tfe.get_ROIs(test_sequence_2168,0.6)
Out[51]:
In [52]:
chronology
Out[52]:
In [53]:
window
Out[53]:
In [54]:
chronology = tfe.get_chronology(test_sequence_2168,-33.408,-70.555597)
tfe.get_regularity(chronology,window*1)
Out[54]:
In [55]:
tfe.get_latlong_points(test_sequence_3696)
Out[55]:
In [56]:
tfe.get_clusters(test_sequence_3696)
Out[56]:
In [57]:
tfe.get_ROIs(test_sequence_3696,0.6)
Out[57]:
In [58]:
tfe.get_latlong_points(test_sequence_4150)
Out[58]:
In [59]:
tfe.get_clusters(test_sequence_4150)
Out[59]:
In [60]:
tfe.get_ROIs(test_sequence_4150,0.6)
Out[60]:
In [61]:
chronology = tfe.get_chronology(test_sequence_4150,-33.56950568,-70.5831589)
data_days_window = 1
window = data_days_window*24*60 # window length in minutes (data_days_window days)
tfe.get_regularity(chronology,window)
Out[61]:
In [62]:
# regularity not working!
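A quick probe of the failure, assuming `tfe.get_regularity` accepts any window length in minutes: sweep a few window sizes and inspect the outputs.
In [ ]:
# hypothetical sanity check: does the result change with the window size?
for days in [1, 2, 4, 8]:
    print days, tfe.get_regularity(chronology, days*24*60)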
In [63]:
test_sequences = [ test_sequence_5, test_sequence_1187, test_sequence_0, test_sequence_9, \
test_sequence_2168, test_sequence_3696, test_sequence_4150]
the_vectors = []
for test_seq in test_sequences:
the_vectors.append(borrador.get_features(test_seq))
In [64]:
counter = 0
ids = [5,1187,0,9,2168,3696,4150]
for test_seq in test_sequences:
print "Test de ", ids[counter]
a = borrador.get_features(test_seq)
assert tfe.get_mean_shortest_activity_length(test_seq) == a[0]
assert tfe.get_mean_longest_activity_length(test_seq) == a[1]
assert tfe.get_traveled_distance(test_seq) == a[2]
assert tfe.get_maximum_travel_distance(test_seq) == a[3]
assert tfe.get_minimum_travel_distance(test_seq) == a[4]
assert tfe.get_radius_of_gyration(test_seq) == a[5]
assert tfe.get_unc_entropy(test_seq) == a[6]
assert tfe.get_random_entropy(test_seq) == a[7]
assert tfe.get_percentage_different_last_origin(test_seq) == a[8]
assert tfe.get_percentage_different_first_origin(test_seq) == a[9]
assert tfe.get_card_type(test_seq) == a[10]
assert tfe.get_mean_start_time_first_trip(test_seq) == a[11]
assert tfe.get_mean_start_time_last_trip(test_seq) == a[12]
assert tfe.get_n_days_traveled(test_seq) == a[13]
assert tfe.get_n_days_traveled_before_stop(test_seq) == a[14]
assert tfe.get_frequence_regularity(test_seq) == a[15]
assert tfe.get_percentage_bus_exclusive_days(test_seq) == a[16]
assert tfe.get_percentage_rail_exclusive_days(test_seq) == a[17]
assert tfe.get_percentage_bus_trips(test_seq) == a[18]
counter += 1
In [65]:
datos_septiembre = pd.read_csv('/home/cata/Documentos/Datois/etapas_2013_septiembre_allyearsids_10_100000.csv')
In [66]:
datos_septiembre = frame_config(datos_septiembre)
In [67]:
user_ids = [1261466,50264508,1132106,1355530,59460077,68660100,70321252]
In [68]:
the_dfs = []
for i in user_ids:
    the_dfs.append(datos_septiembre.query('id == @i')) # no eval needed: query supports @-variables
In [69]:
septiembre_vectors = []
for test_seq in the_dfs:
septiembre_vectors.append(borrador.get_features(test_seq))
In [70]:
counter = 0
counteri = 0
a_matrix = np.zeros((7, 7))
for j in ids:
abril_vector = the_vectors[counter]
for i in ids:
septiembre_vector = septiembre_vectors[counteri]
dist = np.linalg.norm(np.asarray(abril_vector)-np.asarray(septiembre_vector))
a_matrix[counter,counteri] = dist
counteri += 1
counter += 1
counteri = 0
In [71]:
comparacion = pd.DataFrame(a_matrix)
comparacion.columns = ids
comparacion.index = ids
In [72]:
comparacion
Out[72]:
In [73]:
comparacion[5]
Out[73]:
In [74]:
for i in ids:
print np.argmin(comparacion[i])
In [75]:
N_FEATURES = 19
In [76]:
reload(borrador)
reload(tfe)
Out[76]:
In [77]:
datos_abril.set_index(keys=['id'], drop=False, inplace=True)
# list of unique card ids in April
ids_abril = datos_abril['id'].unique().tolist()
datos_septiembre.set_index(keys=['id'], drop=False, inplace=True)
# list of unique card ids in September
ids_septiembre = datos_septiembre['id'].unique().tolist()
In [ ]:
with open('datos_abril_sep_sequence.pickle', 'wb') as f:
pickle.dump(datos_abril, f)
pickle.dump(datos_septiembre, f)
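The frames can be restored later with two sequential `pickle.load` calls in the same order; a minimal sketch:
In [ ]:
# read back in the order the objects were dumped
with open('datos_abril_sep_sequence.pickle', 'rb') as f:
    datos_abril = pickle.load(f)
    datos_septiembre = pickle.load(f)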
In [78]:
limit = np.min((len(ids_abril),len(ids_septiembre)))
limit
Out[78]:
In [79]:
abril_vectors = np.zeros((limit, N_FEATURES))
init_time = time.time()
for i in range(limit):
an_id = ids_abril[i]
abril_sequence = datos_abril.loc[datos_abril.id==an_id]
abril_vector = borrador.get_features(abril_sequence)
abril_vectors[i] = abril_vector
delta_time = time.time()-init_time
print delta_time
In [80]:
septiembre_vectors = np.zeros((limit, N_FEATURES))
init_time = time.time()
for i in range(limit):
an_id = ids_abril[i] # use the April id order so row i is the same user in both matrices
septiembre_sequence = datos_septiembre.loc[datos_septiembre.id==an_id]
septiembre_vector = borrador.get_features(septiembre_sequence)
septiembre_vectors[i] = septiembre_vector
delta_time = time.time()-init_time
print delta_time
In [81]:
def matrixToCsv(matrix, name):
    # column order must match the feature vector returned by borrador.get_features
    features_names = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy",
                      "random_entropy","p100_diff_last_origin","p100_diff_first_origin","card_type",
                      "start_time","end_time","traveled_days","traveled_days_bs","frequence_regularity",
                      "p100_exclusive_bus_days","p100_exclusive_metro_days","p100_bus_trips"]
    feature_header = ",".join(features_names)
    np.savetxt(name, matrix, delimiter=",", header=feature_header)
In [82]:
matrixToCsv(abril_vectors,"abril_features.csv")
matrixToCsv(septiembre_vectors,"septiembre_features.csv")
In [83]:
def normalizar(vector):
    # min-max scaling to [0, 1]; the original divided by a_max instead of the range
    a_max = np.max(vector)
    a_min = np.min(vector)
    if a_max == a_min:
        return np.zeros_like(vector)
    return (vector - a_min) / (a_max - a_min)
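A quick check of the scaling above: the column [2, 4, 6] should map to [0, 0.5, 1].
In [ ]:
normalizar(np.array([2.0, 4.0, 6.0])) # expected: array([ 0. ,  0.5,  1. ])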
In [84]:
for i in range(abril_vectors.shape[1]):
abril_vectors[:,i] = normalizar(abril_vectors[:,i])
for i in range(septiembre_vectors.shape[1]):
septiembre_vectors[:,i] = normalizar(septiembre_vectors[:,i])
In [85]:
a_matrix = np.zeros((limit, limit))
init_time = time.time()
for i in range(limit):
for j in range(limit):
dist = np.linalg.norm(np.asarray(abril_vectors[i])-np.asarray(septiembre_vectors[j]))
a_matrix[i,j] = dist
delta_time = time.time()-init_time
print delta_time
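The double loop makes limit**2 Python-level iterations; scipy.spatial.distance.cdist computes the same Euclidean distance matrix in compiled code. A sketch (not timed here):
In [ ]:
from scipy.spatial.distance import cdist
# equivalent to the nested loop above
a_matrix_fast = cdist(abril_vectors, septiembre_vectors, 'euclidean')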
In [86]:
identified_indexs = [] # index of the sequence selected as the match
wrong_indexs = [] # indexes classified incorrectly
correct_indexs = [] # indexes classified correctly
selected_distance = [] # distance of each selected match
n_identified = 0
for i in range(limit):
the_index = np.argmin(a_matrix[:,i])
selected_distance.append(np.min(a_matrix[:,i]))
identified_indexs.append(the_index)
if(the_index!=i):
wrong_indexs.append(the_index)
else:
correct_indexs.append(the_index)
n_identified += 1
In [87]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [88]:
comparacion = pd.DataFrame(a_matrix)
comparacion.head()
Out[88]:
In [89]:
# Explore the features!
# Try a classifier :D
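One way to follow up on that note, sketched with scikit-learn (assuming it is installed): fit a 1-nearest-neighbour index on the April vectors and query the September vectors; the matching accuracy is the fraction of users whose nearest April vector is their own row.
In [ ]:
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=1).fit(abril_vectors)
distances, indexes = nn.kneighbors(septiembre_vectors)
accuracy = np.mean(indexes.ravel() == np.arange(limit))
print str(round(accuracy*100, 2)) + "%"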
In [90]:
with open('feature_data.pickle', 'wb') as f:
pickle.dump(abril_vectors, f)
pickle.dump(septiembre_vectors, f)
In [163]:
reload(tfe)
Out[163]:
In [164]:
abril_the_rois = []
init_time = time.time()
for i in range(limit):
#print i
an_id = ids_abril[i]
abril_sequence = datos_abril.loc[datos_abril.id==an_id]
rois = tfe.get_ROIs(abril_sequence,0.6)
abril_the_rois.append(rois)
delta_time = time.time()-init_time
print delta_time
In [165]:
septiembre_the_rois = []
init_time = time.time()
for i in range(limit):
an_id = ids_abril[i]
septiembre_sequence = datos_septiembre.loc[datos_septiembre.id==an_id]
rois = tfe.get_ROIs(septiembre_sequence,0.6)
septiembre_the_rois.append(rois)
delta_time = time.time()-init_time
print delta_time
In [166]:
with open('rois_meters_data.pickle', 'wb') as f:
pickle.dump(abril_the_rois, f)
pickle.dump(septiembre_the_rois, f)
In [167]:
p_of_being_in_rois = [rois[1] for rois in abril_the_rois]
plt.hist(p_of_being_in_rois)
plt.show()
In [168]:
p_of_being_in_rois = [rois[1] for rois in septiembre_the_rois]
plt.hist(p_of_being_in_rois)
plt.show()
In [169]:
n_of_rois = [len(rois[0]) for rois in abril_the_rois]
# center one histogram bin on each integer ROI count
d = np.diff(np.unique(n_of_rois)).min()
left_of_first_bin = min(n_of_rois) - float(d)/2
right_of_last_bin = max(n_of_rois) + float(d)/2
plt.hist(n_of_rois, np.arange(left_of_first_bin, right_of_last_bin + d, d))
plt.show()
In [170]:
n_of_rois = [len(rois[0]) for rois in septiembre_the_rois]
d = np.diff(np.unique(n_of_rois)).min()
left_of_first_bin = min(n_of_rois) - float(d)/2
right_of_last_bin = max(n_of_rois) + float(d)/2
plt.hist(n_of_rois, np.arange(left_of_first_bin, right_of_last_bin + d, d))
plt.show()
In [171]:
abril_the_rois[0]
Out[171]:
In [173]:
septiembre_the_rois[0]
Out[173]:
In [174]:
from geopy.distance import vincenty # needed here: this cell runs before the import in In [178]
vincenty((-33.405970830060198, -70.598378987909996),(-33.411775791558298,-70.604179649681797)).meters
Out[174]:
In [178]:
from geopy.distance import vincenty
def share_rois(rois_a, rois_b):
    # Count ROI pairs from the two sets that lie within 500 m of each other;
    # for the remaining pairs, track the minimum distance between non-shared ROIs.
    shared = 0
    rois = [rois_a, rois_b]
    index = np.argmin([len(rois_a), len(rois_b)]) # iterate over the smaller set
    other_index = abs(index - 1)
    min_distance = -1
    for i in range(len(rois[index])):
        an_a_roi = rois[index][i]
        lat_a_roi = an_a_roi['lat']
        long_a_roi = an_a_roi['long']
        for j in range(len(rois[other_index])):
            an_b_roi = rois[other_index][j]
            lat_b_roi = an_b_roi['lat']
            long_b_roi = an_b_roi['long']
            a_distance = vincenty((lat_a_roi, long_a_roi), (lat_b_roi, long_b_roi)).meters
            if a_distance < 500:
                shared += 1
            elif min_distance == -1 or min_distance > a_distance:
                min_distance = a_distance
    return [shared, min_distance]
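A toy usage example with two hypothetical ROI lists (dicts with 'lat'/'long' keys, as the function expects): the first pair is a few metres apart and counts as shared, while the second ROI of rois_b is roughly 840 m away and sets min_distance.
In [ ]:
rois_a = [{'lat': -33.4059, 'long': -70.5983}]
rois_b = [{'lat': -33.4060, 'long': -70.5984}, {'lat': -33.4117, 'long': -70.6041}]
share_rois(rois_a, rois_b) # expected: [1, ~840.0]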
In [201]:
a_matrix = np.ones((limit, limit)) * -1 # -1 marks pairs that share no ROI
init_time = time.time()
shared = []
for i in range(limit):
an_id = ids_abril[i]
rois_abril = abril_the_rois[i]
for j in range(limit):
rois_septiembre = septiembre_the_rois[j]
share_RoIs,min_distance_not_shared = share_rois(rois_abril[0],rois_septiembre[0])
shared.append(share_RoIs)
if share_RoIs > 0:
abril_sequence = abril_vectors[i]
septiembre_sequence = septiembre_vectors[j]
dist = np.linalg.norm(np.asarray(abril_sequence)-np.asarray(septiembre_sequence))
a_matrix[i,j] = -dist
delta_time = time.time() - init_time
print delta_time
In [203]:
5818/60 # elapsed seconds to minutes
Out[203]:
In [179]:
a_matrix = np.ones((limit, limit)) * -1
init_time = time.time()
shared = []
min_not_shared = []
for i in range(limit):
an_id = ids_abril[i]
rois_abril = abril_the_rois[i]
rois_septiembre = septiembre_the_rois[i]
share_RoIs,not_shared_roi = share_rois(rois_abril[0],rois_septiembre[0])
shared.append(share_RoIs)
min_not_shared.append(not_shared_roi)
delta_time = time.time() - init_time
print delta_time
In [182]:
len(min_not_shared)
Out[182]:
In [200]:
a = pd.DataFrame(np.sort(min_not_shared))
a[800:].head(40)
Out[200]:
In [187]:
plt.plot(np.sort(min_not_shared))
Out[187]:
In [156]:
a_matrix = np.ones((limit, limit)) * -1
init_time = time.time()
shared = []
for i in range(limit):
an_id = ids_abril[i]
rois_abril = abril_the_rois[i]
rois_septiembre = septiembre_the_rois[i]
share_RoIs, _ = share_rois(rois_abril[0], rois_septiembre[0]) # keep only the shared count here
shared.append(share_RoIs)
abril_sequence = abril_vectors[i]
septiembre_sequence = septiembre_vectors[i]
dist = np.linalg.norm(np.asarray(abril_sequence)-np.asarray(septiembre_sequence))
delta_time = time.time() - init_time
print delta_time
In [151]:
delta_time/60
Out[151]:
In [162]:
print len(shared)
plt.hist(shared,[0,1,2,3,4])
Out[162]:
In [204]:
with open('distance_meters_data.pickle', 'wb') as f:
pickle.dump(a_matrix, f)
In [205]:
identified_indexs = [] # index of the sequence selected as the match
wrong_indexs = [] # indexes classified incorrectly
correct_indexs = [] # indexes classified correctly
selected_distance = [] # distance of each selected match
n_identified = 0
for i in range(limit):
the_index = np.argmax(a_matrix[:,i]) # argmax over -dist: nearest candidate among ROI-sharing pairs
selected_distance.append(np.max(a_matrix[:,i]))
identified_indexs.append(the_index)
if(the_index!=i):
wrong_indexs.append(the_index)
else:
correct_indexs.append(the_index)
n_identified += 1
In [206]:
shared_rois_2_month = 0
changed_behaviour = []
for i in range(limit):
if a_matrix[i,i] != -1:
shared_rois_2_month += 1
else:
changed_behaviour.append(i)
In [207]:
print str(round(shared_rois_2_month*100/limit,2)) + "%"
In [208]:
changed_behaviour
Out[208]:
In [120]:
len(changed_behaviour)
Out[120]:
In [114]:
ids_abril[7]
Out[114]:
In [115]:
datos_abril.loc[datos_abril.id==ids_abril[7]][['par_subida']]
Out[115]:
In [116]:
datos_septiembre.loc[datos_septiembre.id==ids_abril[7]][['par_subida']]
Out[116]:
In [209]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [210]:
shared[0]
Out[210]:
In [118]:
plt.hist(selected_distance)
Out[118]:
In [119]:
plt.hist(correct_indexs)
In [123]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
In [125]:
latlong_points = tfe.get_latlong_points(test_sequence_0)
latlong_points[0]
Out[125]:
In [137]:
Z_euclidean = linkage(latlong_points[0],'ward')
clusters_euclidean = fcluster(Z_euclidean, 0.02, criterion='distance') # threshold in coordinate degrees
In [138]:
meters_distance = pdist(latlong_points[0],lambda x,y: vincenty(x,y).meters)
In [145]:
Z_meters = linkage(meters_distance, 'weighted') # reuse the condensed meters distance matrix from above
clusters_meters = fcluster(Z_meters,500,criterion='distance')
In [146]:
clusters_euclidean
Out[146]:
In [147]:
clusters_meters
Out[147]:
In [149]:
m = pd.DataFrame(Z_euclidean)
m
Out[149]:
In [150]:
n = pd.DataFrame(Z_meters)
n
Out[150]:
In [ ]: