In [8]:
%matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
import time
import datetime as dt
import pickle
import numpy as np
import borrador
from auxiliar_functions import *
import tfe
from geopy.distance import vincenty
from itertools import chain, combinations
In [9]:
# Filters the columns of a feature matrix according to a list of feature names
# filter_features: matrix list -> matrix
def filter_features(vector, selected_features):
    selected = []
    for i in range(len(selected_features)):
        selected.append(features_dict[selected_features[i]])
    return vector[:, selected]
In [10]:
# Counts how many RoIs two groups of RoIs share (pairs closer than 500 m) and returns
# the minimum distance among the pairs that were not close enough to count as shared
# share_rois: list(dict) list(dict) -> [int, int]
def share_rois(rois_a, rois_b):
    shared = 0
    rois = [rois_a, rois_b]
    # iterate over the smaller group and compare against the larger one
    index = np.argmin([len(rois_a), len(rois_b)])
    other_index = abs(index - 1)
    min_distance = -1
    for i in range(len(rois[index])):
        an_a_roi = rois[index][i]
        lat_a_roi = an_a_roi['lat']
        long_a_roi = an_a_roi['long']
        for j in range(len(rois[other_index])):
            an_b_roi = rois[other_index][j]
            lat_b_roi = an_b_roi['lat']
            long_b_roi = an_b_roi['long']
            a_distance = vincenty((lat_a_roi, long_a_roi), (lat_b_roi, long_b_roi)).meters
            if a_distance < 500:
                shared += 1
            elif min_distance == -1 or min_distance > a_distance:
                min_distance = a_distance
    return [shared, min_distance]
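In [ ]:
# Hedged sanity check of share_rois on two hypothetical ROI groups around
# central Santiago; the coordinates are illustrative, not taken from the data.
roi_group_a = [{'lat': -33.4372, 'long': -70.6506},
               {'lat': -33.4569, 'long': -70.6483}]
roi_group_b = [{'lat': -33.4378, 'long': -70.6510},  # ~80 m from the first ROI of group a
               {'lat': -33.4000, 'long': -70.6000}]
# Expected: one pair closer than 500 m counts as shared, and the minimum
# distance (in meters) among the remaining pairs is returned alongside it.
share_rois(roi_group_a, roi_group_b)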
In [11]:
# Counts how many RoIs each pair of RoI groups from two temporal slices (ct) shares
# get_shared_rois: list(list(dict)) list(list(dict)) -> [[int]] list(int)
def get_shared_rois(rois_ct1, rois_ct2, limit):
    init_time = time.time()
    shared = np.ones((limit, limit)) * -1
    min_distance = []
    min_distance_not_shared = -1
    for i in range(limit):
        rois_i = rois_ct1[i]
        for j in range(limit):
            rois_j = rois_ct2[j]
            share_RoIs, min_distance_not_shared = share_rois(rois_i[0], rois_j[0])
            if i == j:
                min_distance.append(min_distance_not_shared)
                min_distance_not_shared = -1
            shared[i, j] = share_RoIs
    delta_time = time.time() - init_time
    print delta_time
    return [shared, min_distance]
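In [ ]:
# Minimal illustration of get_shared_rois, reusing the hypothetical ROI groups
# from the share_rois example above; wrapping each group in a list mirrors the
# rois_i[0] indexing used inside the function (an assumption about the pickled layout).
toy_ct1 = [[roi_group_a], [roi_group_b]]
toy_ct2 = [[roi_group_a], [roi_group_b]]
toy_shared, toy_min_distance = get_shared_rois(toy_ct1, toy_ct2, 2)
toy_shared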
In [12]:
# Builds a similarity matrix: for each pair of cards that shares at least
# min_shared RoIs, stores the negated euclidean distance between their feature
# vectors; every other entry stays at -1
def compare_vectors(vector_a, vector_b, rois_a, rois_b, limit, min_shared):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    shared = []
    for i in range(limit):
        rois_abril = rois_a[i]
        for j in range(limit):
            rois_septiembre = rois_b[j]
            min_shared_x = min(len(rois_abril), len(rois_septiembre), min_shared)
            share_RoIs, min_distance_not_shared = share_rois(rois_abril[0], rois_septiembre[0])
            shared.append(share_RoIs)
            if share_RoIs >= min_shared_x:
                a_sequence = vector_a[i]
                b_sequence = vector_b[j]
                dist = np.linalg.norm(np.asarray(a_sequence) - np.asarray(b_sequence))
                a_matrix[i, j] = -dist
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix
In [13]:
# Same as compare_vectors, but uses the precomputed shared_rois matrix instead
# of recomputing RoI distances for every pair of cards
def compare_vectors_with_shared_matrix(vector_a, vector_b, rois_a, rois_b, shared_rois, limit, min_shared):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    for i in range(limit):
        rois_abril = rois_a[i]
        for j in range(limit):
            rois_septiembre = rois_b[j]
            min_shared_x = min(len(rois_abril), len(rois_septiembre), min_shared)
            share_RoIs = shared_rois[i, j]
            if share_RoIs >= min_shared_x:
                a_sequence = vector_a[i]
                b_sequence = vector_b[j]
                dist = np.linalg.norm(np.asarray(a_sequence) - np.asarray(b_sequence))
                a_matrix[i, j] = -dist
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix
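In [ ]:
# Hedged toy call of compare_vectors_with_shared_matrix: two 2-card "months"
# with 2-feature vectors and the toy_shared matrix from the get_shared_rois
# example above, requiring at least 1 shared RoI. Entries hold the negated
# euclidean distance where enough RoIs are shared, and -1 otherwise.
toy_a = np.array([[0.0, 1.0], [5.0, 5.0]])
toy_b = np.array([[0.1, 1.0], [4.0, 6.0]])
compare_vectors_with_shared_matrix(toy_a, toy_b, toy_ct1, toy_ct2, toy_shared, 2, 1)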
In [14]:
def get_n_correct(a_matrix, limit):
    identified_indexs = []   # index of the sequence selected as the match for each column
    wrong_indexs = []        # indices that were classified incorrectly
    correct_indexs = []      # indices that were classified correctly
    selected_distance = []   # score (negated distance) of each selected match
    abstenidos = []          # abstentions: columns whose best score is still -1
    n_identified = 0
    for i in range(limit):
        the_index = np.argmax(a_matrix[:, i])
        selected_distance.append(a_matrix[the_index, i])
        identified_indexs.append(the_index)
        if a_matrix[the_index, i] == -1:
            abstenidos.append(the_index)
        elif the_index != i:
            wrong_indexs.append(the_index)
        else:
            correct_indexs.append(the_index)
            n_identified += 1
    return [n_identified, selected_distance, identified_indexs, abstenidos]
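In [ ]:
# Hedged sanity check of get_n_correct on a hypothetical 3x3 score matrix
# (higher is better; -1 means no shared RoIs, i.e. an abstention).
toy_matrix = np.array([[-0.2, -5.0, -1.0],
                       [-3.0, -0.1, -1.0],
                       [-1.0, -1.0, -1.0]])
# Columns 0 and 1 should match their own row (2 correct); column 2 only has
# -1 entries, so it should be recorded as an abstention.
get_n_correct(toy_matrix, 3)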
In [15]:
with open('data/datos_abril_sep_sequence.pickle') as f:
    datos_abril = pickle.load(f)
    datos_septiembre = pickle.load(f)
In [16]:
with open('data/feature_data.pickle') as f:
    abril_vectors = pickle.load(f)
    septiembre_vectors = pickle.load(f)
In [17]:
limit = np.min((len(abril_vectors),len(septiembre_vectors)))
limit
Out[17]:
In [18]:
with open('data/rois_meters_data.pickle') as f:
    abril_the_rois = pickle.load(f)
    septiembre_the_rois = pickle.load(f)
In [21]:
with open('data/shared_rois.pickle') as f:
    shared_rois = pickle.load(f)
In [11]:
with open('data/changed_behaviour.pickle') as f:
    changed_behaviour = pickle.load(f)
In [ ]:
with open('data/distance_meters_data_2_rois.pickle', 'w') as f:
    pickle.dump(a_matrix, f)
In [44]:
with open('data/freq_correct.pickle', 'w') as f:
    pickle.dump(frequency_correct, f)
In [22]:
N_FEATURES = 19
In [23]:
features_names = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
"random_entropy","p100_diff_last_origin","p100_diff_first_origin","card_type",\
"start_time","end_time","traveled_days","traveled_days_bs","frequence_regularity",\
"p100_exclusive_bus_days","p100_exclusive_metro_days","P100_bus_trips"]
In [24]:
features_dict = {"msal":0,"mlal":1,"kmDistance":2,"kmMaxDist":3,"kmMinDist":4,"rg":5,"unc_entropy":6,
"random_entropy":7,"p100_diff_last_origin":8,"p100_diff_first_origin":9,"card_type":10,
"start_time":11,"end_time":12,"traveled_days":13,"traveled_days_bs":14,"frequence_regularity":15,
"p100_exclusive_bus_days":16,"p100_exclusive_metro_days":17,"P100_bus_trips":18}
In [47]:
selected_features = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
"random_entropy","p100_diff_last_origin","p100_diff_first_origin",\
"start_time","end_time","traveled_days","frequence_regularity",\
"p100_exclusive_bus_days","p100_exclusive_metro_days","P100_bus_trips"]
#selected_features = ['msal', 'mlal', 'kmDistance', 'unc_entropy', 'random_entropy', 'start_time']
In [57]:
selected_features= ['msal',
'kmMaxDist',
'unc_entropy',
'random_entropy',
'start_time',
'frequence_regularity',
'p100_exclusive_bus_days',
'P100_bus_trips']
In [58]:
abril_selected_features = filter_features(abril_vectors,selected_features)
septiembre_selected_features = filter_features(septiembre_vectors,selected_features)
In [59]:
abril_selected_features.shape
Out[59]:
In [54]:
shared_rois,min_distance = get_shared_rois(abril_the_rois,septiembre_the_rois,limit)
In [62]:
min(min_distance)
Out[62]:
In [63]:
a_min_distance = filter(lambda x:x!=-1,min_distance)
In [66]:
len(a_min_distance)
Out[66]:
In [69]:
min(a_min_distance)
Out[69]:
In [72]:
plt.hist(a_min_distance,300)
Out[72]:
In [81]:
a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,1)
In [82]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
In [83]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [84]:
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
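In [ ]:
# Hedged vectorized equivalent of the diagonal count above: diagonal entries
# different from -1 are the cards whose April and September RoI groups share
# at least min_shared RoIs.
diag_shared = np.sum(np.diag(a_matrix) != -1)
print str(round(diag_shared * 100 / limit, 2)) + "%"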
In [60]:
a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2)
In [61]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
In [62]:
len(identified_indexs)
Out[62]:
In [63]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [46]:
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
In [125]:
len(abstenidos)
Out[125]:
In [126]:
1349/limit*100
Out[126]:
In [72]:
def counterx(a):
    b = filter(lambda x: x != -1, a)
    return len(b)
In [73]:
counterx([1,2,3,4,-1,-1,5,-1])
Out[73]:
In [ ]:
# Count the cards that share RoIs across both months and were also matched to themselves
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        if identified_indexs[i] == i:
            shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
In [127]:
a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,3)
In [128]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
In [ ]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [ ]:
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
In [85]:
a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,4)
In [86]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
In [87]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [88]:
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
In [ ]:
x = np.array(identified_indexs)  # frequency of each selected match index
y = np.bincount(x)
ii = np.nonzero(y)[0]
In [ ]:
frequency_correct = zip(ii, y[ii])
counter = 0
ncounter = 0
freq_max_1 = 0
freq_max_2 = 0
freq_max_3 = 0
for element in frequency_correct:
    if element[1] > 1:
        counter += 1
        ncounter += element[1]
In [ ]:
frequency_correct.sort(key = lambda t: t[1], reverse=True)
frequency_correct
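In [ ]:
# Hedged alternative to the bincount-based frequency table above, using
# collections.Counter on identified_indexs (assuming that is the list of
# selected match indices meant in the cells above).
from collections import Counter
Counter(identified_indexs).most_common()[:10]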
In [65]:
min_distance_features_correct = []
std_distance_features_correct = []
max_distance_features_correct = []
mean_distance_features_correct = []
euclidean_distance_matrix = abs(abril_vectors-septiembre_vectors)
# for each feature, get the statistics of the same-card (correctly identified) differences
for j in range(N_FEATURES):
    min_distance_features_correct.append(min(euclidean_distance_matrix[:,j]))
    std_distance_features_correct.append(np.std(euclidean_distance_matrix[:,j]))
    max_distance_features_correct.append(max(euclidean_distance_matrix[:,j]))
    mean_distance_features_correct.append(np.mean(euclidean_distance_matrix[:,j]))
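In [ ]:
# Hedged vectorized equivalent of the per-feature loop above, using numpy's
# axis argument; the _vec names are new and only used for this comparison.
min_distance_features_vec = euclidean_distance_matrix.min(axis=0)
std_distance_features_vec = euclidean_distance_matrix.std(axis=0)
max_distance_features_vec = euclidean_distance_matrix.max(axis=0)
mean_distance_features_vec = euclidean_distance_matrix.mean(axis=0)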
In [ ]:
min_distance_features = []
std_distance_features = []
max_distance_features = []
mean_distance_features = []
distance_feature_matrix = np.zeros((limit, N_FEATURES))
# for each feature, get the distance statistics over all card pairs in general
for j in range(N_FEATURES):
    # subtract feature j of April card i from all of September and store the mean distance
    for i in range(limit):
        the_dist = np.mean(abs(septiembre_vectors[:,j]-abril_vectors[i,j]))
        distance_feature_matrix[i,j] = the_dist
    min_distance_features.append(min(distance_feature_matrix[:,j]))
    std_distance_features.append(np.std(distance_feature_matrix[:,j]))
    max_distance_features.append(max(distance_feature_matrix[:,j]))
    mean_distance_features.append(np.mean(distance_feature_matrix[:,j]))
In [16]:
min_distance_features_neighbour = []
std_distance_features_neighbour = []
max_distance_features_neighbour = []
mean_distance_features_neighbour = []
distance_feature_matrix = np.zeros((limit, N_FEATURES))
for i in range(limit):
    neighbours_i = get_neighbours(i,shared_rois)
    for j in range(N_FEATURES):
        the_dist = np.mean(abs(septiembre_vectors[:,j]-abril_vectors[i,j]))
        distance_feature_matrix[i,j] = the_dist
        min_distance_features_neighbour.append(min(distance_feature_matrix[:,j]))
        std_distance_features_neighbour.append(np.std(distance_feature_matrix[:,j]))
        max_distance_features_neighbour.append(max(distance_feature_matrix[:,j]))
        mean_distance_features_neighbour.append(np.mean(distance_feature_matrix[:,j]))
In [17]:
plt.figure()
plt.plot(min_distance_features_correct)
plt.hold(True)
plt.plot(min_distance_features)
Out[17]:
In [18]:
plt.figure()
plt.plot(max_distance_features_correct)
plt.hold(True)
plt.plot(max_distance_features)
Out[18]:
In [19]:
plt.figure()
plt.plot(mean_distance_features_correct)
plt.hold(True)
plt.plot(mean_distance_features)
Out[19]:
In [20]:
plt.figure()
plt.plot(std_distance_features_correct)
plt.hold(True)
plt.plot(std_distance_features)
Out[20]:
In [27]:
plt.hist(euclidean_distance_matrix[:,0])
Out[27]:
In [28]:
plt.hist(euclidean_distance_matrix[:,1])
Out[28]:
In [29]:
plt.hist(euclidean_distance_matrix[:,2])
Out[29]:
In [30]:
plt.hist(euclidean_distance_matrix[:,3])
Out[30]:
In [31]:
plt.hist(euclidean_distance_matrix[:,4])
Out[31]:
In [32]:
plt.hist(euclidean_distance_matrix[:,5])
Out[32]:
In [33]:
plt.hist(euclidean_distance_matrix[:,6])
Out[33]:
In [34]:
plt.hist(euclidean_distance_matrix[:,7])
Out[34]:
In [36]:
plt.hist(euclidean_distance_matrix[:,8])
Out[36]:
In [37]:
plt.hist(euclidean_distance_matrix[:,9])
Out[37]:
In [38]:
plt.hist(euclidean_distance_matrix[:,10])
Out[38]:
In [39]:
plt.hist(euclidean_distance_matrix[:,11])
Out[39]:
In [40]:
plt.hist(euclidean_distance_matrix[:,12])
Out[40]:
In [41]:
plt.hist(euclidean_distance_matrix[:,13])
Out[41]:
In [42]:
plt.hist(euclidean_distance_matrix[:,14])
Out[42]:
In [43]:
plt.hist(euclidean_distance_matrix[:,15])
Out[43]:
In [44]:
plt.hist(euclidean_distance_matrix[:,16])
Out[44]:
In [45]:
plt.hist(euclidean_distance_matrix[:,17])
Out[45]:
In [46]:
plt.hist(euclidean_distance_matrix[:,18])
Out[46]:
In [30]:
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
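In [ ]:
# Quick sanity check of powerset: the 2^3 = 8 subsets of a 3-element list,
# matching the docstring above.
list(powerset([1, 2, 3]))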
In [39]:
selected_features = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
"random_entropy","p100_diff_last_origin","p100_diff_first_origin", \
"start_time","end_time","traveled_days","frequence_regularity","P100_bus_trips"]
In [36]:
len(selected_features)
Out[36]:
In [53]:
set_features = list(set(powerset(selected_features)))
In [54]:
counter = 0
for a_set in set_features:
    if len(a_set) > 10:
        counter += 1
# rough runtime estimate in hours (assuming roughly 18 s per evaluated subset)
print counter*18/3600
In [89]:
counter = 0
index_set = []
rendimientos = []
# evaluate every feature subset with more than 10 features
for a_set in set_features:
    if len(a_set) > 10:
        print "Set: " + str(counter)
        abril_selected_features = filter_features(abril_vectors, a_set)
        septiembre_selected_features = filter_features(septiembre_vectors, a_set)
        a_matrix = compare_vectors_with_shared_matrix(abril_selected_features, septiembre_selected_features, abril_the_rois, septiembre_the_rois, shared_rois, limit, 2)
        n_identified, selected_distance, identified_indexs, abstenidos = get_n_correct(a_matrix, limit)
        porcentaje_correcto = n_identified*100/limit
        print "Performance: " + str(round(porcentaje_correcto, 2)) + "%"
        index_set.append(counter)
        rendimientos.append(porcentaje_correcto)
    counter += 1
In [55]:
set_features[1915]
Out[55]:
In [ ]: