In [1]:
%matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
import time
import datetime as dt
import pickle
import numpy as np
import pandas as pd  # used below for pd.DataFrame
import scipy as sp
import scipy.spatial  # makes sp.spatial.distance available
import random
import borrador
import tfe
from auxiliar_functions import *
from geopy.distance import vincenty
from itertools import chain, combinations
In [2]:
# Filters the columns of a feature matrix according to a list of feature names.
# filter_features: matrix, list -> matrix
# Note: relies on the global features_dict (name -> column index) defined below.
def filter_features(vector, selected_features):
    selected = []
    for i in range(len(selected_features)):
        selected.append(features_dict[selected_features[i]])
    return vector[:, selected]
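A minimal usage sketch (illustrative only, not part of the original analysis), assuming the features_dict cell further down has already been run so that "msal" maps to column 0 and "kmMaxDist" to column 3:
toy_vectors = np.array([[1.0, 2.0, 3.0, 4.0],
                        [5.0, 6.0, 7.0, 8.0]])
# Keeps only the named columns, in the given order -> [[1., 4.], [5., 8.]]
filter_features(toy_vectors, ["msal", "kmMaxDist"])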
In [3]:
# Baseline matcher: a card is matched only when it has exactly one neighbour
# sharing at least two ROIs (home and work) across both months.
def only_one_match_home_and_work(limit, shared_rois):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    for i in range(limit):
        neighbours = get_neighbours_index(shared_rois, i, 2)
        if len(neighbours) == 1:
            a_matrix[i, neighbours[0]] = 0
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix
In [6]:
# Baseline matcher: when a card has several candidate neighbours
# (>= 2 shared ROIs), one of them is chosen at random.
def random_match_home_and_work(limit, shared_rois):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    for i in range(limit):
        neighbours = get_neighbours_index(shared_rois, i, 2)
        n_neighbours = len(neighbours)
        if n_neighbours == 1:
            a_matrix[i, neighbours[0]] = 0
        elif n_neighbours > 1:
            random_neighbour = random.choice(neighbours)
            a_matrix[i, random_neighbour] = 0
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix
In [334]:
# For each April card, normalizes its feature vector together with those of its
# candidate neighbours (column by column) and scores every candidate with
# -distance, so larger values (closer to zero) mean more similar.
def compare_vectors_with_neighbours(vector_a, vector_b, rois_a, rois_b, shared_rois,
                                    limit, min_shared, f_normalizar, f_distancia):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    for i in range(limit):
        #print "User ", i
        user_rois = rois_a[i]
        neighbours = get_neighbours_index(user_rois, shared_rois, i, min_shared)
        if len(neighbours) > 0:
            if len(neighbours) == 1:
                a_matrix[i, neighbours[0]] = 0
            else:
                a_sequence = vector_a[i, :]
                b_sequences = vector_b[neighbours, :]
                ab_sequences = np.vstack((a_sequence, b_sequences))
                # Normalize each feature (column) across the stacked sequences.
                for j in range(ab_sequences.shape[1]):
                    ab_sequences[:, j] = f_normalizar(ab_sequences[:, j])
                counter = 0
                for neighbour in neighbours:
                    dist = f_distancia(np.asarray(ab_sequences[0, :]),
                                       np.asarray(ab_sequences[counter + 1, :]))
                    a_matrix[i, neighbour] = -dist
                    counter += 1
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix
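A small sketch of the scoring step in isolation, with toy numbers, assuming normalizar (defined below) and scipy's euclidean distance as used later on: the candidate whose score is closest to zero would be picked as the match.
a_seq = np.array([1.0, 10.0, 0.5])          # April card (toy values)
b_seqs = np.array([[1.2, 11.0, 0.4],        # candidate 0
                   [4.0, 30.0, 0.9]])       # candidate 1
toy = np.vstack((a_seq, b_seqs))
for j in range(toy.shape[1]):
    toy[:, j] = normalizar(toy[:, j])
print -sp.spatial.distance.euclidean(toy[0, :], toy[1, :])  # score of candidate 0
print -sp.spatial.distance.euclidean(toy[0, :], toy[2, :])  # score of candidate 1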
In [297]:
neighbours = get_neighbours_index(shared_rois, 1829, 2)
neighbours
Out[297]:
In [298]:
neighbours = get_neighbours_index(shared_rois, 1829, 2)
i = 1829
a_sequence = abril_selected_features[i, :]
b_sequences = septiembre_selected_features[neighbours, :]
ab_sequences = np.vstack((a_sequence, b_sequences))
for j in range(ab_sequences.shape[1]):
    ab_sequences[:, j] = normalizar(ab_sequences[:, j])
pd.DataFrame(ab_sequences)
Out[298]:
In [299]:
counter = 0
for neighbour in neighbours:
    dist = sp.spatial.distance.euclidean(np.asarray(ab_sequences[0, :]), np.asarray(ab_sequences[counter + 1, :]))
    print -dist
    counter += 1
In [300]:
counter = 0
for neighbour in neighbours:
    dist = abs(np.linalg.norm(np.asarray(ab_sequences[0, :]) - np.asarray(ab_sequences[counter + 1, :])))
    print -dist
    counter += 1
In [302]:
np.vstack((septiembre_vectors[0],septiembre_vectors[[1,2,3,4]])).shape
Out[302]:
In [10]:
def get_n_correct(a_matrix, limit):
    identified_indexs = []   # index of the sequence selected as the match
    wrong_indexs = []        # indices of incorrectly matched cards
    correct_indexs = []      # indices of correctly matched cards
    selected_distance = []   # score of the selected match
    abstenidos = []          # cards with no candidate (abstentions)
    n_identified = 0
    for i in range(limit):
        the_index = np.argmax(a_matrix[:, i])
        selected_distance.append(a_matrix[the_index, i])
        identified_indexs.append(the_index)
        if a_matrix[the_index, i] == -1:
            abstenidos.append(the_index)
        elif the_index != i:
            wrong_indexs.append(the_index)
        else:
            correct_indexs.append(the_index)
            n_identified += 1
    return [n_identified, selected_distance, identified_indexs, abstenidos]
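To make the matrix convention concrete, a toy 3x3 affinity matrix with illustrative values: -1 marks "no candidate", and for each September card (column) the April card (row) with the largest score wins.
toy_matrix = np.array([[ 0.0, -0.1, -1.0],
                       [-1.0, -0.2, -1.0],
                       [-1.0, -0.9, -1.0]])
# Column 0: row 0 wins and equals the card index -> correct.
# Column 1: row 0 wins but the card index is 1   -> wrong match.
# Column 2: all scores are -1                    -> abstention.
print get_n_correct(toy_matrix, 3)[0]  # n_identified == 1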
In [235]:
# normalizar: [float] -> [float in [0, 1]]
# Min-max normalization of a vector.
def normalizar(vector):
    a_max = np.max(vector)
    a_min = np.min(vector)
    if a_max == a_min:
        # Constant vector: nothing to scale.
        return vector
    for i in range(len(vector)):
        vector[i] = (vector[i] - a_min) / (a_max - a_min)
    return vector
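A quick check on a toy vector: the smallest value maps to 0 and the largest to 1.
print normalizar(np.array([0.0, 3.0, 6.0]))  # -> [ 0.   0.5  1. ]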
In [330]:
# get_neighbours_index: list, np.matrix, int, int -> np.array
# Returns the neighbours of card "user": cards that share at least min_shared
# ROIs with it (capped by the number of ROIs the user actually has).
# Note: this presumably overrides the helper imported from auxiliar_functions,
# adding the rois_a argument; earlier cells still use the three-argument form.
def get_neighbours_index(rois_a, shared_rois, user, min_shared):
    min_shared_x = min(len(rois_a), min_shared)
    neighbours = np.where(shared_rois[user] >= min_shared_x)
    return neighbours[0]
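A toy illustration, assuming a 3x3 shared-ROI count matrix and placeholder ROIs for user 0; with min_shared = 2, only cards sharing at least two ROIs with user 0 qualify as neighbours.
toy_shared = np.array([[3, 2, 1],
                       [2, 4, 0],
                       [1, 0, 2]])
toy_rois = [(0, 0), (1, 1), (2, 2)]  # three ROIs for user 0 (placeholder coordinates)
print get_neighbours_index(toy_rois, toy_shared, 0, 2)  # -> [0 1]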
In [14]:
with open('data/shared_rois.pickle') as f:
    shared_rois = pickle.load(f)
In [36]:
with open('data/Distancia_Euclideana/features.pickle') as f:
    abril_vectors = pickle.load(f)
    septiembre_vectors = pickle.load(f)
In [40]:
with open('data/rois_meters_data.pickle') as f:
    abril_the_rois = pickle.load(f)
    septiembre_the_rois = pickle.load(f)
In [41]:
limit = np.min((len(abril_vectors),len(septiembre_vectors)))
limit
Out[41]:
In [327]:
shared_1_rois_2_month = 0
shared_2_rois_2_month = 0
for i in range(limit):
    if shared_rois[i, i] > 0:
        shared_1_rois_2_month += 1
    if shared_rois[i, i] > 1:
        shared_2_rois_2_month += 1
print "% of cards that share one ROI in both months:", str(round(shared_1_rois_2_month*100/limit, 2)) + "%"
print "% of cards that share two ROIs in both months:", str(round(shared_2_rois_2_month*100/limit, 2)) + "%"
In [73]:
n_of_neighbours = []
for i in range(limit):
    n_of_neighbours.append(len(get_neighbours_index(shared_rois, i, 2)))
In [74]:
plt.hist(n_of_neighbours,range(10))
Out[74]:
In [328]:
n10_of_neighbours = filter(lambda x: x>9,n_of_neighbours)
In [329]:
plt.hist(n10_of_neighbours,30)
Out[329]:
In [139]:
features_names = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
"random_entropy","p100_diff_last_origin","p100_diff_first_origin","card_type",\
"start_time","end_time","traveled_days","traveled_days_bs","frequence_regularity",\
"p100_exclusive_bus_days","p100_exclusive_metro_days","P100_bus_trips"]
In [140]:
features_dict = {"msal":0,"mlal":1,"kmDistance":2,"kmMaxDist":3,"kmMinDist":4,"rg":5,"unc_entropy":6,
"random_entropy":7,"p100_diff_last_origin":8,"p100_diff_first_origin":9,"card_type":10,
"start_time":11,"end_time":12,"traveled_days":13,"traveled_days_bs":14,"frequence_regularity":15,
"p100_exclusive_bus_days":16,"p100_exclusive_metro_days":17,"P100_bus_trips":18}
In [141]:
selected_features = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
"random_entropy","p100_diff_last_origin","p100_diff_first_origin",\
"start_time","end_time","traveled_days","frequence_regularity",\
"p100_exclusive_bus_days","p100_exclusive_metro_days","P100_bus_trips"]
In [142]:
#selected_features = features_names
In [143]:
len(selected_features)
Out[143]:
In [144]:
abril_selected_features = filter_features(abril_vectors,selected_features)
septiembre_selected_features = filter_features(septiembre_vectors,selected_features)
In [145]:
abril_selected_features.shape
Out[145]:
In [146]:
a_matrix_unique_match = only_one_match_home_and_work(limit,shared_rois)
In [147]:
n_identified_unique_match,selected_distance_unique_match,identified_indexs_unique_match,abstenidos_unique_match = get_n_correct(a_matrix_unique_match,limit)
In [148]:
porcentaje_correcto_unique_match = n_identified_unique_match*100/limit
print str(round(porcentaje_correcto_unique_match,2))+ "%"
In [198]:
a_matrix_random_match = random_match_home_and_work(limit,shared_rois)
In [199]:
n_identified_random_match,selected_distance_random_match,identified_indexs_random_match,abstenidos_random_match = get_n_correct(a_matrix_random_match,limit)
In [200]:
porcentaje_correcto_random_match = n_identified_random_match*100/limit
print str(round(porcentaje_correcto_random_match,2))+ "%"
In [332]:
a_matrix = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.euclidean)
In [308]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
In [309]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [248]:
len(abstenidos)
Out[248]:
In [310]:
a_matrix_euclidiana = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.euclidean)
In [311]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_euclidiana,limit)
In [312]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [313]:
a_matrix_manhattan = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.cityblock)
In [314]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_manhattan,limit)
In [315]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [316]:
a_matrix_braycurtis = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.braycurtis)
In [317]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_braycurtis,limit)
In [318]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [319]:
a_matrix_hamming = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.hamming)
In [320]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_hamming,limit)
In [321]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
In [322]:
a_matrix_chebyshev= compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.chebyshev)
In [323]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_chebyshev,limit)
In [324]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
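The six cells above repeat the same pipeline with different distance functions; a hedged sketch of how the same calls could be collected in a single loop for comparison:
distances = {"euclidean": sp.spatial.distance.euclidean,
             "manhattan": sp.spatial.distance.cityblock,
             "braycurtis": sp.spatial.distance.braycurtis,
             "hamming": sp.spatial.distance.hamming,
             "chebyshev": sp.spatial.distance.chebyshev}
for name, f_dist in distances.items():
    m = compare_vectors_with_neighbours(abril_selected_features, septiembre_selected_features,
                                        abril_the_rois, septiembre_the_rois,
                                        shared_rois, limit, 2, normalizar, f_dist)
    n_ok = get_n_correct(m, limit)[0]
    print name, str(round(n_ok * 100 / limit, 2)) + "%"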