Importar lo importante


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import time
import datetime as dt
import pickle
import numpy as np
import borrador
from __future__ import division
from auxiliar_functions import *
import tfe
from geopy.distance import vincenty
from itertools import chain, combinations
import random
import scipy as sp

Funciones Auxiliares


In [2]:
#Función que filtra columnas de acuerdo a una lista de indices
#filter_features: matrix list -> matrix
def filter_features(vector,selected_features):
    selected = []
    for i in range(len(selected_features)):
        selected.append(features_dict[selected_features[i]])
    return vector[:,selected]

In [3]:
def only_one_match_home_and_work(limit,shared_rois):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    for i in range(limit):
        neighbours = get_neighbours_index(shared_rois,i,2)
        if len(neighbours) > 0:
            if len(neighbours) == 1:
                a_matrix[i,neighbours[0]] = 0
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix

In [6]:
def random_match_home_and_work(limit,shared_rois):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    for i in range(limit):
        neighbours = get_neighbours_index(shared_rois,i,2)
        n_neighbours = len(neighbours)
        if n_neighbours > 0:
            if n_neighbours == 1:
                a_matrix[i,neighbours[0]] = 0
            else:
                random_neighbour = random.choice(neighbours)
                a_matrix[i,random_neighbour] = 0
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix

In [334]:
def compare_vectors_with_neighbours(vector_a,vector_b,rois_abril,rois_b,shared_rois,limit,min_shared,f_normalizar,f_distancia):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    for i in range(limit):
        #print "Usuario ",i
        rois_abril = rois_a[i]
        neighbours = get_neighbours_index(rois_abril,shared_rois,i,min_shared)
        if len(neighbours) > 0:
            if len(neighbours) == 1:
                a_matrix[i,neighbours[0]] = 0
            else:
                a_sequence = vector_a[i,:]
                b_sequences = vector_b[neighbours,:]
                ab_sequences = np.vstack((a_sequence,b_sequences))
                for j in range(ab_sequences.shape[1]):
                    ab_sequences[:,j] = f_normalizar(ab_sequences[:,j])
                counter = 0
                for neighbour in neighbours:
                    dist = f_distancia(np.asarray(ab_sequences[0,:]),np.asarray(ab_sequences[counter+1,:]))
                    a_matrix[i,neighbour] = -dist
                    counter += 1
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix

In [297]:
neighburs = get_neighbours_index(shared_rois,1829,2)
neighburs


Out[297]:
array([1829, 2110, 2712, 3271, 4424])

In [298]:
neighburs = get_neighbours_index(shared_rois,1829,2)
i = 1829
a_sequence = abril_selected_features[i,:]
b_sequences = septiembre_selected_features[neighburs,:]
ab_sequences = np.vstack((a_sequence,b_sequences))
for i in range(ab_sequences.shape[1]):
    ab_sequences[:,i] = normalizar(ab_sequences[:,i])
pd.DataFrame(ab_sequences)


Out[298]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 0.768005 0.234478 0.107873 0.268945 0.236329 0.184279 0.279562 0.135152 0.285714 0.371429 0.518907 0.157296 0.428571 0.333333 0.5 0.00 0.789474
1 0.170092 0.256958 0.141095 0.312517 0.822196 0.247446 0.124270 0.000000 0.428571 0.550000 0.134181 0.122713 0.000000 0.000000 0.0 0.00 0.544737
2 0.000000 0.356958 0.050553 0.115478 0.375303 0.245689 0.270979 0.171268 0.428571 0.800000 0.273788 0.000000 0.000000 0.333333 0.0 0.25 0.605263
3 0.129063 0.025692 0.878229 0.790451 0.003350 0.772343 0.421315 0.262650 0.000000 0.085714 0.122805 0.118542 0.428571 0.000000 0.0 0.00 0.759144
4 0.225940 0.000000 0.088745 0.316488 0.000000 0.196454 0.320780 0.171268 0.428571 0.228571 0.326611 0.220585 0.428571 0.333333 1.0 0.00 1.000000
5 0.631294 0.098044 0.000000 0.000000 0.236841 0.000000 0.000000 0.000000 0.028571 0.000000 0.000000 0.105401 0.142857 0.333333 0.0 1.00 0.000000

In [299]:
counter = 0
for neighbour in neighburs:
    dist = sp.spatial.distance.euclidean(np.asarray(ab_sequences[0,:]),np.asarray(ab_sequences[counter+1,:]))
    print -dist
    counter += 1


-1.24772003022
-1.21455669278
-1.56121688142
-0.886884446948
-1.63786205831

In [300]:
counter = 0
for neighbour in neighburs:
    dist = abs(np.linalg.norm(np.asarray(ab_sequences[0,:])-np.asarray(ab_sequences[counter+1,:])))
    print -dist
    counter += 1


-1.24772003022
-1.21455669278
-1.56121688142
-0.886884446948
-1.63786205831

In [301]:
counter = 0
for neighbour in neighburs:
    dist = abs(np.linalg.norm(np.asarray(ab_sequences[0,:])-np.asarray(ab_sequences[counter+1,:])))
    print -dist
    counter += 1


-1.24772003022
-1.21455669278
-1.56121688142
-0.886884446948
-1.63786205831

In [302]:
np.vstack((septiembre_vectors[0],septiembre_vectors[[1,2,3,4]])).shape


Out[302]:
(5, 19)

In [10]:
def get_n_correct(a_matrix,limit):
    identified_indexs = [] #almacena los indices de que secuencia fue seleccionada como match
    wrong_indexs = [] # almacena los indices de los que se clasificaron incorrectamente
    correct_indexs = [] # almacena los indices de los que se clasificaron correctamente
    selected_distance = [] # almacena la distancia de los seleccionados
    abstenidos = []
    n_identified = 0
    for i in range(limit):
        the_index = np.argmax(a_matrix[:,i])
        selected_distance.append(a_matrix[the_index,i])
        identified_indexs.append(the_index)
        if a_matrix[the_index,i] == -1:
            abstenidos.append(the_index)
        elif(the_index!=i):
            wrong_indexs.append(the_index)
        else:
            correct_indexs.append(the_index)
            n_identified += 1
    return [n_identified,selected_distance,identified_indexs,abstenidos]

In [235]:
#normalizar: [float] -> [float(0,1)]
#Normalizacion mayor menor
def normalizar(vector):
    a_max = np.max(vector)
    a_min = np.min(vector)
    if a_max == 0 and a_min == 0:
        return vector
    #if a_max == 0:
     #   a_max= a_min*0.0000001
    for i in range(len(vector)):
        vector[i] = (vector[i] - a_min)/a_max
    return vector

In [330]:
#get_neighbours_index: np.matrix int -> np.array
#obtiene los vecinos del usuario "user",
#considerando como vecino a quien comparte dos ubicaciones
def get_neighbours_index(rois_a,shared_rois,user,min_shared):
    min_shared_x = min(len(rois_a),min_shared)
    neighbours = np.where(shared_rois[user] >= min_shared_x)
    return neighbours[0]

Obtener datos


In [14]:
with open('data/shared_rois.pickle') as f:
    shared_rois = pickle.load(f)

In [36]:
with open('data/Distancia_Euclideana/features.pickle') as f:
    abril_vectors = pickle.load(f)
    septiembre_vectors = pickle.load(f)

In [40]:
with open('data/rois_meters_data.pickle') as f:
    abril_the_rois = pickle.load(f)
    septiembre_the_rois = pickle.load(f)

In [41]:
limit = np.min((len(abril_vectors),len(septiembre_vectors)))
limit


Out[41]:
5169

In [327]:
shared_1_rois_2_month = 0
shared_2_rois_2_month = 0
for i in range(limit):
    if shared_rois[i,i] > 0:
        shared_1_rois_2_month += 1
    if shared_rois[i,i] > 1:
        shared_2_rois_2_month += 1
print "% de tarjetas que comparten un roi en ambos meses:",str(round(shared_1_rois_2_month*100/limit,2)) + "%"
print "% de tarjetas que comparten dos roi en ambos meses:", str(round(shared_2_rois_2_month*100/limit,2)) + "%"


% de tarjetas que comparten un roi en ambos meses:  93.38%
% de tarjetas que comparten dos roi en ambos meses:  54.85%

Histograma cantidad de vecinos de hogar y trabajo (2 rois) entre abril y septiembre


In [73]:
n_of_neighbours = []
for i in range(limit):
    n_of_neighbours.append(len(get_neighbours_index(shared_rois,i,2)))

In [74]:
plt.hist(n_of_neighbours,range(10))


Out[74]:
(array([ 1051.,  1642.,   808.,   431.,   263.,   191.,   116.,    91.,
          117.]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 <a list of 9 Patch objects>)

Histograma cantidad de vecinos de hogar y trabajo (2 rois) entre abril y septiembre


In [328]:
n10_of_neighbours = filter(lambda x: x>9,n_of_neighbours)

In [329]:
plt.hist(n10_of_neighbours,30)


Out[329]:
(array([ 269.,  109.,   36.,    8.,    8.,    8.,    5.,    4.,    1.,
           0.,    0.,    1.,    2.,    1.,    0.,    0.,    0.,    0.,
           1.,    0.,    2.,    0.,    0.,    1.,    1.,    0.,    1.,
           0.,    0.,    1.]),
 array([  10.        ,   19.06666667,   28.13333333,   37.2       ,
          46.26666667,   55.33333333,   64.4       ,   73.46666667,
          82.53333333,   91.6       ,  100.66666667,  109.73333333,
         118.8       ,  127.86666667,  136.93333333,  146.        ,
         155.06666667,  164.13333333,  173.2       ,  182.26666667,
         191.33333333,  200.4       ,  209.46666667,  218.53333333,
         227.6       ,  236.66666667,  245.73333333,  254.8       ,
         263.86666667,  272.93333333,  282.        ]),
 <a list of 30 Patch objects>)

Seleccionar Features para la comparación


In [139]:
features_names = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
    "random_entropy","p100_diff_last_origin","p100_diff_first_origin","card_type",\
    "start_time","end_time","traveled_days","traveled_days_bs","frequence_regularity",\
    "p100_exclusive_bus_days","p100_exclusive_metro_days","P100_bus_trips"]

In [140]:
features_dict = {"msal":0,"mlal":1,"kmDistance":2,"kmMaxDist":3,"kmMinDist":4,"rg":5,"unc_entropy":6,
                  "random_entropy":7,"p100_diff_last_origin":8,"p100_diff_first_origin":9,"card_type":10,
                  "start_time":11,"end_time":12,"traveled_days":13,"traveled_days_bs":14,"frequence_regularity":15,
                  "p100_exclusive_bus_days":16,"p100_exclusive_metro_days":17,"P100_bus_trips":18}

In [141]:
selected_features = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
    "random_entropy","p100_diff_last_origin","p100_diff_first_origin",\
    "start_time","end_time","traveled_days","frequence_regularity",\
    "p100_exclusive_bus_days","p100_exclusive_metro_days","P100_bus_trips"]

In [142]:
#selected_features = features_names

In [143]:
len(selected_features)


Out[143]:
17

In [144]:
abril_selected_features = filter_features(abril_vectors,selected_features)
septiembre_selected_features = filter_features(septiembre_vectors,selected_features)

Normalizar con respecto a los vecinos


In [145]:
abril_selected_features.shape


Out[145]:
(5169, 17)

Hacer las comparaciones

Seleccionar solo los que se tiene un match


In [146]:
a_matrix_unique_match = only_one_match_home_and_work(limit,shared_rois)


0.0594351291656

In [147]:
n_identified_unique_match,selected_distance_unique_match,identified_indexs_unique_match,abstenidos_unique_match = get_n_correct(a_matrix_unique_match,limit)

In [148]:
porcentaje_correcto_unique_match = n_identified_unique_match*100/limit
print str(round(porcentaje_correcto_unique_match,2))+ "%"


21.88%

Seleccionar randommente entre los vecinos que matchean hogar y trabajo


In [198]:
a_matrix_random_match = random_match_home_and_work(limit,shared_rois)


0.0673789978027

In [199]:
n_identified_random_match,selected_distance_random_match,identified_indexs_random_match,abstenidos_random_match = get_n_correct(a_matrix_random_match,limit)

In [200]:
porcentaje_correcto_random_match = n_identified_random_match*100/limit
print str(round(porcentaje_correcto_random_match,2))+ "%"


28.86%

Seleccionar con distancia euclidiana entre los vecinos


In [332]:
a_matrix = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.euclidean)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-332-5eff2e07acf6> in <module>()
----> 1 a_matrix = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.euclidean)

<ipython-input-331-2d64c159e5d5> in compare_vectors_with_neighbours(vector_a, vector_b, rois_a, rois_b, shared_rois, limit, min_shared, f_normalizar, f_distancia)
      4     for i in range(limit):
      5         #print "Usuario ",i
----> 6         neighbours = get_neighbours_index(rois_a,rois_b,shared_rois,i,min_shared)
      7         if len(neighbours) > 0:
      8             if len(neighbours) == 1:

TypeError: get_neighbours_index() takes exactly 4 arguments (5 given)

In [308]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)

In [309]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"


35.46%

In [248]:
len(abstenidos)


Out[248]:
3769

Pruebas con diferentes distancias

Euclidiana


In [310]:
a_matrix_euclidiana = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.euclidean)


1.14597606659

In [311]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_euclidiana,limit)

In [312]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"


35.46%

Manhattan


In [313]:
a_matrix_manhattan = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.cityblock)


0.941806793213

In [314]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_manhattan,limit)

In [315]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"


23.97%

braycurtis


In [316]:
a_matrix_braycurtis = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.braycurtis)


1.08362221718

In [317]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_braycurtis,limit)

In [318]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"


39.89%

hamming


In [319]:
a_matrix_hamming = compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.hamming)


1.0786049366

In [320]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_hamming,limit)

In [321]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"


40.74%

chebyshev


In [322]:
a_matrix_chebyshev= compare_vectors_with_neighbours(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2,normalizar,sp.spatial.distance.chebyshev)


0.929002046585

In [323]:
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix_chebyshev,limit)

In [324]:
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"


38.05%