In [8]:
    
%matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
import time
import datetime as dt
import pickle
import numpy as np
import borrador
import tfe
from auxiliar_functions import *
from geopy.distance import vincenty
from itertools import chain, combinations
    
In [9]:
    
# Filters the columns of a feature matrix, keeping only the features named in selected_features
# (names are resolved to column indices through the module-level features_dict)
# filter_features: matrix list -> matrix
def filter_features(vector,selected_features):
    selected = []
    for i in range(len(selected_features)):
        selected.append(features_dict[selected_features[i]])
    return vector[:,selected]
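
A minimal usage sketch (the toy matrix and toy dictionary below are purely illustrative; the real features_dict is defined further down in the notebook):

# toy_features_dict and toy_vectors are hypothetical, just to show the column selection
toy_features_dict = {"msal": 0, "mlal": 1, "kmDistance": 2}
toy_vectors = np.array([[1.0, 2.0, 3.0],
                        [4.0, 5.0, 6.0]])
toy_vectors[:, [toy_features_dict[name] for name in ["msal", "kmDistance"]]]
# -> array([[ 1.,  3.],
#           [ 4.,  6.]])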
    
In [10]:
    
# Counts how many ROIs two ROI groups share (pairs of centroids closer than 500 m)
# and returns the minimum distance among the pairs that were discarded as shared ROIs
# share_rois: list(dict) list(dict) -> [int, int]
def share_rois(rois_a,rois_b):
    shared = 0
    rois = [rois_a,rois_b]
    index = np.argmin([len(rois_a),len(rois_b)])
    other_index = abs(index-1)
    min_distance = -1
    for i in range(len(rois[index])):
        an_a_roi = rois[index][i]
        lat_a_roi = an_a_roi['lat']
        long_a_roi = an_a_roi['long']
        for j in range(len(rois[other_index])):
            an_b_roi = rois[other_index][j]
            lat_b_roi = an_b_roi['lat']
            long_b_roi = an_b_roi['long']
            a_distance = vincenty((lat_a_roi,long_a_roi),(lat_b_roi,long_b_roi)).meters
            if a_distance < 500:
                shared +=1
            elif min_distance == -1 or min_distance > a_distance:
                min_distance = a_distance
    return [shared,min_distance]
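
For reference, each ROI is expected to be a dict with 'lat' and 'long' keys (in degrees); a small sketch with made-up coordinates, just to show the call and the 500 m threshold:

rois_a = [{'lat': -33.4372, 'long': -70.6506}]   # hypothetical ROI centroid
rois_b = [{'lat': -33.4375, 'long': -70.6510},   # ~50 m away -> counted as shared
          {'lat': -33.5000, 'long': -70.7000}]   # several km away -> only updates min_distance
shared, min_dist = share_rois(rois_a, rois_b)
# shared == 1; min_dist holds the vincenty distance (in meters) to the far ROI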
    
In [11]:
    
# Counts how many ROIs each pair of ROI groups from two temporal cuts (ct) shares
# get_shared_rois: list(list(dict)) list(list(dict)) -> [[int]] list(int)
def get_shared_rois(rois_ct1,rois_ct2,limit):
    init_time = time.time()
    shared = np.ones((limit, limit)) * -1
    min_distance = []
    min_distance_not_shared = -1
    for i in range(limit):
        rois_i = rois_ct1[i]
        for j in range(limit):
            rois_j = rois_ct2[j]
            share_RoIs,min_distance_not_shared = share_rois(rois_i[0],rois_j[0])
            if i==j:
                min_distance.append(min_distance_not_shared)
                min_distance_not_shared = -1
            shared[i,j] = share_RoIs
    delta_time = time.time() - init_time
    print delta_time
    return [shared,min_distance]
    
In [12]:
    
def compare_vectors(vector_a,vector_b,rois_a,rois_b,limit,min_shared):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    shared = []
    for i in range(limit):
        rois_abril = rois_a[i]
        for j in range(limit):
            rois_septiembre = rois_b[j]
            min_shared_x = min(len(rois_abril),len(rois_septiembre),min_shared)
            share_RoIs,min_distance_not_shared = share_rois(rois_abril[0],rois_septiembre[0])
            shared.append(share_RoIs)
            if share_RoIs >= min_shared_x:
                a_sequence = vector_a[i]
                b_sequence = vector_b[j]
                dist = np.linalg.norm(np.asarray(a_sequence)-np.asarray(b_sequence))
                a_matrix[i,j] = -dist
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix
    
In [13]:
    
def compare_vectors_with_shared_matrix(vector_a,vector_b,rois_a,rois_b,shared_rois,limit,min_shared):
    a_matrix = np.ones((limit, limit)) * -1
    init_time = time.time()
    for i in range(limit):
        rois_abril = rois_a[i]
        for j in range(limit):
            rois_septiembre = rois_b[j]
            min_shared_x = min(len(rois_abril),len(rois_septiembre),min_shared)
            share_RoIs = shared_rois[i,j]
            if share_RoIs >= min_shared_x:
                a_sequence = vector_a[i]
                b_sequence = vector_b[j]
                dist = np.linalg.norm(np.asarray(a_sequence)-np.asarray(b_sequence))
                a_matrix[i,j] = -dist
    delta_time = time.time() - init_time
    print delta_time
    return a_matrix
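
The point of this variant is to pay the pairwise ROI comparison only once: the shared-ROI count matrix is computed a single time with get_shared_rois and then reused while sweeping the min_shared threshold, as the cells further below do (a sketch, with variable names as defined later in the notebook):

shared_rois, min_distance = get_shared_rois(abril_the_rois, septiembre_the_rois, limit)
for a_min_shared in (1, 2, 3, 4):
    a_matrix = compare_vectors_with_shared_matrix(abril_selected_features, septiembre_selected_features,
                                                  abril_the_rois, septiembre_the_rois,
                                                  shared_rois, limit, a_min_shared)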
    
In [14]:
    
def get_n_correct(a_matrix,limit):
    identified_indexs = [] # index of the April sequence selected as the match for each September sequence
    wrong_indexs = []      # indices that were matched incorrectly
    correct_indexs = []    # indices that were matched correctly
    selected_distance = [] # (negative) distance of each selected match
    abstenidos = []        # abstentions: selected index when the best available score is the -1 sentinel
    n_identified = 0
    for i in range(limit):
        the_index = np.argmax(a_matrix[:,i])
        selected_distance.append(a_matrix[the_index,i])
        identified_indexs.append(the_index)
        if a_matrix[the_index,i] == -1:
            abstenidos.append(the_index)
        elif(the_index!=i):
            wrong_indexs.append(the_index)
        else:
            correct_indexs.append(the_index)
            n_identified += 1
    return [n_identified,selected_distance,identified_indexs,abstenidos]
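
A toy sketch of the matching convention used here: entries hold the negative Euclidean distance, so argmax over a column picks the closest April candidate for that September card, and -1 marks pairs that were never compared; note that the -1 sentinel also beats any real distance larger than 1, in which case the abstention branch fires.

# illustrative 3x3 score matrix (rows = April cards, columns = September cards)
toy = np.array([[-0.10, -1.00, -0.70],
                [-0.90, -0.05, -1.00],
                [-1.00, -0.80, -0.20]])
n_ok, dists, picks, abst = get_n_correct(toy, 3)
# picks == [0, 1, 2], n_ok == 3, abst == []  (every column's argmax lies on the diagonal)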
    
In [15]:
    
with open('data/datos_abril_sep_sequence.pickle') as f:
    datos_abril = pickle.load(f)
    datos_septiembre = pickle.load(f)
    
In [16]:
    
with open('data/feature_data.pickle') as f:
    abril_vectors = pickle.load(f)
    septiembre_vectors = pickle.load(f)
    
In [17]:
    
limit = np.min((len(abril_vectors),len(septiembre_vectors)))
limit
    
    Out[17]:
In [18]:
    
with open('data/rois_meters_data.pickle') as f:
    abril_the_rois = pickle.load(f)
    septiembre_the_rois = pickle.load(f)
    
In [21]:
    
with open('data/shared_rois.pickle') as f:
    shared_rois = pickle.load(f)
    
In [11]:
    
with open('data/changed_behaviour.pickle') as f:
    changed_behaviour = pickle.load(f)
    
    
In [ ]:
    
with open('data/distance_meters_data_2_rois.pickle', 'wb') as f:
    pickle.dump(a_matrix, f)
    
In [44]:
    
with open('data/freq_correct.pickle', 'wb') as f:
    pickle.dump(frequency_correct, f)
    
In [22]:
    
N_FEATURES = 19
    
In [23]:
    
features_names = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
    "random_entropy","p100_diff_last_origin","p100_diff_first_origin","card_type",\
    "start_time","end_time","traveled_days","traveled_days_bs","frequence_regularity",\
    "p100_exclusive_bus_days","p100_exclusive_metro_days","P100_bus_trips"]
    
In [24]:
    
features_dict = {"msal":0,"mlal":1,"kmDistance":2,"kmMaxDist":3,"kmMinDist":4,"rg":5,"unc_entropy":6,
                  "random_entropy":7,"p100_diff_last_origin":8,"p100_diff_first_origin":9,"card_type":10,
                  "start_time":11,"end_time":12,"traveled_days":13,"traveled_days_bs":14,"frequence_regularity":15,
                  "p100_exclusive_bus_days":16,"p100_exclusive_metro_days":17,"P100_bus_trips":18}
    
In [47]:
    
selected_features = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
    "random_entropy","p100_diff_last_origin","p100_diff_first_origin",\
    "start_time","end_time","traveled_days","frequence_regularity",\
    "p100_exclusive_bus_days","p100_exclusive_metro_days","P100_bus_trips"]
#selected_features = ['msal', 'mlal', 'kmDistance', 'unc_entropy', 'random_entropy', 'start_time']
    
In [57]:
    
selected_features= ['msal',
 'kmMaxDist',
 'unc_entropy',
 'random_entropy',
 'start_time',
 'frequence_regularity',
 'p100_exclusive_bus_days',
 'P100_bus_trips']
    
In [58]:
    
abril_selected_features = filter_features(abril_vectors,selected_features)
septiembre_selected_features = filter_features(septiembre_vectors,selected_features)
    
In [59]:
    
abril_selected_features.shape
    
    Out[59]:
In [54]:
    
shared_rois,min_distance = get_shared_rois(abril_the_rois,septiembre_the_rois,limit)
    
    
In [62]:
    
min(min_distance)
    
    Out[62]:
In [63]:
    
a_min_distance = filter(lambda x:x!=-1,min_distance)
    
In [66]:
    
len(a_min_distance)
    
    Out[66]:
In [69]:
    
min(a_min_distance)
    
    Out[69]:
In [72]:
    
plt.hist(a_min_distance,300)
    
    Out[72]:
    
In [81]:
    
a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,1)
    
    
In [82]:
    
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
    
In [83]:
    
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
    
    
In [84]:
    
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
    
    
In [60]:
    
a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2)
    
    
In [61]:
    
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
    
In [62]:
    
len(identified_indexs)
    
    Out[62]:
In [63]:
    
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
    
    
In [46]:
    
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
    
    
In [125]:
    
len(abstenidos)
    
    Out[125]:
In [126]:
    
1349/limit*100  # abstention rate in percent (1349 is presumably the len(abstenidos) value shown above)
    
    Out[126]:
In [72]:
    
def counterx(a):
    b = filter(lambda x: x!=-1,a)
    return len(b)
    
In [73]:
    
counterx([1,2,3,4,-1,-1,5,-1])
    
    Out[73]:
In [ ]:
    
# among the cards whose April/September ROIs overlap, count how many were also re-identified correctly
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        if identified_indexs[i] == i:
            shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
    
In [127]:
    
a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,3)
    
    
In [128]:
    
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
    
    
In [ ]:
    
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
    
In [ ]:
    
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
    
In [85]:
    
a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,4)
    
    
In [86]:
    
n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
    
In [87]:
    
porcentaje_correcto = n_identified*100/limit
print str(round(porcentaje_correcto,2))+ "%"
    
    
In [88]:
    
shared_rois_2_month = 0
for i in range(limit):
    if a_matrix[i,i] != -1:
        shared_rois_2_month += 1
print str(round(shared_rois_2_month*100/limit,2)) + "%"
    
    
In [ ]:
    
# frequency with which each April index was selected as a match
x = np.array(identified_indexs)
y = np.bincount(x)
ii = np.nonzero(y)[0]
    
In [ ]:
    
frequency_correct = zip(ii, y[ii])  # (April index, number of times it was selected)
counter = 0   # April indices selected more than once
ncounter = 0  # total selections accounted for by those indices
for element in frequency_correct:
    if element[1] > 1:
        counter += 1
        ncounter += element[1]
    
In [ ]:
    
frequency_correct.sort(key = lambda t: t[1], reverse=True)
frequency_correct
    
In [65]:
    
min_distance_features_correct = []
std_distance_features_correct = []
max_distance_features_correct = []
mean_distance_features_correct = []
euclidean_distance_matrix = abs(abril_vectors-septiembre_vectors)
# per-feature statistics of the absolute differences between each card's April and September vectors
# (i.e., the true matched pairs)
for j in range(N_FEATURES):
    min_distance_features_correct.append(min(euclidean_distance_matrix[:,j]))
    std_distance_features_correct.append(np.std(euclidean_distance_matrix[:,j]))
    max_distance_features_correct.append(max(euclidean_distance_matrix[:,j]))
    mean_distance_features_correct.append(np.mean(euclidean_distance_matrix[:,j]))
    
In [ ]:
    
min_distance_features = []
std_distance_features = []
max_distance_features = []
mean_distance_features = []
distance_feature_matrix = np.zeros((limit, N_FEATURES))
# per-feature distance statistics for each April card against the whole September population
for j in range(N_FEATURES):
    # mean absolute difference between feature j of April card i and feature j of every September card
    for i in range(limit):
        the_dist = np.mean(abs(septiembre_vectors[:,j]-abril_vectors[i,j]))
        distance_feature_matrix[i,j] = the_dist
    min_distance_features.append(min(distance_feature_matrix[:,j]))
    std_distance_features.append(np.std(distance_feature_matrix[:,j]))
    max_distance_features.append(max(distance_feature_matrix[:,j]))
    mean_distance_features.append(np.mean(distance_feature_matrix[:,j]))
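
The next cell relies on get_neighbours, which is not defined in this notebook (presumably it comes in through the auxiliar_functions wildcard import). A plausible minimal sketch, assuming it simply returns the September indices whose ROI groups share at least one ROI with April card i according to the shared_rois count matrix (this is an assumption, not the actual implementation):

# hypothetical stand-in for get_neighbours
def get_neighbours(i, shared_rois, min_shared=1):
    # September candidates j that share at least min_shared ROIs with April card i
    return np.where(shared_rois[i, :] >= min_shared)[0]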
    
In [16]:
    
min_distance_features_neighbour = []
std_distance_features_neighbour = []
max_distance_features_neighbour = []
mean_distance_features_neighbour = []
distance_feature_matrix = np.zeros((limit, N_FEATURES))
# same per-feature statistics as above, but comparing each April card only against its
# ROI-sharing neighbours in September
for i in range(limit):
    neighbours_i = get_neighbours(i,shared_rois)
    for j in range(N_FEATURES):
        the_dist = np.mean(abs(septiembre_vectors[neighbours_i,j]-abril_vectors[i,j]))
        distance_feature_matrix[i,j] = the_dist
for j in range(N_FEATURES):
    min_distance_features_neighbour.append(min(distance_feature_matrix[:,j]))
    std_distance_features_neighbour.append(np.std(distance_feature_matrix[:,j]))
    max_distance_features_neighbour.append(max(distance_feature_matrix[:,j]))
    mean_distance_features_neighbour.append(np.mean(distance_feature_matrix[:,j]))
    
In [17]:
    
plt.figure()
plt.plot(min_distance_features_correct)
plt.hold(True)
plt.plot(min_distance_features)
    
    Out[17]:
    
In [18]:
    
plt.figure()
plt.plot(max_distance_features_correct)
plt.hold(True)
plt.plot(max_distance_features)
    
    Out[18]:
    
In [19]:
    
plt.figure()
plt.plot(mean_distance_features_correct)
plt.hold(True)
plt.plot(mean_distance_features)
    
    Out[19]:
    
In [20]:
    
plt.figure()
plt.plot(std_distance_features_correct)
plt.hold(True)
plt.plot(std_distance_features)
    
    Out[20]:
    
In [27]:
    
plt.hist(euclidean_distance_matrix[:,0])
    
    Out[27]:
    
In [28]:
    
plt.hist(euclidean_distance_matrix[:,1])
    
    Out[28]:
    
In [29]:
    
plt.hist(euclidean_distance_matrix[:,2])
    
    Out[29]:
    
In [30]:
    
plt.hist(euclidean_distance_matrix[:,3])
    
    Out[30]:
    
In [31]:
    
plt.hist(euclidean_distance_matrix[:,4])
    
    Out[31]:
    
In [32]:
    
plt.hist(euclidean_distance_matrix[:,5])
    
    Out[32]:
    
In [33]:
    
plt.hist(euclidean_distance_matrix[:,6])
    
    Out[33]:
    
In [34]:
    
plt.hist(euclidean_distance_matrix[:,7])
    
    Out[34]:
    
In [36]:
    
plt.hist(euclidean_distance_matrix[:,8])
    
    Out[36]:
    
In [37]:
    
plt.hist(euclidean_distance_matrix[:,9])
    
    Out[37]:
    
In [38]:
    
plt.hist(euclidean_distance_matrix[:,10])
    
    Out[38]:
    
In [39]:
    
plt.hist(euclidean_distance_matrix[:,11])
    
    Out[39]:
    
In [40]:
    
plt.hist(euclidean_distance_matrix[:,12])
    
    Out[40]:
    
In [41]:
    
plt.hist(euclidean_distance_matrix[:,13])
    
    Out[41]:
    
In [42]:
    
plt.hist(euclidean_distance_matrix[:,14])
    
    Out[42]:
    
In [43]:
    
plt.hist(euclidean_distance_matrix[:,15])
    
    Out[43]:
    
In [44]:
    
plt.hist(euclidean_distance_matrix[:,16])
    
    Out[44]:
    
In [45]:
    
plt.hist(euclidean_distance_matrix[:,17])
    
    Out[45]:
    
In [46]:
    
plt.hist(euclidean_distance_matrix[:,18])
    
    Out[46]:
    
In [30]:
    
def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))
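
As a sanity check on the size of the sweep below: for the 15-feature list selected two cells further down, the powerset has 2**15 = 32768 subsets, of which C(15,11) + C(15,12) + C(15,13) + C(15,14) + C(15,15) = 1365 + 455 + 105 + 15 + 1 = 1941 contain more than 10 features; at roughly 18 seconds per evaluation (the factor used in the runtime-estimate cell below) that is about 9.7 hours.

list(powerset([1, 2]))
# -> [(), (1,), (2,), (1, 2)]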
    
In [39]:
    
selected_features = ["msal","mlal","kmDistance","kmMaxDist","kmMinDist","rg","unc_entropy", \
    "random_entropy","p100_diff_last_origin","p100_diff_first_origin", \
    "start_time","end_time","traveled_days","frequence_regularity","P100_bus_trips"]
    
In [36]:
    
len(selected_features)
    
    Out[36]:
In [53]:
    
set_features = list(set(powerset(selected_features)))
    
In [54]:
    
counter = 0
for a_set in set_features:
    if len(a_set) > 10:
        counter += 1
print counter*18/3600  # rough runtime estimate in hours, assuming ~18 s per evaluated subset
    
    
In [89]:
    
index_set = []     # positions in set_features that were evaluated
rendimientos = []  # accuracy obtained with each evaluated subset
for counter, a_set in enumerate(set_features):
    if len(a_set) > 10:
        print "Set: " + str(counter)
        abril_selected_features = filter_features(abril_vectors,a_set)
        septiembre_selected_features = filter_features(septiembre_vectors,a_set)
        a_matrix = compare_vectors_with_shared_matrix(abril_selected_features,septiembre_selected_features,abril_the_rois,septiembre_the_rois,shared_rois,limit,2)
        n_identified,selected_distance,identified_indexs,abstenidos = get_n_correct(a_matrix,limit)
        porcentaje_correcto = n_identified*100/limit
        print "Accuracy: " + str(round(porcentaje_correcto,2)) + "%"
        index_set.append(counter)
        rendimientos.append(porcentaje_correcto)
    
    
In [55]:
    
set_features[1915]
    
    Out[55]:
In [ ]: