In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import pickle
from __future__ import division
import csv
from tools import *
import os
In [2]:
# Re-import the project module so that edits made to its source are picked up
# by the running kernel (Python 2 builtin `reload`).
reload(tpm_identification)
Out[2]:
In [10]:
# Input locations: the data directory is a sibling of the current working
# directory, and both period extracts live in its 'Users_data' sub-folder.
data_path = os.path.join(os.getcwd(), '..', 'data')
users_data_dir = os.path.join(data_path, 'Users_data')
first_period_path = os.path.join(
    users_data_dir, 'etapas_2013_abril_allyearsids_10_100000.csv')
second_period_path = os.path.join(
    users_data_dir, 'etapas_2013_septiembre_allyearsids_10_100000.csv')
In [11]:
# Load the metro dictionary through the project helper.
# NOTE(review): `dict_metro` is not referenced by any later cell visible here;
# confirm it is still needed.
dict_metro = auxiliar_functions.load_metro_dictionary()
In [17]:
# Read the first-period extract, parse the 'tiempo_subida' column as datetimes
# and order the rows by user id, then by that timestamp.
first_period_frame = (
    pd.read_csv(first_period_path)
    .assign(tiempo_subida=lambda frame: pd.to_datetime(frame.tiempo_subida))
    .sort_values(by=['id', 'tiempo_subida'])
)
first_period_frame.head()
Out[17]:
In [16]:
# Read the second-period extract, parse the 'tiempo_subida' column as datetimes
# and order the rows by user id, then by that timestamp.
second_period_frame = (
    pd.read_csv(second_period_path)
    .assign(tiempo_subida=lambda frame: pd.to_datetime(frame.tiempo_subida))
    .sort_values(by=['id', 'tiempo_subida'])
)
second_period_frame.head()
Out[16]:
In [18]:
# Print the column dtypes, non-null counts and memory usage of the first-period frame.
first_period_frame.info()
In [8]:
# Re-import the project module again after further edits to its source
# (Python 2 builtin `reload`).
reload(tpm_identification)
Out[8]:
In [52]:
# Build the per-user profiles from the FIRST period using the zone-level
# columns ('zona_subida' / 'zona_bajada').
# NOTE(review): the exact structure returned by get_profiles is defined in the
# project module; later cells index it as users_profiles[i]['user_id'].
users_profiles = tpm_identification.get_profiles(first_period_frame['id'],first_period_frame['zona_subida'],first_period_frame['zona_bajada'])
In [53]:
# Number of user profiles built from the first period; displayed as the cell output.
numero_usuarios = len(users_profiles)
numero_usuarios
Out[53]:
In [54]:
# Build the per-user sequences from the SECOND period using the zone-level
# columns ('zona_subida' / 'zona_bajada').
# NOTE(review): the exact structure returned by get_sequences is defined in the
# project module; later cells index it as profiles[i]['user_id'].
profiles = tpm_identification.get_sequences(second_period_frame['id'],second_period_frame['zona_subida'],second_period_frame['zona_bajada'])
In [55]:
# Show how many second-period sequences exist, then cap the number of users
# analysed at the smaller of the two collections.
print(len(profiles))
limit = min(len(profiles), len(users_profiles))
print(limit)
In [56]:
# Sanity check on the two collections while collecting the user ids:
#   * user ids in `users_profiles` must be strictly increasing, and
#   * position i must refer to the same user in both collections.
last_iddd = 0
ids_alg1 = []
for i in range(len(users_profiles)):
    current_id = users_profiles[i]['user_id']
    assert last_iddd < current_id
    last_iddd = current_id
    ids_alg1.append(current_id)
    assert current_id == profiles[i]['user_id']
In [33]:
# Persist the ordered list of user ids.
# The file is opened in binary mode: pickle streams are bytes, and text mode
# can corrupt them on platforms that translate line endings.
with open('data/ids_alg1.pickle', 'wb') as f:
    pickle.dump(ids_alg1, f)
In [57]:
# Compute the identification matrix for the zone-level profiles and report the
# wall-clock time (in seconds) the computation took.
start_time = time.time()
iden = tpm_identification.get_identification_matrix(users_profiles, profiles)
delta_time = time.time() - start_time
print(delta_time)
In [58]:
# Wrap the identification result as a numpy matrix and as a DataFrame, and
# preview the first ten rows.
# NOTE: np.matrix objects are always 2-D, so iden_matrix_zona.diagonal()
# yields a 1xN matrix rather than a 1-D array.
iden_matrix_zona = np.matrix(iden)
df_ident = pd.DataFrame(iden_matrix_zona)
df_ident.head(10)
Out[58]:
In [77]:
# Evaluate the zone-level identification matrix with the project helper and
# print the share of the `limit` users counted in n_identified, as a
# percentage rounded to two decimals.
(n_identified,
 selected_distance,
 identified_indexs,
 abstenidos,
 correct_indexs,
 correct_distance,
 wrong_indexs,
 wrong_distances) = auxiliar_functions.get_n_correct_tpm(iden_matrix_zona, limit)
porcentaje_correcto = n_identified * 100.0 / limit
print(str(round(porcentaje_correcto, 2)) + "%")
In [79]:
# Persist the zone-level results as consecutive pickles in a single file.
# The order below is the order in which they must be read back.
# The file is opened in binary mode: pickle streams are bytes, and text mode
# can corrupt them on platforms that translate line endings.
with open('resultados_alg_1_zona.pickle', 'wb') as f:
    for result in (n_identified,
                   selected_distance,
                   identified_indexs,
                   abstenidos,
                   correct_indexs,
                   correct_distance,
                   wrong_indexs,
                   wrong_distances,
                   iden_matrix_zona.diagonal()):
        pickle.dump(result, f)
In [78]:
# Keep only the wrong-identification distances strictly greater than -800 and
# print how many remain.
# NOTE(review): -800 looks like a floor/sentinel value of the indicator - confirm.
wrong_distances_without_800 = list(filter(lambda dist: dist > -800, wrong_distances))
print(len(wrong_distances_without_800))
In [63]:
# Side-by-side histogram (30 bins): wrong distances in red, correct in green.
colors = ['red', 'green']
plt.hist([wrong_distances_without_800, correct_distance],
         bins=30, histtype='bar', color=colors)
plt.show()
In [72]:
# Count how many correct identifications have a distance of exactly 0.0.
counter = sum(1 for dist in correct_distance if dist == 0.0)
print(counter)
In [71]:
# Display the full list of distances of the correctly identified users.
correct_distance
Out[71]:
Se puede observar que la distribución verde está más a la izquierda, lo que indica que un gran número de personas cambió de comportamiento. Esto se condice con los resultados del tercer algoritmo.
In [64]:
# Collect the selected distance of every wrongly identified user.
# `counter` is the read position in `selected_distance`; it advances once for
# every index found in either wrong_indexs or correct_indexs.
wrong_distances_selected = []
counter = 0
for i in range(limit):
    if i in wrong_indexs:
        wrong_distances_selected.append(selected_distance[counter])
        counter = counter + 1
    if i in correct_indexs:
        counter = counter + 1
In [65]:
# Histogram (30 bins) of three series: wrong distances (red), selected
# distances of wrong identifications (blue) and correct distances (green).
colors = ['red', 'blue', 'green']
plt.hist([wrong_distances_without_800, wrong_distances_selected, correct_distance],
         bins=30, histtype='bar', color=colors)
plt.show()
In [27]:
# Split the diagonal entries greater than -100 into correctly and wrongly
# identified users, and print the size of each group.
#
# np.asarray() guarantees a plain ndarray: if `iden` is an np.matrix (as it is
# when reloaded from 'data/iden_matrix_zona.pickle'), .diagonal() would return
# a 1xN matrix, len() would be 1 and the comparison below would be ambiguous.
diagonal = np.asarray(iden).diagonal().copy()
correct_distance_ii = []
wrong_distance_ii = []
diagonal_d1_ii = []
for i in range(len(diagonal)):
    if diagonal[i] > -100:
        diagonal_d1_ii.append(i)
        if i in correct_indexs:
            correct_distance_ii.append(diagonal[i])
        else:
            wrong_distance_ii.append(diagonal[i])
print("diagonal: " + str(len(diagonal_d1_ii)))
print("correctos: " + str(len(correct_distance_ii)))
print("incorrectos: " + str(len(wrong_distance_ii)))
In [28]:
# Histogram of the diagonal values above -100: wrong in red, correct in green.
colors = ['red', 'green']
plt.hist([wrong_distance_ii, correct_distance_ii],
         histtype='bar',
         color=colors)
plt.show()
In [29]:
# Split the diagonal entries greater than -1 into correctly and wrongly
# identified users, and print the size of each group.
# NOTE: this rebinds `correct_distance`, which earlier held the list returned
# by get_n_correct_tpm.
#
# np.asarray() guarantees a plain ndarray: if `iden` is an np.matrix,
# .diagonal() would return a 1xN matrix, len() would be 1 and the comparison
# below would be ambiguous.
diagonal = np.asarray(iden).diagonal().copy()
correct_distance = []
wrong_distance = []
diagonal_d1 = []
for i in range(len(diagonal)):
    if diagonal[i] > -1:
        diagonal_d1.append(i)
        if i in correct_indexs:
            correct_distance.append(diagonal[i])
        else:
            wrong_distance.append(diagonal[i])
print("diagonal: " + str(len(diagonal_d1)))
print("correctos: " + str(len(correct_distance)))
print("incorrectos: " + str(len(wrong_distance)))
In [30]:
# Histogram of the diagonal values above -1: wrong in red, correct in green.
colors = ['red', 'green']
plt.hist([wrong_distance, correct_distance],
         histtype='bar',
         color=colors)
plt.show()
In [31]:
# Split the diagonal entries greater than -0.1 into correctly and wrongly
# identified users, and print the size of each group.
# NOTE: this rebinds `correct_distance`, which earlier held the list returned
# by get_n_correct_tpm.
#
# np.asarray() guarantees a plain ndarray: if `iden` is an np.matrix,
# .diagonal() would return a 1xN matrix, len() would be 1 and the comparison
# below would be ambiguous.
diagonal = np.asarray(iden).diagonal().copy()
correct_distance = []
wrong_distance = []
diagonal_d1 = []
for i in range(len(diagonal)):
    if diagonal[i] > -0.1:
        diagonal_d1.append(i)
        if i in correct_indexs:
            correct_distance.append(diagonal[i])
        else:
            wrong_distance.append(diagonal[i])
print("diagonal: " + str(len(diagonal_d1)))
print("correctos: " + str(len(correct_distance)))
print("incorrectos: " + str(len(wrong_distance)))
In [32]:
# Histogram of the diagonal values above -0.1: wrong in red, correct in green.
colors = ['red', 'green']
plt.hist([wrong_distance, correct_distance],
         histtype='bar',
         color=colors)
plt.show()
In [66]:
# Cache the zone-level identification matrix on disk.
# The file is opened in binary mode: pickle streams are bytes, and text mode
# can corrupt them on platforms that translate line endings.
with open('data/iden_matrix_zona.pickle', 'wb') as f:
    pickle.dump(iden_matrix_zona, f)
In [26]:
# Reload the cached zone-level identification matrix into `iden`.
# The file is opened in binary mode, as pickle streams are bytes.
# SECURITY: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
with open('data/iden_matrix_zona.pickle', 'rb') as f:
    iden = pickle.load(f)
In [8]:
# Re-import the project helper module so that edits made to its source are
# picked up by the running kernel (Python 2 builtin `reload`).
reload(auxiliar_functions)
Out[8]:
In [11]:
# Rebuild the first-period profiles and second-period sequences, this time from
# the 'par_subida' / 'par_bajada' columns instead of the zone columns, and
# print the elapsed wall-clock time in seconds.
# NOTE: this overwrites the zone-level `users_profiles` and `profiles`.
start_time = time.time()
users_profiles = tpm_identification.get_profiles(
    first_period_frame['id'],
    first_period_frame['par_subida'],
    first_period_frame['par_bajada'])
profiles = tpm_identification.get_sequences(
    second_period_frame['id'],
    second_period_frame['par_subida'],
    second_period_frame['par_bajada'])
delta_time = time.time() - start_time
print(delta_time)
In [10]:
# Inspect the first user profile to check its structure.
users_profiles[0]
Out[10]:
In [45]:
# Compute the identification matrix for the current profiles/sequences and
# report the wall-clock time (in seconds) the computation took.
start_time = time.time()
iden_paradero = tpm_identification.get_identification_matrix(users_profiles, profiles)
delta_time = time.time() - start_time
print(delta_time)
In [46]:
# Wrap the identification result as a numpy matrix and as a DataFrame.
# NOTE: np.matrix objects are always 2-D, so iden_matrix_paradero.diagonal()
# yields a 1xN matrix rather than a 1-D array.
iden_matrix_paradero = np.matrix(iden_paradero)
df_ident_paradero = pd.DataFrame(iden_matrix_paradero)
In [47]:
# Cache the 'paradero' identification matrix on disk.
# The file is opened in binary mode: pickle streams are bytes, and text mode
# can corrupt them on platforms that translate line endings.
with open('data/iden_matrix_paradero.pickle', 'wb') as f:
    pickle.dump(iden_matrix_paradero, f)
In [73]:
# Reload the cached 'paradero' identification matrix.
# The file is opened in binary mode, as pickle streams are bytes.
# SECURITY: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
with open('data/iden_matrix_paradero.pickle', 'rb') as f:
    iden_matrix_paradero = pickle.load(f)
In [74]:
# Evaluate the 'paradero' identification matrix with the project helper and
# print the share of the `limit` users counted in n_identified, as a
# percentage rounded to two decimals.
(n_identified,
 selected_distance,
 identified_indexs,
 abstenidos,
 correct_indexs,
 correct_distance,
 wrong_indexs,
 wrong_distances) = auxiliar_functions.get_n_correct_tpm(iden_matrix_paradero, limit)
porcentaje_correcto = n_identified * 100.0 / limit
print(str(round(porcentaje_correcto, 2)) + "%")
In [21]:
# Persist the 'paradero' results as consecutive pickles in a single file.
# The order below is the order in which they must be read back.
# The file is opened in binary mode: pickle streams are bytes, and text mode
# can corrupt them on platforms that translate line endings.
with open('data/resultados_alg_1.pickle', 'wb') as f:
    for result in (n_identified,
                   selected_distance,
                   identified_indexs,
                   abstenidos,
                   correct_indexs,
                   correct_distance,
                   wrong_indexs,
                   wrong_distances,
                   iden_matrix_paradero.diagonal()):
        pickle.dump(result, f)
In [5]:
# Read the 'paradero' results back, in the same order they were written.
# The file is opened in binary mode, as pickle streams are bytes.
# SECURITY: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
# NOTE: `diagonal` was written from an np.matrix .diagonal() call, so it comes
# back as a 1xN matrix rather than a 1-D array.
with open('data/resultados_alg_1.pickle', 'rb') as f:
    n_identified = pickle.load(f)
    selected_distance = pickle.load(f)
    identified_indexs = pickle.load(f)
    abstenidos = pickle.load(f)
    correct_indexs = pickle.load(f)
    correct_distance = pickle.load(f)
    wrong_indexs = pickle.load(f)
    wrong_distances = pickle.load(f)
    diagonal = pickle.load(f)
In [6]:
# Count how many correct identifications have a distance of exactly 0.0.
counter = sum(1 for dist in correct_distance if dist == 0.0)
print(counter)
In [7]:
# Count how many wrong identifications have a distance of exactly 0.0.
counter = sum(1 for dist in wrong_distances if dist == 0.0)
print(counter)
In [8]:
# Dimensions (rows, columns) of the 'paradero' identification matrix.
iden_matrix_paradero.shape
In [9]:
# Number of entries in `abstenidos`.
len(abstenidos)
Out[9]:
In [10]:
# Entries in `abstenidos` as a percentage of `limit`. This is true division
# because the first cell imports `division` from __future__.
len(abstenidos)*100/limit
In [11]:
# Number of entries in `correct_indexs`.
len(correct_indexs)
Out[11]:
In [12]:
# Number of entries in `identified_indexs`.
len(identified_indexs)
Out[12]:
In [13]:
# Number of entries in `wrong_indexs`.
len(wrong_indexs)
Out[13]:
In [14]:
# Keep only the wrong-identification distances strictly greater than -800 and
# print how many remain.
# NOTE(review): -800 looks like a floor/sentinel value of the indicator - confirm.
wrong_distances_without_800 = list(filter(lambda dist: dist > -800, wrong_distances))
print(len(wrong_distances_without_800))
In [15]:
# Histogram of the wrong-identification distances above -800 (default bins).
plt.hist(wrong_distances_without_800)
Out[15]:
In [16]:
# Side-by-side histogram (default bins): wrong distances in red, correct in green.
colors = ['red', 'green']
plt.hist([wrong_distances_without_800, correct_distance],
         histtype='bar',
         color=colors)
Out[16]:
In [9]:
# Labelled 30-bin histogram of correct (green) vs. wrong (red) distances,
# with the legend anchored below the axes, exported as 'hist_tpm.eps'.
colors = ['green', 'red']
plt.hist([correct_distance, wrong_distances_without_800],
         bins=30,
         histtype='bar',
         color=colors,
         label=['Correctly recognized users', 'Wrongly recognized users'])
plt.xlabel('Similarity Indicator')
plt.ylabel('Number of Users')
plt.legend(loc='upper center',
           bbox_to_anchor=(0.5, -0.05),
           fancybox=True,
           shadow=True,
           ncol=5)
plt.savefig('hist_tpm.eps', format='eps', dpi=1000)
In [17]:
# Publication-style version of the correct-vs-wrong histogram: frameless axes,
# dashed horizontal grid, small legend outside the plot area, saved as EPS.
from matplotlib.font_manager import FontProperties

# Small font used for the legend text.
fontP = FontProperties()
fontP.set_size('small')

# Bar colours are specified as 0-255 RGB triples and rescaled to 0-1 floats.
colors = [(132, 255, 108), (255, 57, 100)]
colors = [tuple(channel / 255. for channel in rgb) for rgb in colors]

plt.figure()
ax = plt.subplot(111)

# Hide all four borders of the axes.
for side in ("top", "bottom", "right", "left"):
    ax.spines[side].set_visible(False)

# Show ticks only on the bottom and left of the plot.
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

ax.set_xlabel('Similarity Indicator')
ax.set_ylabel('Number of Users')
ax.set_title('Number of users identified with TPM Algorithm by similarity indicator', y=1.08)

# Draw the dashed horizontal grid lines behind the bars.
ax.set_axisbelow(True)
ax.yaxis.grid(color='gray', linestyle='dashed')

# Fixed axis ranges for the plotted data.
plt.xlim(-800.0, 0.0)
plt.ylim(0, 250)

plt.hist([correct_distance, wrong_distances_without_800],
         bins=30,
         histtype='bar',
         color=colors,
         label=['Correctly recognized users', 'Wrongly recognized users'])
plt.legend(loc='lower left',
           fancybox=False,
           shadow=False,
           prop=fontP,
           bbox_to_anchor=(1.0, 0.05))
plt.savefig("indicator correct wrong alg1.eps", format='eps', dpi=1000, bbox_inches="tight")
In [41]:
# Collect the selected distance of every wrongly identified user.
# `counter` is the read position in `selected_distance`; it advances once for
# every index found in either wrong_indexs or correct_indexs.
wrong_distances_selected = []
counter = 0
for i in range(limit):
    if i in wrong_indexs:
        wrong_distances_selected.append(selected_distance[counter])
        counter = counter + 1
    if i in correct_indexs:
        counter = counter + 1
In [42]:
# Number of selected distances collected for wrongly identified users.
len(wrong_distances_selected)
Out[42]:
In [43]:
# Histogram (30 bins) of three series: correct distances (green), wrong
# distances (red) and selected distances of wrong identifications (blue).
colors = ['green', 'red', 'blue']
plt.hist([correct_distance, wrong_distances_without_800, wrong_distances_selected],
         bins=30, histtype='bar', color=colors)
plt.show()
In [46]:
# Selected distances of wrong identifications that are strictly greater than -100.
wd_menor_100 = list(filter(lambda dist: dist > -100, wrong_distances_selected))
In [47]:
# Histogram of the selected wrong distances above -100 (default bins).
plt.hist(wd_menor_100)
Out[47]:
In [73]:
# Frequency table of the identified indexes: (index, occurrences) pairs for
# every index that occurs at least once, most frequent first.
x = np.array(identified_indexs)
y = np.bincount(x)
ii = np.flatnonzero(y)
frequency_identified_indexs = sorted(zip(ii, y[ii]),
                                     key=lambda pair: pair[1],
                                     reverse=True)
frequency_identified_indexs
Out[73]:
In [48]:
# Count the wrong identifications whose selected distance is exactly 0.
#
# The original appended wrong_distances[i], but `i` is a position in
# wd_menor_100 (a filtered list), so it picked unrelated values from another
# list. The position in wd_menor_100 is stored instead; the resulting count
# (the cell output) is unchanged.
wrong_indexs_0 = []
for i in range(len(wd_menor_100)):
    if wd_menor_100[i] == 0:
        wrong_indexs_0.append(i)
# One value in the histogram is below 0.092 but different from 0.
len(wrong_indexs_0)
Out[48]:
Falta encontrar el índice donde se minimiza el error
In [49]:
# Split the diagonal entries greater than -800 into correctly and wrongly
# identified users, print the size of each group and plot both groups.
# NOTE: this rebinds `correct_distance`, which earlier held the list returned
# by get_n_correct_tpm.
#
# iden_matrix_paradero is an np.matrix, whose .diagonal() is a 1xN matrix:
# len() would be 1 and `diagonal[i] > -800` would be an ambiguous array truth
# value. np.asarray() yields a plain ndarray with a 1-D diagonal.
diagonal = np.asarray(iden_matrix_paradero).diagonal().copy()
correct_distance = []
wrong_distance = []
diagonal_d1 = []
for i in range(len(diagonal)):
    if diagonal[i] > -800:
        diagonal_d1.append(i)
        if i in correct_indexs:
            correct_distance.append(diagonal[i])
        else:
            wrong_distance.append(diagonal[i])
print("diagonal: " + str(len(diagonal_d1)))
print("correctos: " + str(len(correct_distance)))
print("incorrectos: " + str(len(wrong_distance)))
colors = ['red', 'green']
plt.hist([wrong_distance, correct_distance], histtype='bar', color=colors)
plt.show()
In [50]:
# Split the diagonal entries greater than -300 into correctly and wrongly
# identified users, and print the size of each group.
#
# `diagonal` comes from an earlier cell and may be a 1xN np.matrix (for
# example when unpickled from 'data/resultados_alg_1.pickle'). It is flattened
# to a 1-D array so that len() and element comparisons behave as intended.
diagonal_values = np.asarray(diagonal).ravel()
correct_distance = []
wrong_distance = []
diagonal_d1 = []
for i in range(len(diagonal_values)):
    if diagonal_values[i] > -300:
        diagonal_d1.append(i)
        if i in correct_indexs:
            correct_distance.append(diagonal_values[i])
        else:
            wrong_distance.append(diagonal_values[i])
print(len(diagonal_d1))
print(len(correct_distance))
print(len(wrong_distance))
In [51]:
# Histogram of the diagonal values above -300: wrong in red, correct in green.
colors = ['red', 'green']
plt.hist([wrong_distance, correct_distance],
         histtype='bar',
         color=colors)
plt.show()
In [ ]: