In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import time
import datetime as dt
import pickle
import numpy as np
from itertools import chain, combinations
import random
import scipy as sp
from __future__ import division
from dict_stops import *
import pandas as pd
import os
import csv
In [3]:
# April 2013 trip legs (file name suggests: ids present across all years, 10-100000 sample).
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
frame = pd.read_csv('/home/cata/Documentos/Datois/etapas_2013_abril_allyearsids_10_100000.csv')
In [4]:
# September 2013 trip legs, same sampling as the April file above.
frame_2 = pd.read_csv('/home/cata/Documentos/Datois/etapas_2013_septiembre_allyearsids_10_100000.csv')
In [5]:
# Platform-dependent locations of the subway-station dictionary CSV and the
# output directory for sequence CSVs.
if os.name == 'nt':
    # Use a raw string / escaped backslashes: plain 'C:\Users\...' is a
    # SyntaxError on Python 3 ('\U' starts a unicode escape) and is fragile
    # in general.  The resulting strings are identical to the originals.
    path_subway_dictionary = r'C:\Users\catalina\Documents\Datois\Diccionario-EstacionesMetro.csv'
    path_csv_sequences = 'C:\\Users\\catalina\\Documents\\sequences\\'
else:
    path_subway_dictionary = '/home/cata/Documentos/Datois/Diccionario-EstacionesMetro.csv'
    path_csv_sequences = '/home/cata/Documentos/sequences/'
# Loads the metro stations into a dictionary.
def load_metro_dictionary(path=None):
    """Load the subway-station dictionary from its semicolon-delimited CSV.

    Maps column index 5 to column index 7 of each row (presumably a stop
    code -> station name mapping — confirm against the CSV header).

    path: optional CSV location; defaults to the module-level
          ``path_subway_dictionary`` so existing callers are unaffected.
    Returns a dict built from every row of the file.
    """
    if path is None:
        path = path_subway_dictionary
    # The original initialised an empty dict that was immediately
    # overwritten; the comprehension alone is sufficient.
    with open(path, mode='r') as infile:
        reader = csv.reader(infile, delimiter=';')
        return {row[5]: row[7] for row in reader}
In [6]:
# Standardises the boarding and alighting stop values.
def update_vals(row,data = load_metro_dictionary()):
    """Replace ``par_subida``/``par_bajada`` values found in the metro
    dictionary with their mapped value; rows not in the dictionary are
    left untouched.

    NOTE: the default ``data`` is evaluated once, when this ``def`` runs,
    so the dictionary CSV is read a single time (not once per row) — but
    it does mean defining this function performs file I/O.
    Intended for ``frame.apply(update_vals, axis=1)``; returns the row.
    """
    if row.par_subida in data:
        row.par_subida = data[row.par_subida]
    if row.par_bajada in data:
        row.par_bajada = data[row.par_bajada]
    return row
In [7]:
# Looks up one coordinate for a row's boarding/alighting stop.
def add_vals(row,latlong,paradero,data = dict_latlong_stops):
    """Return the ``latlong`` coordinate ('lat' or 'long') of the stop named
    in column ``paradero`` of ``row``, taken from ``data``
    (``dict_latlong_stops`` by default); ``np.nan`` when the stop is unknown.
    """
    stop = row[paradero]
    if stop not in data:
        return np.nan
    return data[stop][latlong]
In [8]:
def frame_config(frame):
    """Standardise one raw trip-legs frame: parse timestamps, canonicalise
    metro stop names, attach stop coordinates and derived time columns.

    Returns the transformed frame, sorted by (id, tiempo_subida).
    """
    # Parse boarding/alighting timestamps.
    frame['tiempo_subida'] = pd.to_datetime(frame.tiempo_subida)
    frame['tiempo_bajada'] = pd.to_datetime(frame.tiempo_bajada)
    # Canonicalise metro stop codes row by row (see update_vals).
    frame = frame.apply(update_vals, axis=1)
    frame['weekday'] = frame.tiempo_subida.dt.dayofweek
    # Attach coordinates from dict_latlong_stops; unknown stops get NaN.
    frame['lat_subida'] = frame.apply(add_vals,args=('lat','par_subida'),axis=1)
    frame['lat_bajada'] = frame.apply(add_vals,args=('lat','par_bajada'),axis=1)
    frame['long_subida'] = frame.apply(add_vals,args=('long','par_subida'),axis=1)
    frame['long_bajada'] = frame.apply(add_vals,args=('long','par_bajada'),axis=1)
    frame = frame.sort_values(by=['id', 'tiempo_subida'])
    # Gap between consecutive boardings in the sorted frame.
    # NOTE(review): shift() crosses user boundaries, so each user's first leg
    # gets the gap to the PREVIOUS user's last boarding; if a per-user gap is
    # intended this should use groupby('id') before shifting — confirm.
    frame['diferencia_tiempo'] = (frame['tiempo_subida']-frame['tiempo_subida'].shift()).fillna(0)
    return frame
In [9]:
def hour_to_seconds(an_hour):
    """Convert a time-like object (with .hour/.minute/.second) to the
    integer number of seconds elapsed since midnight."""
    total = (an_hour.hour * 60 + an_hour.minute) * 60 + an_hour.second
    return int(total)
In [10]:
# Run the full cleaning/enrichment pipeline on the April frame.
frame = frame_config(frame)
In [11]:
# Peek at the configured frame.
frame.head()
Out[11]:
In [12]:
# Column dtypes and non-null counts after configuration.
frame.info()
In [13]:
# Same pipeline for the September frame.
frame_2 = frame_config(frame_2)
In [14]:
# Boarding stops present in the data but missing from the lat/long dictionary
# (non-null stop name, null latitude after the coordinate join).
paraderos_sinlatlong = frame_2['par_subida'][frame_2['lat_subida'].isnull()& frame_2['par_subida'].notnull()].unique()
In [16]:
# Show the stops that could not be geolocated.
paraderos_sinlatlong
Out[16]:
In [84]:
# Drop legs whose boarding stop has no coordinates.
frame_2 = frame_2[frame_2.lat_subida.notnull()]
In [54]:
# Modal (most frequent) value and its count, usable as agg functions.
# NOTE(review): f and g are not used by the aggregation below — kept here,
# presumably for ad-hoc use; confirm they are still needed.
from scipy.stats.mstats import mode
f = lambda x: mode(x, axis=None)[0][0]
g = lambda x: mode(x,axis=None)[1][0]
aggregations = {
    'tiempo_subida': "count"  # count of legs per group (any non-null column would do)
}
# Trip counts per boarding stop / transport mode, and per OD pair.
a_group = frame.groupby(['par_subida','tipo_transporte']).agg(aggregations)
another_group = frame.groupby(['par_subida','par_bajada','lat_subida','tipo_transporte']).agg(aggregations)
In [57]:
# Top 100 OD pairs by trip count.
sorted_group = another_group.sort_values('tiempo_subida',ascending=False)
sorted_group.head(100)
Out[57]:
In [25]:
# Persist the sorted OD counts.
sorted_group.to_csv('od.csv')
In [28]:
# Peek at the (unsorted) OD counts.
another_group.head()
Out[28]:
In [12]:
# Trip counts per boarding stop and transport mode, busiest first.
groupie_group = frame.groupby(['par_subida','tipo_transporte']).agg({'tiempo_subida':"count"})
sorted_par_subidas = groupie_group.sort_values('tiempo_subida',ascending=False)
sorted_par_subidas.to_csv('origin_.csv')
sorted_par_subidas.head()
Out[12]:
In [13]:
# Trip counts per alighting stop, busiest first.
# NOTE(review): unlike the origin version above this does not split by
# tipo_transporte — confirm the asymmetry is intentional.
groupie_group = frame.groupby(['par_bajada']).agg({'tiempo_subida':"count"})
sorted_par_bajadas = groupie_group.sort_values('tiempo_subida',ascending=False)
sorted_par_bajadas.to_csv('destination_.csv')
sorted_par_bajadas.head()
Out[13]:
In [46]:
# Persist only the stops with more than 10 trips.
sorted_par_subidas[sorted_par_subidas['tiempo_subida']>10].to_csv('origin_10.csv')
sorted_par_bajadas[sorted_par_bajadas['tiempo_subida']>10].to_csv('destination_10.csv')
In [15]:
# Load the stored index lists (two sequential pickle.load calls read the two
# objects written to the same file).
# Pickle files must be opened in BINARY mode: required on Python 3,
# harmless on Python 2.  Only load pickles from trusted sources —
# pickle.load can execute arbitrary code.
with open('correct_and_wrong_indexs_alg1.pickle', 'rb') as f:
    correct_alg1 = pickle.load(f)
    wrong_alg1 = pickle.load(f)
In [16]:
# Load the index -> user id mapping.
# Binary mode is required for pickle on Python 3 (harmless on Python 2).
with open('index_id_users.pickle', 'rb') as f:
    users_id = pickle.load(f)
In [17]:
# Map the stored indexes back to user ids.
# BUG FIX: the original iterated `range(len(correct_alg1))` / `range(len(wrong_alg1))`
# and appended `users_id[i]`, which gave BOTH lists the same leading slice of
# users_id instead of the ids at the stored indexes.
# (Assumes correct_alg1 / wrong_alg1 hold indexes into users_id, as their
# source filename 'correct_and_wrong_indexs_alg1' suggests — confirm.)
correct_alg1_ids = [users_id[i] for i in correct_alg1]
wrong_alg1_ids = [users_id[i] for i in wrong_alg1]
In [30]:
def write_csv_grouped_data(a_frame,name,threshold):
    """Write per-stop trip counts to '<name>_<threshold>_destination.csv'
    (grouped by par_bajada/tipo_transporte) and '<name>_<threshold>_origin.csv'
    (grouped by par_subida/tipo_transporte), keeping only groups whose count
    exceeds ``threshold``.  Returns the FULL (unthresholded) origin counts,
    sorted descending.
    """
    def counted(columns):
        # Count legs per group and sort busiest-first.
        grouped = a_frame.groupby(columns).agg({'tiempo_subida': "count"})
        return grouped.sort_values('tiempo_subida', ascending=False)

    destination_counts = counted(['par_bajada', 'tipo_transporte'])
    above = destination_counts['tiempo_subida'] > threshold
    destination_counts[above].to_csv(name + '_' + str(threshold) + '_destination.csv')

    origin_counts = counted(['par_subida', 'tipo_transporte'])
    above = origin_counts['tiempo_subida'] > threshold
    origin_counts[above].to_csv(name + '_' + str(threshold) + '_origin.csv')
    return origin_counts
In [31]:
# Full April counts (no threshold, unnamed output prefix).
ff = write_csv_grouped_data(frame,'',0)
In [85]:
# Full September counts (no threshold, unnamed output prefix).
write_csv_grouped_data(frame_2,'',0)
Out[85]:
In [37]:
# Full-frame display; pandas truncates the repr, but prefer frame.head()
# to keep the saved notebook light.
frame
Out[37]:
In [19]:
# Stop counts restricted to users the algorithm got wrong.
write_csv_grouped_data(frame[frame['id'].isin(wrong_alg1_ids)],'wrong_alg1',1)
In [20]:
# Stop counts restricted to users the algorithm got right.
write_csv_grouped_data(frame[frame['id'].isin(correct_alg1_ids)],'correct_alg1',1)
In [23]:
# Legs with netapa == 1 — presumably single-leg journeys, i.e. no
# transfers ('n etapas' = number of legs); confirm against the data dictionary.
without_transbordors_frame = frame[frame['netapa']==1]
In [24]:
# Same right/wrong breakdown as above, but restricted to transfer-free legs.
write_csv_grouped_data(without_transbordors_frame[without_transbordors_frame['id'].isin(wrong_alg1_ids)],'wrong_alg1_wo_tr',1)
write_csv_grouped_data(without_transbordors_frame[without_transbordors_frame['id'].isin(correct_alg1_ids)],'correct_alg1_wo_tr',1)
In [ ]: