In [ ]:
"""
Clean CARRIS database
"""
In [1]:
from senpy.pgsql import get_row_number
from senpy.pgsql.fields import drop_column
from senpy.pgsql.fields import split_column_value_into_columns
from senpy.pgsql.fields import get_columns_name
from senpy.pgsql.sanitize import replace_char_in_col
from senpy.pgsql.data._del import drop_table_data
from senpy.pgsql.data.w import create_table_by_query
In [2]:
conParam = {
"HOST" : "localhost", "PORT" : "5432", "PASSWORD" : "admin",
"USER" : "postgres", "DATABASE" : "valcarris_v2"
}
val_trips_table = "validacoes"
fields_to_delete = ["nserie", "descricao", "nchapa", "nviagem"]
routes_fields = {
"carreira" : "text", "variante" : "text",
"sentido" : "text", "ordem" : "integer"
}
route_fields_order = ["carreira", "variante", "sentido"]
route_table = "percursos_pnt_geom"
routeID_relation = {"carreira": "carreira", "sentido": "sentido", "variante" : "variante", "ordem" : "ordem"}
stops_inRouteTable = "paragem"
card_field = "ncartao"
null_card_value = "''"
dayhour_field = "datahora"
day_field = "dia"
hour_field = "hora"
minute_field = "minuto"
second_field = "segundo"
In [ ]:
"""
Conta numero de linhas da tabela val_trips_table
"""
print get_row_number(conParam, val_trips_table)
In [ ]:
"""
1) Elimina colunas que nao serao necessarias
"""
drop_column(conParam, val_trips_table, fields_to_delete)
In [ ]:
"""
2) Elimina valores nulos ou vazios ('' para texto ou 0 para numericos)
nas colunas onde se encontra a informacao
relacionada com os percursos
"""
for f in routes_fields:
whr = "{fld}='' OR {fld} IS NULL" if routes_fields[f] == "text" else "{fld}=0 OR {fld} IS NULL"
print "Deleting empty values from {} column using the WHERE statment {}".format(f, whr.format(fld=f))
drop_table_data(conParam, val_trips_table, whr.format(fld=f))
In [ ]:
"""
Conta numero de linhas da tabela val_trips_table
"""
print get_row_number(conParam, val_trips_table)
In [ ]:
"""
3) Elimina espacos em branco dos campos com informacao
relacionada com os percursos
"""
nv_table = "{}_v2".format(val_trips_table)
replace_char_in_col(
conParam, val_trips_table, [c for c in routes_fields if routes_fields[c] == "text"],
" ", "", nv_table
)
In [ ]:
"""
Conta numero de linhas da nova versao da tabela val_trips_table
"""
print get_row_number(conParam, nv_table)
In [ ]:
"""
4) Exclui linhas com percurso que não se encontra na tabela dos percursos e
obtem paragem onde a validacao foi registada
"""
nv_table = "{}_v2".format(val_trips_table)
fld_relation = " AND ".join("{}.{} = {}.{}".format(
nv_table, i, route_table, routeID_relation[i]
) for i in routeID_relation)
create_table_by_query(
conParam, "{}_v3".format(val_trips_table),
("SELECT {valT}.*, {routeT}.{stops} FROM {valT} INNER JOIN {routeT} ON "
"{onq}").format(
valT=nv_table, routeT=route_table, onq=fld_relation,
stops=stops_inRouteTable
)
)
nv_table = "{}_v3".format(val_trips_table)
In [ ]:
"""
Conta numero de linhas da nova versao da tabela val_trips_table
"""
print get_row_number(conParam, nv_table)
In [ ]:
"""
5) Exclui validacoes sem numero de cartao associado
"""
drop_table_data(conParam, nv_table, "{f}={v} OR {f} IS NULL".format(f=card_field, v=null_card_value))
In [ ]:
"""
Conta numero de linhas da nova versao da tabela val_trips_table
"""
print get_row_number(conParam, nv_table)
In [ ]:
"""
6) Parte campo com data e hora em dia, hora, minuto, segundo, sendo estes ultimos tres
campos com valores inteiros
"""
split_column_value_into_columns(
conParam, nv_table, dayhour_field, " ",
[day_field, "time"], "{}_v4".format(val_trips_table)
)
split_column_value_into_columns(
conParam,"{}_v4".format(val_trips_table), "time", ":",
[hour_field, minute_field, second_field],
"{}_v5".format(val_trips_table)
)
cols = [c for c in get_columns_name(
conParam, "{}_v5".format(val_trips_table)
) if c != hour_field and c != minute_field and c != second_field]
create_table_by_query(
conParam, "{}_v6".format(val_trips_table),
("SELECT {_cols}, CAST({hf} AS integer) AS {hf}, "
"CAST({mf} AS integer) AS {mf}, CAST({sf} AS integer) AS {sf} "
"FROM {t}_v5").format(
_cols=", ".join(cols), hf=hour_field, mf=minute_field,
sf=second_field, t=val_trips_table
)
)
In [ ]:
nv_table = "{}_v6".format(val_trips_table)
In [ ]:
"""
7) Cria campo com horas em decimal
"""
create_table_by_query(
conParam, "{}_v7".format(val_trips_table),
("SELECT *, ({hf} + ({mf} / 60.0) + ({sf} / 3600.0)) AS hour_dec "
"FROM {t}").format(
hf=hour_field, mf=minute_field, sf=second_field, t=nv_table
)
)
In [ ]:
nv_table = "{}_v7".format(val_trips_table)
In [ ]:
"""
8) Cria um campo único que identifica claramente o percurso a que a linha se refere
"""
from senpy.pgsql.fields import text_columns_to_column
text_columns_to_column(
conParam, nv_table, route_fields_order, '|', "route_id", "{}_v8".format(val_trips_table)
)
In [7]:
nv_table = "{}_v8".format(val_trips_table)
In [5]:
"""
Conta numero de linhas da nova versao da tabela val_trips_table
"""
print get_row_number(conParam, "{}_v8".format(val_trips_table))
In [9]:
"""
9) Elimina validacoes de um cartao na mesma linha/paragem quando estas foram feitas
com uma diferenca menor ou igual a tres minutos
"""
from senpy.pgsql.time import del_rows_by_temporal_proximity
del_rows_by_temporal_proximity(
conParam, nv_table, [card_field, "route_id", stops_inRouteTable],
day_field, hour_field, "hour_dec", minute_field,
second_field, 3, "{}_v9".format(val_trips_table),
exclusionRows="time_errors"
)
In [10]:
"""
Conta numero de linhas da nova versao da tabela val_trips_table
"""
print get_row_number(conParam, "{}_v9".format(val_trips_table))
In [ ]: