In [10]:
import pandas as pd

# Load files
fileNames = {
    "recogidas" : {"fileName" : "Recogidas.csv", "sep" : ";"},
    "elementos" : {"fileName" : "elementos.csv", "sep" : ";"},
    "posiciones" : {"fileName": "posiciones.csv", "sep" : "\t"},
    "vehiculos" : {"fileName": "vehículos.csv", "sep" : ";"},
    "x_ele_tra" : {"fileName": "x_ele_tra.csv", "sep" : ";"},
    "categoria" : {"fileName": "categoria.csv", "sep" : ";"},
    "transponders" : {"fileName": "transponders.csv", "sep" : ";"},
    "x_cat_ele" : {"fileName": "x_cat_ele.csv", "sep" : ";"}
}
basePath = "/Users/guillermoblascojimenez/Documents/codeProjects/datathon_sant_cugat/dataset/"
for v in fileNames.values():
    v["fileName"] = basePath + v["fileName"]
loadDataFrame = lambda(f) : pd.read_csv(filepath_or_buffer =f['fileName'], header=0, sep=f['sep'])
dfs = {k : loadDataFrame(v) for k,v in fileNames.iteritems()}
dfs['posiciones'][['POS_LATITUD','POS_LONGITUD']] = dfs['posiciones'][['POS_LATITUD','POS_LONGITUD']].applymap(lambda x: x.replace(',','.'))
dfs['posiciones'][['POS_LATITUD','POS_LONGITUD']] = dfs['posiciones'][['POS_LATITUD','POS_LONGITUD']].astype(float)

In [11]:
# Clean files

# remove null columns
countNotNullsForCol = lambda df, col: len(df[df[col].notnull()].index)
countNotNullsForCol(dfs['vehiculos'], 'VEH_CALCA')
for k,v in dfs.iteritems():
    for c in v.columns:
        if countNotNullsForCol(v,c) is 0:
            v.drop(c, 1, inplace=True)

In [61]:
dfs['categoria'].columns


Out[61]:
Index([u'CAT_ID', u'CAT_CLA_ID', u'CAT_CODIGO', u'CAT_TRE_ID', u'CAT_ICO_ID'], dtype='object')

In [62]:
dfs['elementos']
pd.merge(dfs['elementos'], dfs['categoria'], how='left', left_on='ELE_ICO_ID', right_on='CAT_ICO_ID')


Out[62]:
ELE_ID ELE_COD1 ELE_DESCRIPCION ELE_FECHA_ALTA ELE_FECHA_BAJA ELE_ICO_ID ELE_LONGITUD ELE_LATITUD ELE_COLOR ELE_RADIO ELE_INFO CAT_ID CAT_CLA_ID CAT_CODIGO CAT_TRE_ID CAT_ICO_ID
0 1 SC000000001 1 2011-01-01 00:00:00.000 NaN 170 2,065983 41,47662 #22DD22 20 RUTA R3A 8 1 REB3200 41 170
1 2 SC000000002 2 2011-01-01 00:00:00.000 NaN 170 2,068571 41,47731 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
2 3 SC000000003 3 2011-01-01 00:00:00.000 NaN 170 2,058171 41,46426 #22DD22 20 RUTA R3A 8 1 REB3200 41 170
3 4 SC000000004 4 2011-01-01 00:00:00.000 NaN 170 2,05952 41,46452 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
4 5 SC000000005 5 2011-01-01 00:00:00.000 NaN 170 2,062084 41,46376 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
5 6 SC000000006 6 2011-01-01 00:00:00.000 NaN 170 2,063481 41,46331 #22DD22 20 RUTA R3A 8 1 REB3200 41 170
6 7 SC000000007 7 2011-01-01 00:00:00.000 NaN 170 2,061429 41,46455 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
7 8 SC000000008 8 2011-01-01 00:00:00.000 NaN 170 2,062434 41,46495 #22DD22 20 RUTA R3B 8 1 REB3200 41 170
8 9 SC000000009 9 2011-01-01 00:00:00.000 NaN 170 2,061972 41,4659 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
9 10 SC000000010 10 2011-01-01 00:00:00.000 NaN 170 2,05845 41,46489 #22DD22 20 RUTA R3B 8 1 REB3200 41 170
10 11 SC000000011 11 2011-01-01 00:00:00.000 NaN 170 2,061007 41,46619 #22DD22 20 RUTA R3B 8 1 REB3200 41 170
11 12 SC000000012 12 2011-01-01 00:00:00.000 NaN 170 2,061822 41,46641 #22DD22 20 RUTA R3A 8 1 REB3200 41 170
12 13 SC000000013 13 2011-01-01 00:00:00.000 NaN 170 2,062397 41,46306 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
13 14 SC000000014 14 2011-01-01 00:00:00.000 NaN 170 2,059583 41,46412 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
14 15 SC000000015 15 2011-01-01 00:00:00.000 NaN 170 2,058694 41,46416 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
15 16 SC000000016 16 2011-01-01 00:00:00.000 NaN 170 2,059134 41,46331 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
16 17 SC000000017 17 2011-01-01 00:00:00.000 NaN 170 2,061173 41,46283 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
17 18 SC000000018 18 2011-01-01 00:00:00.000 NaN 170 2,062448 41,46242 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
18 19 SC000000019 19 2011-01-01 00:00:00.000 NaN 170 2,063736 41,46179 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
19 20 SC000000020 20 2011-01-01 00:00:00.000 NaN 170 2,064478 41,46225 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
20 21 SC000000021 21 2011-01-01 00:00:00.000 NaN 170 2,063936 41,46111 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
21 22 SC000000022 22 2011-01-01 00:00:00.000 NaN 170 2,064094 41,47167 #22DD22 20 RUTA R3B 8 1 REB3200 41 170
22 23 SC000000023 23 2011-01-01 00:00:00.000 NaN 170 2,061493 41,47289 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
23 24 SC000000024 24 2011-01-01 00:00:00.000 NaN 170 2,06785 41,47329 #22DD22 20 RUTA R3B 8 1 REB3200 41 170
24 25 SC000000025 25 2011-01-01 00:00:00.000 NaN 170 2,067467 41,47159 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
25 26 SC000000026 26 2011-01-01 00:00:00.000 NaN 170 2,065841 41,47065 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
26 27 SC000000027 27 2011-01-01 00:00:00.000 NaN 170 2,065844 41,47216 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
27 28 SC000000028 28 2011-01-01 00:00:00.000 NaN 170 2,06568 41,473 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
28 29 SC000000029 29 2011-01-01 00:00:00.000 NaN 170 2,069548 41,47009 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
29 30 SC000000030 30 2011-01-01 00:00:00.000 NaN 170 2,066324 41,46884 #22DD22 20 RUTA R3C 8 1 REB3200 41 170
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2640 2546 2545 NaN 2013-12-17 00:00:00.000 NaN 168 2,062398 41,46305 NaN NaN NaN 6 1 PAP3200 38 168
2641 2547 2546 NaN 2013-12-17 00:00:00.000 NaN 169 2,062398 41,46305 NaN NaN NaN 10 1 VIDRE2200 43 169
2642 2548 2547 NaN 2011-03-21 00:00:00.000 2011-03-21 00:00:00.000 171 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2643 2549 2548 NaN 2011-03-21 00:00:00.000 2011-03-21 00:00:00.000 171 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2644 2550 Sense TAG 1 NaN 2011-03-21 00:00:00.000 NaN NaN NaN NaN NaN NaN NaN 1 2 vias.generico NaN NaN
2645 2551 Sense TAG 2 NaN 2011-03-21 00:00:00.000 NaN NaN NaN NaN NaN NaN NaN 1 2 vias.generico NaN NaN
2646 2552 2551 NaN 2014-01-25 00:00:01.000 NaN 170 2,067866 41,47712 NaN NaN NaN 8 1 REB3200 41 170
2647 2553 2552 NaN 2014-01-25 00:00:01.000 NaN 199 2,067866 41,47712 NaN NaN NaN 4 1 FORM2200 42 199
2648 2554 2553 NaN 2014-02-22 00:00:00.000 NaN 167 2,090001 41,47677 NaN NaN NaN 2 1 ENV3200 36 167
2649 2555 2554 NaN 2014-02-22 00:00:00.000 NaN 168 2,090001 41,47677 NaN NaN NaN 6 1 PAP3200 38 168
2650 2556 2555 NaN 2014-02-22 00:00:00.000 NaN 169 2,090001 41,47677 NaN NaN NaN 10 1 VIDRE2200 43 169
2651 2557 2556 NaN 2014-03-26 00:00:00.000 NaN 170 2,041508 41,47141 NaN NaN NaN 8 1 REB3200 41 170
2652 2558 2557 NaN 2014-03-26 00:00:01.000 NaN 199 2,041508 41,47141 NaN NaN NaN 4 1 FORM2200 42 199
2653 2559 2558 NaN 2014-03-26 00:00:01.000 NaN 168 2,041508 41,47141 NaN NaN NaN 6 1 PAP3200 38 168
2654 2560 2559 NaN 2014-03-26 00:00:01.000 NaN 167 2,041508 41,47141 NaN NaN NaN 2 1 ENV3200 36 167
2655 2561 2560 NaN 2014-03-26 00:00:01.000 NaN 169 2,041508 41,47141 NaN NaN NaN 10 1 VIDRE2200 43 169
2656 2562 2561 NaN 2014-03-24 00:00:01.000 NaN 167 2,067694 41,46317 NaN NaN NaN 2 1 ENV3200 36 167
2657 2563 2562 NaN 2014-03-24 00:00:01.000 NaN 168 2,067694 41,46317 NaN NaN NaN 6 1 PAP3200 38 168
2658 2564 2563 NaN 2014-03-24 00:00:01.000 NaN 169 2,067694 41,46317 NaN NaN NaN 10 1 VIDRE2200 43 169
2659 2565 3002 NaN 2014-06-16 16:00:00.000 NaN 170 2,076817 41,46409 NaN NaN NaN 8 1 REB3200 41 170
2660 2566 3003 NaN 2014-07-08 00:00:00.000 NaN 170 2,067461 41,45784 NaN NaN NaN 8 1 REB3200 41 170
2661 2567 3004 NaN 2014-09-01 00:00:00.000 2014-10-30 00:00:00.000 170 2,06682 41,4488 NaN NaN NaN 8 1 REB3200 41 170
2662 2568 3005 NaN 2014-11-04 00:00:01.000 NaN 166 2,089479 41,47503 NaN NaN NaN 9 1 REBSOT 41 166
2663 2568 3005 NaN 2014-11-04 00:00:01.000 NaN 166 2,089479 41,47503 NaN NaN NaN 14 1 GEN�RIC 46 166
2664 2569 3006 NaN 2014-11-06 12:04:52.000 NaN 170 2,081754 41,47278 NaN NaN NaN 8 1 REB3200 41 170
2665 2570 3007 NaN 2014-11-06 12:05:45.000 NaN 199 2,081754 41,47278 NaN NaN NaN 4 1 FORM2200 42 199
2666 2571 3008 NaN 2014-11-06 12:06:26.000 NaN 167 2,081754 41,47278 NaN NaN NaN 2 1 ENV3200 36 167
2667 2572 3009 NaN 2014-11-06 12:06:56.000 NaN 168 2,081754 41,47278 NaN NaN NaN 6 1 PAP3200 38 168
2668 2573 3010 NaN 2014-11-06 12:07:28.000 NaN 169 2,081754 41,47278 NaN NaN NaN 10 1 VIDRE2200 43 169
2669 2574 3011 NaN 2014-12-29 00:00:00.000 NaN 170 2,076817 41,46409 NaN NaN NaN 8 1 REB3200 41 170

2670 rows × 16 columns


In [9]:
dfs['posiciones'][dfs['posiciones']['POS_VEH_ID'] == 77][['POS_LATITUD','POS_LONGITUD','POS_FECHAHORA']].to_csv("/Users/guillermoblascojimenez/Documents/codeProjects/datathon_sant_cugat/dataset/ruta_77.csv")

In [19]:
dfs['posiciones'][['POS_LATITUD','POS_LONGITUD']].applymap(lambda x: x.replace(',','.'))


Out[19]:
POS_LATITUD POS_LONGITUD
0 41.52687 2.10266
1 41.47936 2.09492
2 41.35049 -1.64004
3 41.47969 2.09539
4 41.47934 2.09488
5 41.47939 2.09514
6 41.47945 2.09513
7 41.47216 2.08819
8 41.47966 2.095074
9 41.47966 2.095091
10 41.47963 2.095445
11 41.47963 2.095448
12 41.52475 2.10048
13 41.46928 2.07711
14 41.35088 -1.64103
15 41.47969 2.09539
16 41.47936 2.09492
17 41.47934 2.09488
18 41.47939 2.09514
19 41.47945 2.09513
20 41.35097 -1.64128
21 41.47199 2.08892
22 41.47935 2.094973
23 41.47934 2.094945
24 41.52253 2.09858
25 41.46928 2.07711
26 41.52229 2.09854
27 41.35116 -1.64183
28 41.47969 2.09539
29 41.47936 2.09492
... ... ...
19969 41.4601 2.053967
19970 41.45977 2.051575
19971 41.43259 2.077775
19972 41.4326 2.077759
19973 41.43258 2.077757
19974 41.43259 2.077739
19975 41.46652 2.06707
19976 41.48742 2.0661
19977 41.47558 2.068618
19978 41.4769 2.069063
19979 41.45082 2.0732
19980 41.43129 2.08857
19981 41.47581 2.07014
19982 41.47872 2.08633
19983 41.4783 2.08697
19984 41.43134 2.08862
19985 41.4758 2.0703
19986 41.44762 2.07159
19987 41.44001 2.071512
19988 41.44064 2.071522
19989 41.44064 2.071541
19990 41.46951 2.07944
19991 41.48576 2.06623
19992 41.47137 2.07775
19993 41.47837 2.08684
19994 41.44588 2.076448
19995 41.44588 2.076591
19996 41.44699 2.076259
19997 41.44797 2.075562
19998 41.44804 2.075487

19999 rows × 2 columns


In [9]:
import numpy as np
i = 0
j = 100
def process(i,j, K=0.00000000000001):
    points = dfs['posiciones'][['POS_ID','POS_LONGITUD','POS_LATITUD']]

    #LON = 2
    #LAT = 41
    #px = points['POS_LONGITUD'] - LON
    #f = lambda lon, lat: lambda row: (row['POS_LONGITUD'] - lon) + (row['POS_LATITUD'] - lat)
    #points.apply(f(LON,LAT),1)
    points2 = points[:]
    points2.columns = ['POS_ID_2','POS_LONGITUD_2', 'POS_LATITUD_2']
    points = points[i:j]
    points[0] = points2[0] = 0
    m = pd.merge(points, points2, how='outer', on=0)
    m['d'] = np.power(m['POS_LONGITUD'] - m['POS_LONGITUD_2'],2) + np.power(m['POS_LATITUD'] - m['POS_LATITUD_2'],2)
    m['not_itself'] =  m['POS_ID'] != m['POS_ID_2']
    m['edge_candidate'] = m['d'] < K
    m['edge'] = m.apply(lambda x: x['not_itself'] and x['edge_candidate'], axis=1)
    m = m[m['edge']][['POS_ID','POS_LONGITUD','POS_LATITUD','POS_ID_2','POS_LONGITUD_2','POS_LATITUD_2']]
    fileName = basePath + 'edges_' + str(i) + '_' + str(j) + ".csv"
    m.to_csv(fileName)
    print(len(m.index))

In [8]:
process(0,100)


#for i in range(0,len(dfs['posiciones'].index),1000):
#    print i
#    process(i,i+1000)


1292

In [ ]: