In [1]:
import collections

import pandas as pd
pd.set_option('display.max_columns', 500)

DPAE


In [2]:
filename_input = 'LBB_XDPDPA_DPAE_20160307_20170407_20170407_165803_sep.csv'
filename_output = 'LBB_XDPDPA_DPAE_20160307_20170407_20170407_165803_clean.csv'

In [3]:
f_input = open(filename_input, 'r')
f_output = open(filename_output, 'w')

In [4]:
header_intput = f_input.readline()

In [5]:
column_names = header_intput[:-1].split('|')

In [6]:
columns_kept = [
 'dc_naf_id',
 'dn_tailleetablissement',
 'kd_dateembauche',
 'dc_typecontrat_id',
 'dd_datefincdd',
 'dc_romev3_1_id',
 'dc_romev3_2_id',
 'nbrjourtravaille',
# 'kn_trancheage',
]
header_output = '|'.join(columns_kept) + '\n'
f_output.write(header_output)


Out[6]:
128

In [7]:
index_kept = [
    column_names.index(column_name)
    for column_name in columns_kept
]
index_kept


Out[7]:
[1, 5, 7, 8, 9, 10, 11, 20]

In [8]:
index_kd_dateembauche = column_names.index('kd_dateembauche')
index_dd_datefincdd = column_names.index('dd_datefincdd')
index_dc_romev3_1_id = column_names.index('dc_romev3_1_id')
index_dc_romev3_2_id = column_names.index('dc_romev3_2_id')
index_dc_naf_id = column_names.index('dc_naf_id')

In [9]:
for i, line_input in enumerate(f_input):
    cells = line_input[:-1].split('|')
    
    
    # Modify dates

    kd_dateembauche = cells[index_kd_dateembauche]
    kd_dateembauche = kd_dateembauche[:10]
    cells[index_kd_dateembauche] = kd_dateembauche

    dd_datefincdd = cells[index_dd_datefincdd]
    dd_datefincdd = dd_datefincdd[:10]
    cells[index_dd_datefincdd] = dd_datefincdd
    

    # Remove lines with no ROME
    set_null = {'NULL', 'null', ''}
    dc_romev3_1_id = cells[index_dc_romev3_1_id]
    dc_romev3_2_id = cells[index_dc_romev3_2_id]
    if (dc_romev3_1_id in set_null) and (dc_romev3_2_id in set_null):
        continue
   
    # Remove lines with no NAF
    dc_naf_id = cells[index_dc_naf_id]
    if dc_naf_id in set_null:
        continue
   

    cells_kept = [
        cells[i]
        for i in index_kept
    ]
    line_output = '|'.join(cells_kept) + '\n'
    f_output.write(line_output)
    
    if i % 1000000 == 0:
        print(i)


2000000
3000000
4000000
5000000
8000000
9000000
10000000
11000000
14000000
15000000
16000000
17000000
20000000
21000000
22000000
23000000
26000000
27000000
28000000
29000000
31000000
32000000
37000000
38000000
39000000
40000000
41000000
43000000
44000000
46000000
47000000
49000000
50000000
51000000

In [10]:
f_input.close()
f_output.close()