In [1]:

    
import collections

import pandas as pd
pd.set_option('display.max_columns', 500)

DPAE



In [2]:

    
filename_input = 'LBB_XDPDPA_DPAE_20160307_20170407_20170407_165803_sep.csv'
filename_output = 'LBB_XDPDPA_DPAE_20160307_20170407_20170407_165803_clean.csv'



In [3]:

    
f_input = open(filename_input, 'r')
f_output = open(filename_output, 'w')



In [4]:

    
header_intput = f_input.readline()



In [5]:

    
column_names = header_intput[:-1].split('|')



In [6]:

    
columns_kept = [
 'dc_naf_id',
 'dn_tailleetablissement',
 'kd_dateembauche',
 'dc_typecontrat_id',
 'dd_datefincdd',
 'dc_romev3_1_id',
 'dc_romev3_2_id',
 'nbrjourtravaille',
# 'kn_trancheage',
]
header_output = '|'.join(columns_kept) + '\n'
f_output.write(header_output)









    Out[6]:





128



In [7]:

    
index_kept = [
    column_names.index(column_name)
    for column_name in columns_kept
]
index_kept









    Out[7]:





[1, 5, 7, 8, 9, 10, 11, 20]



In [8]:

    
index_kd_dateembauche = column_names.index('kd_dateembauche')
index_dd_datefincdd = column_names.index('dd_datefincdd')
index_dc_romev3_1_id = column_names.index('dc_romev3_1_id')
index_dc_romev3_2_id = column_names.index('dc_romev3_2_id')
index_dc_naf_id = column_names.index('dc_naf_id')



In [9]:

    
for i, line_input in enumerate(f_input):
    cells = line_input[:-1].split('|')
    
    
    # Modify dates

    kd_dateembauche = cells[index_kd_dateembauche]
    kd_dateembauche = kd_dateembauche[:10]
    cells[index_kd_dateembauche] = kd_dateembauche

    dd_datefincdd = cells[index_dd_datefincdd]
    dd_datefincdd = dd_datefincdd[:10]
    cells[index_dd_datefincdd] = dd_datefincdd
    

    # Remove lines with no ROME
    set_null = {'NULL', 'null', ''}
    dc_romev3_1_id = cells[index_dc_romev3_1_id]
    dc_romev3_2_id = cells[index_dc_romev3_2_id]
    if (dc_romev3_1_id in set_null) and (dc_romev3_2_id in set_null):
        continue
   
    # Remove lines with no NAF
    dc_naf_id = cells[index_dc_naf_id]
    if dc_naf_id in set_null:
        continue
   

    cells_kept = [
        cells[i]
        for i in index_kept
    ]
    line_output = '|'.join(cells_kept) + '\n'
    f_output.write(line_output)
    
    if i % 1000000 == 0:
        print(i)



In [10]:

    
f_input.close()
f_output.close()