In [1]:
import collections
import pandas as pd
# Raise pandas' display limit so that frames with up to 500 columns are shown
# without truncation.
pd.set_option('display.max_columns', 500)
In [2]:
# Source file: the header and rows are split on '|' by the cells below.
filename_input = 'LBB_EOF_OFFRES_20160307_20170407_20170407_191410_sep.csv'
# Destination file: receives only the selected columns of the retained rows.
filename_output = 'LBB_EOF_OFFRES_20160307_20170407_20170407_191410_clean.csv'
In [3]:
# Open the source for reading and the destination for writing. Mode 'w'
# truncates any existing file with the output name.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches the encoding of the input file.
# Both handles stay open across cells and must be closed explicitly once the
# processing is finished.
f_input = open(filename_input, 'r')
f_output = open(filename_output, 'w')
In [4]:
# Consume the first line of the input (the header row). Later iteration over
# f_input therefore starts at the first data row.
# NOTE(review): "intput" is a typo; the name is kept because the next cell
# reads it.
header_intput = f_input.readline()
In [5]:
# Split the header row into its pipe-separated column names. Only the line
# terminator is stripped: slicing off the last character unconditionally would
# truncate the final column name if the header line had no trailing newline.
column_names = header_intput.rstrip('\n').split('|')
In [6]:
# Names of the input columns copied to the output file, in output order.
columns_kept = [
'dn_nbrpostesoffertscreation',
'dc_rome_id',
'dc_appelationrome_id',
'dc_naturecontrat_id',
'dc_typecontrat_id',
'dc_unitedureecontrat',
'dc_duree_contrat_id',
'dc_naf2',
]
# The output uses the same '|' separator as the input; write its header row
# before any data row.
header_output = '|'.join(columns_kept) + '\n'
# As the cell's last expression, the value returned by write() is displayed as
# the cell output.
f_output.write(header_output)
Out[6]:
In [7]:
# Position, within the input header, of every column that is kept. The order
# follows columns_kept; a name missing from the header raises ValueError.
index_kept = list(map(column_names.index, columns_kept))
index_kept
Out[7]:
In [8]:
# Header positions of the two fields that the filtering loop requires to be
# non-empty (ROME code and NAF code).
index_dc_rome_id, index_dc_naf2 = (
    column_names.index('dc_rome_id'),
    column_names.index('dc_naf2'),
)
In [9]:
# Values treated as "missing" for the ROME and NAF fields. Built once, outside
# the loop, because the set is identical for every row.
set_null = {'NULL', 'null', ''}

# Stream the remaining input rows, keep only those with both a ROME and a NAF
# code, and write the selected columns of each kept row to the output file.
for i, line_input in enumerate(f_input):
    # Progress indicator. It is checked before any filtering so that a skipped
    # row landing on a multiple of one million does not suppress it.
    if i % 1000000 == 0:
        print(i)

    # Strip only the line terminator: a blind [:-1] would eat the last real
    # character of a final line that has no trailing newline.
    cells = line_input.rstrip('\n').split('|')

    # Remove lines with no ROME
    dc_rome_id = cells[index_dc_rome_id]
    if dc_rome_id in set_null:
        continue

    # Remove lines with no NAF
    dc_naf2 = cells[index_dc_naf2]
    if dc_naf2 in set_null:
        continue

    # Project the row onto the kept columns, in the order of index_kept. The
    # comprehension variable is deliberately not called `i`, so that it cannot
    # be confused with the row counter.
    cells_kept = [cells[index] for index in index_kept]
    line_output = '|'.join(cells_kept) + '\n'
    f_output.write(line_output)
In [10]:
# Release both file handles. close has to be *called*: a bare `f_input.close`
# only evaluates the bound method and leaves the input file open.
f_input.close()
f_output.close()
In [ ]: