In [1]:
import collections

import pandas as pd
pd.set_option('display.max_columns', 500)

In [2]:
filename_input = 'LBB_ETT_ETT_20160430_20170530_20170530_162434_sep.csv'
filename_output = 'LBB_ETT_ETT_20160430_20170530_20170530_162434_clean.csv'

In [3]:
f_input = open(filename_input, 'r')
f_output = open(filename_output, 'w')

In [4]:
header_intput = f_input.readline()

In [5]:
column_names = header_intput[:-1].split('|')

In [6]:
columns_kept = [
 'dn_nbjcaltotalmission',
 'dc_nafinsee700_id',
 'dn_nbmission',
 'dc_nafrefv2_id',
 'dc_trancheeffectif_id',
 'dc_romev3_1_id',
 'dc_romev3_2_id',
]
header_output = '|'.join(columns_kept) + '\n'
f_output.write(header_output)


Out[6]:
120

In [7]:
index_kept = [
    column_names.index(column_name)
    for column_name in columns_kept
]
index_kept


Out[7]:
[4, 7, 8, 9, 12, 13, 16]

In [8]:
index_dc_romev3_1_id = column_names.index('dc_romev3_1_id')
index_dc_nafrefv2_id = column_names.index('dc_nafrefv2_id')

In [9]:
for i, line_input in enumerate(f_input):
    cells = line_input[:-1].split('|')
    
    # Remove lines with no ROME
    set_null = {'NULL', 'null', ''}
    dc_romev3_1_id = cells[index_dc_romev3_1_id]
    if dc_romev3_1_id in set_null:
        continue
   
    ## Remove lines with no NAF
    #dc_nafrefv2_id = cells[index_dc_nafrefv2_id]
    #if dc_nafrefv2_id in set_null:
    #    continue
   

    cells_kept = [
        cells[i]
        for i in index_kept
    ]
    line_output = '|'.join(cells_kept) + '\n'
    f_output.write(line_output)
    
    if i % 1000000 == 0:
        print(i)


0
1000000
3000000
4000000
5000000
6000000
7000000
8000000
10000000
11000000
12000000
13000000
16000000
17000000
18000000
19000000

In [10]:
f_input.close
f_output.close()

In [ ]: