In [1]:
from collections import defaultdict
import re
import pickle

import pandas as pd

In [2]:
# Scan the input CSV once and collect, for every value of the APET700 column
# (the NAF code), the set of distinct SIREN values that appear with it.
set_by_naf = defaultdict(set)
not_parsed = []  # never filled in this cell; kept so the notebook namespace is unchanged
with open('sirc-17804_9075_14209_201705_L_M_20170601_024542286.csv', 'r', encoding='latin-1') as f:
    # Splitting on '"' puts the quoted field contents at the odd positions.
    # rstrip('\n') (rather than [:-1]) only removes an actual newline, so an
    # unterminated last line does not lose a real character.
    headers = f.readline().rstrip('\n').split('"')[1::2]

    SIREN_index = headers.index('SIREN')
    NAF_index = headers.index('APET700')

    for i, line in enumerate(f):
        record = line.rstrip('\n')

        # Attempt 1: plain quote split; correct when no field contains '"'.
        row = record.split('"')[1::2]
        if len(row) != len(headers):
            # Attempt 2: split on the ';' separator and drop the first and
            # last character of each cell (presumably the enclosing quotes).
            row = [cell[1:-1] for cell in record.split(';')]
            if len(row) != len(headers):
                # Attempt 3: split on '"' again and glue back the fields that
                # were cut by a doubled quote. Even positions strictly inside
                # the list hold the text between two fields; an empty one
                # means two adjacent quotes, treated as a literal '"'.
                row = record.split('"')
                while '' in row[2:-1:2]:
                    # Use a dedicated name here: reusing `i` would clobber the
                    # enumerate() line counter used for progress reporting.
                    gap = row[2:-1:2].index('')
                    assert row[2*gap+2] == ''
                    row[2*gap+1] = row[2*gap+1] + '"' + row[2*gap+3]
                    del row[2*gap+3]
                    del row[2*gap+2]
                row = row[1::2]
        assert len(row) == len(headers), 'unparseable data line {}'.format(i)

        SIREN = row[SIREN_index]
        NAF = row[NAF_index]
        set_by_naf[NAF].add(SIREN)

        # Progress indicator: one line of output per million data lines.
        if i%1000000 == 0:
            print(i)


0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000

In [3]:
# Sanity-check every collected key and value, then reduce each SIREN set to
# its size.
# Accepted NAF shapes: four digits + one capital letter, or three digits +
# one capital letter + a trailing space.
# The group must be non-capturing `(?:...)`; the previous `(:?...)` spelling
# was a capturing group that also tolerated a leading ':'.
naf_pattern = re.compile(r'(?:[0-9]{4}[A-Z]|[0-9]{3}[A-Z] )')
# A SIREN must be exactly nine digits.
siren_pattern = re.compile(r'[0-9]{9}')

count_by_naf = {}
for k, sirens in set_by_naf.items():
    # fullmatch anchors both ends and, unlike '$', rejects a trailing newline.
    assert naf_pattern.fullmatch(k), k
    for SIREN in sirens:
        assert siren_pattern.fullmatch(SIREN), SIREN
    count_by_naf[k] = len(sirens)

In [4]:
# Persist the per-NAF counts to disk as a pickle so they can be reloaded
# without rescanning the CSV.
with open('count_by_naf.pickle', 'wb') as pickle_file:
    pickle_file.write(pickle.dumps(count_by_naf))

In [5]:
# Load the NAF reference table ('|'-separated, UTF-8) and index it by its
# 'NAF' column.
NAF_df = (
    pd.read_csv(
        '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
        sep='|',
        encoding="utf-8",
    )
    .set_index(['NAF'])
)

In [6]:
# For every code of the reference table, print "<count>\t<code>" when fewer
# than 3 distinct SIRENs were counted for it (codes absent from the counts
# are reported with 0).
for NAF in NAF_df.index:
    count = count_by_naf.get(NAF, 0)
    if count >= 3:
        continue
    print('{}\t{}'.format(count, NAF))


0	0520Z
2	5122Z
0	9700Z
0	9810Z
0	9820Z

In [7]:
# Same report, but driven by the codes actually counted: print
# "<count>\t<code>" for every counted code with fewer than 3 distinct SIRENs.
for NAF in count_by_naf:
    count = count_by_naf[NAF]
    if count >= 3:
        continue
    print('{}\t{}'.format(count, NAF))


2	5122Z
1	702C 

In [ ]: