In [1]:
from collections import defaultdict
import re
import pickle
import pandas as pd
In [2]:
# Read the ';'-separated, double-quoted CSV export line by line and collect,
# for every value of the APET700 column (used here as the NAF code), the set
# of distinct SIREN values seen with it.
set_by_naf = defaultdict(set)
not_parsed = []  # never filled in this cell; kept in case another cell reads it
with open('sirc-17804_9075_14209_201705_L_M_20170601_024542286.csv', 'r', encoding='latin-1') as f:
    # Splitting on '"' leaves the quoted field contents at the odd positions.
    # rstrip('\n') (rather than [:-1]) only removes an actual newline, so a
    # final line without a trailing newline does not lose its last character.
    headers = f.readline().rstrip('\n').split('"')[1::2]
    SIREN_index = headers.index('SIREN')
    NAF_index = headers.index('APET700')
    for i, line in enumerate(f):
        stripped = line.rstrip('\n')

        # Fast path: no quote character inside any field.
        row = stripped.split('"')[1::2]
        if len(row) != len(headers):
            # Fallback 1: split on the delimiter and drop the first and last
            # character (the enclosing quotes) of every field.
            row = [c[1:-1] for c in stripped.split(';')]
            if len(row) != len(headers):
                # Fallback 2: after splitting on '"', the even positions hold
                # what sits between quoted pieces. An empty one there means
                # two adjacent quotes, so glue the neighbouring pieces back
                # together around a literal '"'.
                row = stripped.split('"')
                while '' in row[2:-1:2]:
                    # Dedicated name: reusing `i` here used to overwrite the
                    # line counter and corrupt the progress output below.
                    j = row[2:-1:2].index('')
                    assert row[2*j+2] == ''
                    row[2*j+1] = row[2*j+1] + '"' + row[2*j+3]
                    del row[2*j+3]
                    del row[2*j+2]
                row = row[1::2]
                assert len(row) == len(headers)

        SIREN = row[SIREN_index]
        NAF = row[NAF_index]
        set_by_naf[NAF].add(SIREN)

        # Progress indicator: one line of output every million input lines.
        if i % 1000000 == 0:
            print(i)
In [3]:
# Validate every collected key and value, then reduce each set of SIREN to
# its size: count_by_naf maps a NAF code to its number of distinct SIREN.
count_by_naf = {}
for k in set_by_naf.keys():
    # Accepted key shapes: four digits + uppercase letter, or three digits +
    # uppercase letter + one trailing space.
    # The group is written '(?:' (non-capturing); the previous '(:?' was a
    # typo that made a leading ':' acceptable. fullmatch also rejects a
    # trailing newline, which '$' with re.match lets through.
    assert re.fullmatch(r'(?:[0-9]{4}[A-Z]|[0-9]{3}[A-Z] )', k), k
    for SIREN in set_by_naf[k]:
        # Every SIREN must be exactly nine digits.
        assert re.fullmatch(r'[0-9]{9}', SIREN), SIREN
    count_by_naf[k] = len(set_by_naf[k])
In [4]:
# Save the count_by_naf dictionary to disk as a pickle file.
with open('count_by_naf.pickle', 'wb') as pickle_out:
    pickle.dump(count_by_naf, pickle_out)
In [5]:
# Load the '|'-separated, UTF-8 encoded NAF referential and index it by its
# 'NAF' column.
NAF_df = (
    pd.read_csv(
        '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
        sep='|',
        encoding="utf-8",
    )
    .set_index(['NAF'])
)
In [6]:
# Walk the referential index and print "<count>\t<code>" for every code whose
# count is below 3; codes missing from count_by_naf are reported with 0.
for NAF in NAF_df.index:
    count = count_by_naf.get(NAF, 0)
    if count >= 3:
        continue
    print('{}\t{}'.format(count, NAF))
In [7]:
# Print "<count>\t<code>" for every entry of count_by_naf whose count is
# below 3.
for NAF, count in count_by_naf.items():
    if count >= 3:
        continue
    print('{}\t{}'.format(count, NAF))
In [ ]: