In [1]:
import string
import pickle
import pandas as pd
In [2]:
# ROME reference table: indexed by its first column, every column read as text.
ROME_df = pd.read_csv(
    '../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv',
    sep='|',
    index_col=0,
    dtype=str,
)

# OGR reference table: every column read as text, then indexed by its 'OGR' column.
OGR_df = (
    pd.read_csv(
        '../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv',
        sep='|',
        dtype=str,
    )
    .set_index('OGR')
)

# NAF reference table: decoded as UTF-8 and indexed by its 'NAF' column.
NAF_df = (
    pd.read_csv(
        '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
        sep='|',
        encoding="utf-8",
    )
    .set_index(['NAF'])
)
In [3]:
# Load the whole hand-written ROME breakdown into memory as a single string.
# NOTE(review): the file is decoded with the platform's default encoding —
# confirm this matches how the file was saved.
with open('../decoupage_manuel_ROME.txt') as manual_file:
    data = manual_file.read()
In [4]:
# Split the raw text into lines; each line is stripped and parsed by the loop further down.
lines = data.split('\n')
def begin_ROME(line_number, line):
    """Parse a ROME header line and return the ROME code it announces.

    A header line is expected to look like ``*** Z1234 xxx ***``: its first
    whitespace-separated token is ``***`` and its second token is the ROME code.

    Args:
        line_number: position of the line in the input; only used in error messages.
        line: the header line to parse.

    Returns:
        The ROME code, i.e. the second token of the line.

    Raises:
        AssertionError: if the line does not start with ``***`` followed by a
            code, or if that code is not in the index of the global ``ROME_df``.
    """
    # split() without a separator tolerates repeated spaces, and the length
    # check makes a bare "***" line fail with the explicit message below
    # instead of an IndexError on words[1].
    words = line.split()
    assert len(words) >= 2 and words[0] == '***', 'Erreur! J\'attendais "*** Z1234 xxx ***" (ligne {})'.format(line_number)
    ROME_code = words[1]
    assert ROME_code in ROME_df.index, 'Erreur! Code ROME "{}" inconnu (ligne {})'.format(ROME_code, line_number)
    return ROME_code
# Parse the manual breakdown into `groups`, a dict mapping each ROME code to
# its list of groups, each group being {'label': <group name>, 'OGRs': [<OGR codes>]}.
#
# Layout enforced by the checks below:
#   *** Z1234 xxx ***     ROME header (parsed by begin_ROME)
#   <group label>         first non-empty line after a header or a blank line
#   <OGR code> <text>     one line per OGR of the group (only the first token is used)
#   <blank line>          closes the current group
# Line numbers in error messages are 1-based.


def _close_group(ROME_groups, group_name, group_OGRs, line_number):
    """Append the finished group to ROME_groups after checking it is not empty."""
    assert group_OGRs, 'Erreur! Le groupe "{}" est vide (ligne {})'.format(group_name, line_number)
    ROME_groups.append({'label': group_name, 'OGRs': group_OGRs})


def _check_ROME_complete(ROME, ROME_groups, expected_OGRs, line_number):
    """Check that each OGR in expected_OGRs appears in exactly one group of ROME_groups."""
    list_OGRs = [OGR for group in ROME_groups for OGR in group['OGRs']]
    set_OGRs_in_data = set(list_OGRs)
    assert len(set_OGRs_in_data) == len(list_OGRs), 'Erreur! Un code OGR est présent dans plusieurs groupes du code ROME "{}" (ligne {})'.format(ROME, line_number)
    assert set_OGRs_in_data == expected_OGRs, 'Erreur! Tous les codes OGR ne sont pas classés pour le code ROME "{}" : {} vs {} (ligne {})'.format(ROME, set_OGRs_in_data, expected_OGRs, line_number)


current_ROME = None
current_ROME_OGRs = None  # OGR codes attached to current_ROME in OGR_df, computed once per ROME
current_group_OGRs = None
current_group_name = None
groups = {}
for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if line == '':
        # A blank line closes the group being read, if any.
        if current_group_name:
            _close_group(groups[current_ROME], current_group_name, current_group_OGRs, line_number)
            current_group_name = None
            current_group_OGRs = None
    elif current_group_name:
        # Inside a group: the first token of the line is an OGR code.
        OGR = line.split(' ')[0]
        assert OGR in current_ROME_OGRs, 'Erreur! Le code OGR "{}" ne fais pas partie du code ROME "{}" (ligne {})'.format(OGR, current_ROME, line_number)
        current_group_OGRs.append(OGR)
    elif not current_ROME or line.split(' ')[0] == '***':
        # ROME header. Before switching, the previous ROME (if any) must be fully classified.
        if current_ROME:
            _check_ROME_complete(current_ROME, groups[current_ROME], current_ROME_OGRs, line_number)
        current_ROME = begin_ROME(line_number, line)
        assert current_ROME not in groups, 'Erreur! Le ROME "{}" est déjà défini (ligne {})'.format(current_ROME, line_number)
        groups[current_ROME] = []
        current_ROME_OGRs = set(OGR_df[OGR_df.ROME == current_ROME].index)
    else:
        # Group label. Uniqueness is checked here because reading a label is the
        # only point where a duplicate name can appear within a ROME.
        assert line not in {group['label'] for group in groups[current_ROME]}, 'Erreur! Le nom "{}" est déjà défini pour le code ROME "{}" (ligne {})'.format(line, current_ROME, line_number)
        current_group_name = line
        current_group_OGRs = []

# End of input: close a group that was not followed by a blank line, then run
# the completeness check on the last ROME, which no following header triggers.
if current_group_name:
    _close_group(groups[current_ROME], current_group_name, current_group_OGRs, len(lines))
    current_group_name = None
    current_group_OGRs = None
if current_ROME:
    _check_ROME_complete(current_ROME, groups[current_ROME], current_ROME_OGRs, len(lines))
In [5]:
# Lowercase ASCII letters; the naming cell below appends suffixes[i] to a ROME
# code to name its i-th group.
suffixes = string.ascii_lowercase
suffixes
Out[5]:
In [6]:
# Name every group after its ROME code plus one letter of `suffixes`, in the
# order the groups appear for that ROME (first group gets suffixes[0], etc.).
for ROME, ROME_groups in groups.items():
    # There is one suffix per group position: fail with an explicit message
    # rather than an IndexError when a ROME has more groups than suffixes.
    assert len(ROME_groups) <= len(suffixes), 'Erreur! Le code ROME "{}" a trop de groupes ({} > {})'.format(ROME, len(ROME_groups), len(suffixes))
    for suffix, group in zip(suffixes, ROME_groups):
        group['name'] = ROME + suffix
In [7]:
# Display the complete mapping: ROME code -> list of group dicts.
groups
Out[7]:
In [8]:
# Serialize the `groups` mapping to disk with pickle (binary mode).
with open('../decoupage_ROME.pickle', 'wb') as pickle_file:
    pickle.dump(groups, pickle_file)
In [ ]: