In [1]:
import string
import pickle

import pandas as pd

Load data


In [2]:
# Load the three referentials; all of them are pipe-separated CSV files.

# ROME referential: first column used as the index, every value kept as a string.
ROME_df = pd.read_csv(
    '../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv',
    sep='|',
    dtype=str,
    index_col=0,
)

# OGR referential: every value kept as a string, then indexed by its 'OGR' column.
OGR_raw_df = pd.read_csv(
    '../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv',
    sep='|',
    dtype=str,
)
OGR_df = OGR_raw_df.set_index('OGR')

# NAF referential: read as UTF-8 with inferred dtypes, then indexed by its 'NAF' column.
NAF_raw_df = pd.read_csv(
    '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
    sep='|',
    encoding="utf-8",
)
NAF_df = NAF_raw_df.set_index(['NAF'])

In [3]:
# Read the hand-written ROME breakdown in one go.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, so the same file could be read differently
# from one machine to another.
# NOTE(review): assumes the file is saved as UTF-8 - confirm.
with open('../decoupage_manuel_ROME.txt', 'r', encoding='utf-8') as f:
    data = f.read()

Parse data


In [4]:
# Split the raw text into lines. When the text ends with a newline, the last
# element is an empty string, which the loop below handles like any other
# blank line.
lines = data.split('\n')

def begin_ROME(line_number, line):
    """Parse a ROME header line and return the ROME code it announces.

    A header looks like ``*** Z1234 xxx ***``. Only the leading ``***``
    marker and the code that follows it are checked; the rest of the line
    is ignored.

    Args:
        line_number: Line number quoted in the error messages.
        line: The header line.

    Returns:
        The ROME code, i.e. the second whitespace-separated word of ``line``.

    Raises:
        AssertionError: If ``line`` does not start with ``***`` followed by
            a code, or if that code is not in ``ROME_df.index``.
    """
    # split() without an argument collapses runs of whitespace, so a doubled
    # space after the marker does not yield an empty "code".
    words = line.split()
    # The length check turns a header with no code (e.g. a lone "***") into
    # the explicit error below instead of a bare IndexError.
    assert len(words) >= 2 and words[0] == '***', 'Erreur! J\'attendais "*** Z1234 xxx ***" (ligne {})'.format(line_number)
    current_ROME = words[1]
    assert current_ROME in ROME_df.index, 'Erreur! Code ROME "{}" inconnu (ligne {})'.format(current_ROME, line_number)
    return current_ROME

# Parse the hand-written breakdown into
#     groups[ROME] = [{'label': <group label>, 'OGRs': [<OGR code>, ...]}, ...]
#
# Layout understood by the loop below:
#     *** Z1234 free text ***     starts a ROME code
#     Group label                 starts a group of that ROME code
#     12345 free text             one OGR code per line (first word of the line)
#     <blank line>                ends the current group
#
# Validation is done with assert statements, so it is skipped entirely when
# Python runs with -O.


def _ROME_OGRs(ROME):
    """Return the set of index values of ``OGR_df`` whose ROME column equals ``ROME``."""
    return set(OGR_df[OGR_df.ROME == ROME].index)


def _close_group(ROME, label, OGRs, line_number):
    """Append the finished group to ``groups[ROME]``; an empty group is an error."""
    assert OGRs, 'Erreur! Le groupe "{}" est vide (ligne {})'.format(label, line_number)
    groups[ROME].append({'label': label, 'OGRs': OGRs})


def _check_ROME(ROME, expected_OGRs, line_number):
    """Check that the groups of ``ROME`` contain each expected OGR code exactly once."""
    list_OGRs = [OGR for group in groups[ROME] for OGR in group['OGRs']]
    set_OGRs_in_data = set(list_OGRs)
    assert len(set_OGRs_in_data) == len(list_OGRs), 'Erreur! Un code OGR est présent dans plusieurs groupes du code ROME "{}" (ligne {})'.format(ROME, line_number)
    assert set_OGRs_in_data == expected_OGRs, 'Erreur! Tous les codes OGR ne sont pas classés pour le code ROME "{}" : {} vs {} (ligne {})'.format(ROME, set_OGRs_in_data, expected_OGRs, line_number)


current_ROME = None
current_ROME_OGRs = None  # OGR codes of current_ROME, computed once per ROME
current_group_OGRs = None
current_group_name = None
groups = {}
# Count lines from 1 so that error messages match what a text editor shows.
for line_number, line in enumerate(lines, start=1):
    line = line.strip()

    if line == '':
        # A blank line closes the group being filled, if any.
        if current_group_name:
            _close_group(current_ROME, current_group_name, current_group_OGRs, line_number)
            current_group_name = None
            current_group_OGRs = None
    elif current_group_name:
        # Inside a group: the first word of the line is an OGR code.
        OGR = line.split(' ')[0]
        assert OGR in current_ROME_OGRs, 'Erreur! Le code OGR "{}" ne fais pas partie du code ROME "{}" (ligne {})'.format(OGR, current_ROME, line_number)
        current_group_OGRs.append(OGR)
    elif not current_ROME or line.split(' ')[0] == '***':
        # A ROME header: either the very first one, or one that ends the
        # previous ROME, which must then be completely classified.
        if current_ROME:
            _check_ROME(current_ROME, current_ROME_OGRs, line_number)
        current_ROME = begin_ROME(line_number, line)
        assert current_ROME not in groups, 'Erreur! Le ROME "{}" est déjà défini (ligne {})'.format(current_ROME, line_number)
        groups[current_ROME] = []
        current_ROME_OGRs = _ROME_OGRs(current_ROME)
    else:
        # Any other line outside a group is the label of a new group; a label
        # may be used only once within a ROME code.
        assert line not in {group['label'] for group in groups[current_ROME]}, 'Erreur! Le nom "{}" est déjà défini pour le code ROME "{}" (ligne {})'.format(line, current_ROME, line_number)
        current_group_name = line
        current_group_OGRs = []

# End of input. The last group is still open when the text does not end with
# a blank line, and the last ROME has not been checked yet because that check
# is otherwise only triggered by the next header.
if current_group_name:
    _close_group(current_ROME, current_group_name, current_group_OGRs, line_number)
    current_group_name = None
    current_group_OGRs = None
if current_ROME:
    _check_ROME(current_ROME, current_ROME_OGRs, line_number)

Nommage des regroupements


In [5]:
# The lowercase ASCII letters; the next cell indexes into this string to
# build each group's name.
suffixes = string.ascii_lowercase
suffixes


Out[5]:
'abcdefghijklmnopqrstuvwxyz'

In [6]:
# Name every group after its ROME code followed by one letter taken from
# `suffixes`, in order of appearance: the first group of a ROME gets the
# first letter, the second group the second letter, and so on.
for ROME, ROME_groups in groups.items():
    # Fail up front with an explicit message rather than with a bare
    # IndexError after some groups have already been named.
    assert len(ROME_groups) <= len(suffixes), 'Erreur! Trop de groupes pour le code ROME "{}" ({} > {})'.format(ROME, len(ROME_groups), len(suffixes))
    for group, suffix in zip(ROME_groups, suffixes):
        group['name'] = ROME + suffix

In [7]:
# Display the parsed groups, now carrying their 'name', for visual inspection.
groups


Out[7]:
{'D1106': [{'OGRs': ['20560', '20567', '20568'],
   'label': 'Vendeur / Vendeuse en boulangerie-pâtisserie',
   'name': 'D1106a'},
  {'OGRs': ['20576', '20577', '20540', '14863'],
   'label': 'Vendeur / Vendeuse en épicerie',
   'name': 'D1106b'},
  {'OGRs': ['20525', '16388', '20530'],
   'label': 'Vendeur / Vendeuse de fruits et légumes',
   'name': 'D1106c'},
  {'OGRs': ['20570', '20623', '20645'],
   'label': 'Vendeur / Vendeuse en crèmerie-fromagerie',
   'name': 'D1106d'},
  {'OGRs': ['38776', '20622'],
   'label': 'Vendeur / Vendeuse en produits biologiques-diététiques',
   'name': 'D1106e'},
  {'OGRs': ['20646', '20558', '20564', '20559', '20627'],
   'label': 'Vendeur / Vendeuse en boucherie-charcuterie',
   'name': 'D1106f'},
  {'OGRs': ['20612'],
   'label': 'Vendeur / Vendeuse en poissonnerie',
   'name': 'D1106g'},
  {'OGRs': ['20635'],
   'label': 'Vendeur / Vendeuse en vins et spiritueux',
   'name': 'D1106h'}]}

In [8]:
# Save the named groups to disk. pickle writes bytes, hence the binary mode.
with open('../decoupage_ROME.pickle', 'wb') as pickle_file:
    pickle.dump(groups, pickle_file)

In [ ]: