In [1]:
import bisect
import pickle
import pandas as pd
import numpy as np
In [2]:
# Reference tables, all stored as pipe-separated files.
# ROME: every value is kept as text and the first column of the file becomes the index.
ROME_df = pd.read_csv(
    '../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv',
    sep='|',
    dtype=str,
    index_col=0,
)

# OGR: every value is kept as text, then the 'OGR' column becomes the index.
OGR_df = pd.read_csv(
    '../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv',
    sep='|',
    dtype=str,
).set_index('OGR')

# NAF: no dtype is forced here (pandas infers the column types); indexed by the 'NAF' column.
NAF_df = pd.read_csv(
    '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
    sep='|',
    encoding="utf-8",
).set_index(['NAF'])
In [3]:
# Establishment-size bracket codes. The code at position k pairs with the threshold at
# position k in seuils_tranches_effectif; the trailing 'NN' has no threshold of its own.
# NOTE(review): these look like the INSEE workforce-bracket codes -- confirm.
tranches_effectif = [
    '00', '01', '02', '03',
    '11', '12',
    '21', '22',
    '31', '32',
    '41', '42',
    '51', '52', '53',
    'NN',
]

# Ascending head-count thresholds: seuils_tranches_effectif[k] is the smallest head count
# that trouve_tranche_effectif maps to tranches_effectif[k].
seuils_tranches_effectif = [
    0, 1, 3, 6,
    10, 20,
    50, 100,
    200, 250,
    500, 1000,
    2000, 5000, 10000,
]
In [4]:
def trouve_tranche_effectif(effectif):
    """Return ``(index, code)`` of the size bracket that contains ``effectif``.

    The bracket is the last entry of ``seuils_tranches_effectif`` that is less than or
    equal to ``effectif``; ``index`` is its position and ``code`` is the matching entry
    of ``tranches_effectif``.

    Parameters:
        effectif: head count, comparable with the integer thresholds.

    Returns:
        tuple ``(index, code)`` where ``index`` is always a non-negative position in
        ``tranches_effectif``.
    """
    index = bisect.bisect(seuils_tranches_effectif, effectif) - 1
    if index < 0:
        # A value below the first threshold has no bracket. Without this guard the
        # index would be -1, which only lands on the trailing 'NN' entry through
        # negative indexing; report that entry's real position instead.
        index = len(tranches_effectif) - 1
    return index, tranches_effectif[index]
In [7]:
# Path of the pipe-separated hiring extract that the next cell loads into `df`.
# NOTE(review): the file name suggests DPAE hiring declarations from 2016-03-02 to 2017-03-31 -- confirm.
filename = '../LBB_XDPDPA_DPAE_20160302_20170331_20170331_172811_clean.csv'
In [8]:
# Load the extract with every column as text. keep_default_na=False keeps markers such as
# 'NULL' as literal strings instead of turning them into NaN; the accumulation loop
# compares against the string 'NULL'.
df = pd.read_csv(
    filename,
    sep='|',
    dtype=str,
    keep_default_na=False,
)
In [9]:
def ponderation_duree(dc_typecontrat_id, nbrjourtravaille, seuil_jours=30):
    """Return the weight (0 or 1) given to a hire according to its contract.

    Parameters:
        dc_typecontrat_id: contract type code as text; '2' is a CDI, '1' is a CDD.
        nbrjourtravaille: number of worked days as text. Only read for a CDD, where it
            must be parseable by ``int``.
        seuil_jours: minimum number of worked days for a CDD to count. Defaults to 30.

    Returns:
        1 for a CDI, 1 for a CDD of at least ``seuil_jours`` days, 0 for a shorter CDD.

    Raises:
        ValueError: if the contract type is neither '1' nor '2', or if the number of
            worked days of a CDD is not an integer literal.
    """
    if dc_typecontrat_id == '2':  # CDI
        return 1
    if dc_typecontrat_id == '1':  # CDD
        return 1 if int(nbrjourtravaille) >= seuil_jours else 0
    # Name the offending value so a failure in a long run can be diagnosed.
    raise ValueError(
        "unsupported contract type {!r}: expected '1' (CDD) or '2' (CDI)".format(dc_typecontrat_id)
    )
In [10]:
# Sizes of the three counter axes: NAF codes, ROME codes, size brackets.
n_NAF, n_ROME, n_tranches = len(NAF_df), len(ROME_df), len(tranches_effectif)

# Two zero-filled float arrays of identical shape, one per ROME column of the extract
# (array_ROME1 for dc_romev3_1_id, array_ROME2 for dc_romev3_2_id).
dimensions_compteurs = (n_NAF, n_ROME, n_tranches)
array_ROME1 = np.zeros(shape=dimensions_compteurs)
array_ROME2 = np.zeros(shape=dimensions_compteurs)
In [11]:
# Accumulate weighted hire counts into array_ROME1 / array_ROME2, both indexed as
# [position of the NAF code, position of the ROME code, size-bracket index].

# Position of the 'NN' bracket, used when the establishment size is not an integer.
index_tranche_NN = tranches_effectif.index('NN')
# Loop-invariant index objects, looked up once.
index_NAF_ref = NAF_df.index
index_ROME_ref = ROME_df.index

# itertuples yields plain tuples instead of building one Series per row as iterrows does.
# enumerate gives the row position, which matches the labels of the default integer
# index that read_csv produces when no index column is requested.
for i, row in enumerate(df.itertuples(index=False, name=None)):
    # Progress trace, emitted first so that skipped rows cannot suppress it.
    if i % 100000 == 0:
        print(i)

    (dc_naf_id, dn_tailleetablissement, kd_dateembauche, dc_typecontrat_id, dd_datefincdd,
     dc_romev3_1_id, dc_romev3_2_id, nbrjourtravaille) = row

    # Keep only rows whose NAF and primary ROME codes are known and whose contract is a
    # CDD ('1') or a CDI ('2').
    if dc_naf_id not in index_NAF_ref:
        continue
    if dc_romev3_1_id not in index_ROME_ref:
        continue
    if dc_typecontrat_id not in {'1', '2'}:
        continue
    # A CDD without a number of worked days cannot be weighted.
    if dc_typecontrat_id == '1' and nbrjourtravaille == 'NULL':
        continue

    NAF_index = index_NAF_ref.get_loc(dc_naf_id)
    ROME_index = index_ROME_ref.get_loc(dc_romev3_1_id)

    try:
        effectif = int(dn_tailleetablissement)
    except ValueError:
        # Size is not an integer literal: file the row under the 'NN' bracket.
        tranche_index = index_tranche_NN
        tranche = tranches_effectif[tranche_index]
    else:
        tranche_index, tranche = trouve_tranche_effectif(effectif)

    # 1 for a CDI or a long enough CDD, 0 otherwise (see ponderation_duree).
    poids = ponderation_duree(dc_typecontrat_id, nbrjourtravaille)
    array_ROME1[NAF_index, ROME_index, tranche_index] += poids

    # The secondary ROME code, when known, receives the same weight in its own array.
    if dc_romev3_2_id in index_ROME_ref:
        ROME2_index = index_ROME_ref.get_loc(dc_romev3_2_id)
        array_ROME2[NAF_index, ROME2_index, tranche_index] += poids
In [13]:
# Persist the counters built from the primary ROME code as a binary pickle.
with open('../array_ROME1_CDI_CDD.pickle', mode='wb') as sortie_ROME1:
    pickle.dump(array_ROME1, sortie_ROME1)
In [14]:
# Persist the counters built from the secondary ROME code as a binary pickle.
with open('../array_ROME2_CDI_CDD.pickle', mode='wb') as sortie_ROME2:
    pickle.dump(array_ROME2, sortie_ROME2)
In [ ]: