In [1]:
import bisect
import pickle

import pandas as pd
import numpy as np

Charge les référentiels

cf referentiels.py


In [2]:
ROME_df = pd.read_csv('../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv', index_col=0, sep='|', dtype=str)
OGR_df = pd.read_csv('../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv', sep='|', dtype=str).set_index('OGR')
NAF_df = pd.read_csv('../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv', sep='|', encoding="utf-8").set_index(['NAF'])

In [3]:
tranches_effectif = ['00', '01', '02', '03', '11', '12', '21', '22', '31', '32', '41', '42', '51', '52', '53', 'NN']
seuils_tranches_effectif = [0, 1, 3, 6, 10, 20, 50, 100, 200, 250, 500, 1000, 2000, 5000, 10000]

In [4]:
def trouve_tranche_effectif(effectif):
    index = bisect.bisect(seuils_tranches_effectif, effectif) - 1
    return index, tranches_effectif[index]

Charge les DPAE


In [7]:
filename = '../LBB_XDPDPA_DPAE_20160302_20170331_20170331_172811_clean.csv'

In [8]:
df = pd.read_csv(filename, sep='|', dtype=str, keep_default_na=False)

Array : ROME x NAF x effectif


In [9]:
def ponderation_duree(dc_typecontrat_id, nbrjourtravaille):
    if dc_typecontrat_id == '2':  # CDI
        return 1
    elif dc_typecontrat_id == '1':  # CDD
        if int(nbrjourtravaille) >= 30:
            return 1
        else:
            return 0
    raise ValueError()

In [10]:
n_NAF = len(NAF_df)
n_ROME = len(ROME_df)
n_tranches = len(tranches_effectif)

array_ROME1 = np.zeros((n_NAF, n_ROME, n_tranches))
array_ROME2 = np.zeros((n_NAF, n_ROME, n_tranches))

In [11]:
for i, row in df.iterrows():
    (dc_naf_id, dn_tailleetablissement, kd_dateembauche, dc_typecontrat_id, dd_datefincdd,
     dc_romev3_1_id, dc_romev3_2_id, nbrjourtravaille) = row
    
    if (dc_naf_id in NAF_df.index) and (dc_romev3_1_id in ROME_df.index) and (dc_typecontrat_id in {'1', '2'}):
        
        if dc_typecontrat_id == '1' and nbrjourtravaille == 'NULL':
            continue
        
        NAF_index = NAF_df.index.get_loc(dc_naf_id)
        ROME_index = ROME_df.index.get_loc(dc_romev3_1_id)
        
        try:
            effectif = int(dn_tailleetablissement)
        except ValueError:
            tranche = 'NA'
            tranche_index = 15
        else:
            tranche_index, tranche = trouve_tranche_effectif(effectif)
            
        poids = ponderation_duree(dc_typecontrat_id, nbrjourtravaille)
            
        array_ROME1[NAF_index, ROME_index, tranche_index] += poids

        if dc_romev3_2_id in ROME_df.index:
            ROME2_index = ROME_df.index.get_loc(dc_romev3_2_id)
            array_ROME2[NAF_index, ROME2_index, tranche_index] += poids
        
    if i % 100000 == 0:
        print(i)


0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000
7200000
7300000
7400000
7500000
7600000
7700000
7800000
7900000
8000000
8100000
8200000
8300000
8400000
8500000
8600000
8700000
8800000
8900000
9000000
9100000
9200000
9300000
9400000
9500000
9600000
9700000
9800000
9900000
10000000
10100000
10200000
10300000
10400000
10500000
10600000
10700000
10800000
10900000
11000000
11100000
11200000
11300000
11400000
11500000
11600000
11700000
11800000
11900000
12000000
12100000
12200000
12300000
12400000
12500000
12600000
12700000
12800000
12900000
13000000
13100000
13200000
13300000
13400000
13500000
13600000
13700000
13800000
13900000
14000000
14100000
14200000
14300000
14400000
14500000
14600000
14700000
14800000
14900000
15000000
15100000
15200000
15300000
15400000
15500000
15600000
15700000
15800000
15900000
16000000
16100000
16200000
16300000
16400000
16500000
16600000
16700000
16800000
16900000
17000000
17100000
17200000
17300000
17400000
17500000
17600000
17700000
17800000
17900000
18000000
18100000
18200000
18300000
18400000
18500000
18600000
18700000
18800000
18900000
19000000
19100000
19200000
19300000
19400000
19500000
19600000
19700000
19800000
19900000
20000000
20100000
20200000
20300000
20400000
20500000
20600000
20700000
20800000
20900000
21000000
21100000
21200000
21300000
21400000
21500000
21600000
21700000
21800000
21900000
22000000
22100000
22200000
22300000
22400000
22500000
22600000
22700000
22800000
22900000
23000000
23100000
23200000
23300000
23400000
23500000
23600000
23700000
23800000
23900000
24000000
24100000
24200000
24300000
24400000
24500000
24600000
24700000
24800000
24900000
25000000
25100000
25200000
25300000
25400000
25500000
25600000
25700000
25800000
25900000
26000000
26100000
26200000
26300000
26400000
26500000
26600000
26700000
26800000
26900000
27000000
27100000
27200000
27300000
27400000
27500000
27600000
27700000
27800000
27900000
28000000
28100000
28200000
28300000
28400000
28500000
28600000
28700000
28800000
28900000
29000000
29100000
29200000
29300000
29400000
29500000
29600000
29700000
29800000
29900000
30000000
30100000
30200000
30300000
30400000
30500000
30600000
30700000
30800000
30900000
31000000
31100000
31200000
31300000
31400000
31500000
31600000
31700000
31800000
31900000
32000000
32100000
32200000
32300000
32400000
32500000
32600000
32700000
32800000
32900000
33000000
33100000
33200000
33300000
33400000
33500000
33600000
33700000
33800000
33900000
34000000
34100000
34200000
34300000
34400000
34500000
34600000
34700000
34800000
34900000
35000000
35100000

In [13]:
with open('../array_ROME1_CDI_CDD.pickle', 'wb') as f:
    pickle.dump(array_ROME1, f)

In [14]:
with open('../array_ROME2_CDI_CDD.pickle', 'wb') as f:
    pickle.dump(array_ROME2, f)

In [ ]: