In [1]:

    
import bisect
import pickle
from collections import Counter

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

Charge les référentiels

cf referentiels.py



In [2]:

    
# Reference tables, each read from a pipe-separated CSV file.

# ROME table: every column read as text, first CSV column used as the index.
ROME_df = pd.read_csv(
    '../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv',
    sep='|',
    dtype=str,
    index_col=0,
)

# OGR table: every column read as text, then indexed by its 'OGR' column.
OGR_df = pd.read_csv(
    '../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv',
    sep='|',
    dtype=str,
).set_index('OGR')

# NAF table: column dtypes are left to pandas' inference (no dtype=str here),
# then the table is indexed by its 'NAF' column.
NAF_df = pd.read_csv(
    '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
    sep='|',
    encoding="utf-8",
).set_index(['NAF'])



In [3]:

    
# Workforce-size bracket codes, in increasing order of size. The last entry,
# 'NN', has no matching threshold below.
# NOTE(review): these look like INSEE "tranche d'effectif" codes - confirm.
tranches_effectif = [
    '00', '01', '02', '03',
    '11', '12',
    '21', '22',
    '31', '32',
    '41', '42',
    '51', '52', '53',
    'NN',
]

# seuils_tranches_effectif[k] is the smallest headcount that
# trouve_tranche_effectif maps to tranches_effectif[k]. The list must stay
# sorted because it is searched with bisect.
seuils_tranches_effectif = [
    0, 1, 3, 6,
    10, 20,
    50, 100,
    200, 250,
    500, 1000,
    2000, 5000, 10000,
]



In [4]:

    
def trouve_tranche_effectif(effectif):
    """Map a headcount to its workforce-size bracket.

    The headcount is located in the sorted ``seuils_tranches_effectif``
    thresholds; the selected bracket is the one whose threshold is the
    largest value not exceeding ``effectif``.

    Args:
        effectif: headcount, comparable with the integer thresholds.

    Returns:
        A ``(position, code)`` tuple, where ``position`` is an index into
        ``tranches_effectif`` and ``code`` is the entry found there.

    Note:
        A value below the first threshold yields position ``-1``, which
        selects the last entry of ``tranches_effectif``.
    """
    position = bisect.bisect_right(seuils_tranches_effectif, effectif) - 1
    code = tranches_effectif[position]
    return position, code

Charge les DPAE



In [5]:

    
# Path of the pipe-separated DPAE extract that the following cell loads into `df`.
filename = '../LBB_ETT_ETT_20160430_20170530_20170530_162434_clean.csv'



In [6]:

    
# Read every column as text and turn off the default NaN conversion, so
# missing-value markers stay as literal strings (later cells compare
# against the string 'NULL').
df = pd.read_csv(
    filename,
    sep='|',
    dtype=str,
    keep_default_na=False,
)

Histogramme de la durée (colonne dn_nbjcaltotalmission)



In [7]:

    
# Raw duration column as a plain Python list; values are still strings
# because the CSV was read with dtype=str.
duree_liste = list(df['dn_nbjcaltotalmission'])



In [8]:

    
# Discard the entries equal to the literal 'NULL' marker, then convert the
# remaining values to integers. Any other non-numeric value makes int() raise.
duree_liste = list(map(int, (valeur for valeur in duree_liste if valeur != 'NULL')))



In [9]:

    
# Number of occurrences of each duration value.
comptage_duree = Counter()
comptage_duree.update(duree_liste)



In [10]:

    
# Largest duration present in the data (iterating a Counter yields its keys).
duree_max = max(comptage_duree)
duree_max









    Out[10]:





365



In [11]:

    
# Dense histogram: histo[d] is the number of occurrences of duration d for
# d = 0..duree_max. A Counter returns 0 for absent keys, so gaps are filled
# with zeros; negative durations, if any, are not represented.
histo = [comptage_duree[jours] for jours in range(0, duree_max + 1)]

# Displayed: total number of occurrences, and the number with duration >= 30.
(sum(histo), sum(histo[30:]))









    Out[11]:





(15339856, 1066312)



In [12]:

    
# Occurrence counts for durations 0 to 49 (x axis: duration, y axis: count).
plt.plot(histo[:50])









    Out[12]:





[<matplotlib.lines.Line2D at 0x7f6c9bf67fd0>]



In [13]:

    
# volume[d] = d * histo[d]: the sum of durations contributed by the entries
# whose duration is exactly d. histo has duree_max + 1 entries, so
# enumerating it covers the same indices as range(duree_max + 1).
volume = [jours * nombre for jours, nombre in enumerate(histo)]

# Plot the first 100 durations only.
plt.plot(volume[:100])









    Out[13]:





[<matplotlib.lines.Line2D at 0x7f6c9be8e5c0>]



In [14]:

    
# Displayed: total volume, volume coming from durations >= 30, and the share
# of the total that the latter represents.
(
    sum(volume),
    sum(volume[30:]),
    float(sum(volume[30:])) / sum(volume),
)









    Out[14]:





(152336029, 62580705, 0.4108069864417957)

Array : NAF x ROME x tranche d'effectif



In [15]:

    
def ponderation_duree(nbrjourtravaille, dn_nbmission, seuil_jours=30):
    """Return a 0/1 weight telling whether a cumulated duration reaches a threshold.

    The cumulated duration is ``int(nbrjourtravaille) * int(dn_nbmission)``.

    Args:
        nbrjourtravaille: number of days, as an int or any value accepted
            by ``int()`` (e.g. a numeric string).
        dn_nbmission: number of missions, same accepted forms.
        seuil_jours: smallest cumulated duration that yields a weight of 1.
            Defaults to 30, the value that used to be hard-coded.

    Returns:
        1 if the cumulated duration is at least ``seuil_jours``, otherwise 0.

    Raises:
        ValueError: if either argument cannot be converted by ``int()``
            (for instance the literal string 'NULL').
    """
    # NOTE(review): the accumulation loop passes dn_nbjcaltotalmission as the
    # first argument; if that value is already a total over all missions,
    # multiplying by the mission count overstates the duration - confirm.
    duree_cumulee = int(nbrjourtravaille) * int(dn_nbmission)
    return 1 if duree_cumulee >= seuil_jours else 0



In [16]:

    
# Axis sizes: one slot per row of NAF_df, per row of ROME_df and per entry
# of tranches_effectif.
n_NAF, n_ROME, n_tranches = len(NAF_df), len(ROME_df), len(tranches_effectif)

# Two zero-filled float arrays of identical shape (n_NAF, n_ROME, n_tranches);
# the accumulation loop fills one from the first ROME code of each row and
# the other from the second ROME code.
array_ROME1 = np.zeros((n_NAF, n_ROME, n_tranches))
array_ROME2 = np.zeros(array_ROME1.shape)



In [17]:

    
# Accumulate, for every usable row of `df`, a 0/1 weight into the
# (NAF, ROME, tranche) cell of array_ROME1 (first ROME code) and, when the
# second ROME code is known, of array_ROME2.
#
# NOTE: the arrays are updated in place, so running this cell twice without
# re-creating them doubles every count.

# Hoisted out of the loop: the two lookup indexes and the slot of the 'NN'
# bracket (previously written as the magic number 15).
index_NAF = NAF_df.index
index_ROME = ROME_df.index
tranche_inconnue_index = tranches_effectif.index('NN')

# itertuples(index=False, name=None) yields plain tuples in column order,
# which is much cheaper than iterrows() building one Series per row.
# The positional unpacking below still requires `df` to have exactly these
# seven columns in this order. `i` is the row position, which equals the
# index label for the default RangeIndex produced by read_csv.
for i, row in enumerate(df.itertuples(index=False, name=None)):
    (dn_nbjcaltotalmission, dc_nafinsee700_id, dn_nbmission,
     dc_nafrefv2_id, dc_trancheeffectif_id, dc_romev3_1_id,
     dc_romev3_2_id) = row

    # Keep only rows with a known duration, NAF code and first ROME code.
    if ((dn_nbjcaltotalmission != 'NULL') and (dc_nafrefv2_id in index_NAF) and
            (dc_romev3_1_id in index_ROME)):

        NAF_index = index_NAF.get_loc(dc_nafrefv2_id)
        ROME_index = index_ROME.get_loc(dc_romev3_1_id)

        # NOTE(review): the column name suggests this value may already be a
        # bracket code rather than a headcount; if so, e.g. '12' is treated
        # here as a headcount of 12 - confirm against the data dictionary.
        try:
            effectif = int(dc_trancheeffectif_id)
        except ValueError:
            # Non-numeric value: fall back to the 'NN' bracket. The label is
            # taken from tranches_effectif instead of the former 'NA', which
            # is not one of the declared codes.
            tranche_index = tranche_inconnue_index
            tranche = tranches_effectif[tranche_index]
        else:
            tranche_index, tranche = trouve_tranche_effectif(effectif)

        # Raises ValueError if dn_nbmission is not numeric.
        poids = ponderation_duree(dn_nbjcaltotalmission, dn_nbmission)

        array_ROME1[NAF_index, ROME_index, tranche_index] += poids

        if dc_romev3_2_id in index_ROME:
            ROME2_index = index_ROME.get_loc(dc_romev3_2_id)
            array_ROME2[NAF_index, ROME2_index, tranche_index] += poids

    # Progress trace every 100000 rows.
    if i % 100000 == 0:
        print(i)



In [18]:

    
# Persist the first-ROME-code count array with pickle (binary mode).
with open('../array_ROME1_ETT.pickle', 'wb') as sortie:
    pickle.dump(array_ROME1, sortie)



In [19]:

    
# Persist the second-ROME-code count array with pickle (binary mode).
with open('../array_ROME2_ETT.pickle', 'wb') as sortie:
    pickle.dump(array_ROME2, sortie)



In [ ]: