In [1]:

    
import bisect
import pickle
from collections import Counter

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

Charge les référentiels

cf referentiels.py



In [2]:

    
ROME_df = pd.read_csv('../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv', index_col=0, sep='|', dtype=str)
OGR_df = pd.read_csv('../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv', sep='|', dtype=str).set_index('OGR')
NAF_df = pd.read_csv('../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv', sep='|', encoding="utf-8").set_index(['NAF'])



In [3]:

    
tranches_effectif = ['00', '01', '02', '03', '11', '12', '21', '22', '31', '32', '41', '42', '51', '52', '53', 'NN']
seuils_tranches_effectif = [0, 1, 3, 6, 10, 20, 50, 100, 200, 250, 500, 1000, 2000, 5000, 10000]



In [4]:

    
def trouve_tranche_effectif(effectif):
    index = bisect.bisect(seuils_tranches_effectif, effectif) - 1
    return index, tranches_effectif[index]

Charge les offres



In [7]:

    
filename = '../LBB_EOF_OFFRES_20160307_20170407_20170407_191410_clean.csv'



In [8]:

    
df = pd.read_csv(filename, sep='|', dtype=str, keep_default_na=False)



In [9]:

    
liste_nb_postes = [int(nb_postes) for nb_postes in df.dn_nbrpostesoffertscreation]



In [10]:

    
len(df), sum(liste_nb_postes)









    Out[10]:





(7023100, 8568949)



In [11]:

    
comptage_type_contrat = Counter(df.dc_typecontrat_id)
contrat_labels = comptage_type_contrat.keys()
contrat_comptes = [comptage_type_contrat[t] for t in contrat_labels]
n_type_contrat = len(contrat_labels)



In [12]:

    
ind = np.arange(n_type_contrat)
width = 0.35

fig, ax = plt.subplots()
rects1 = ax.bar(ind, contrat_comptes, width)

ax.set_ylabel('Nb offres')
ax.set_title('Répartition des offres par type de contrat')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(contrat_labels)









    Out[12]:





[<matplotlib.text.Text at 0x7fb9e5a90eb8>,
 <matplotlib.text.Text at 0x7fb9e51b6390>,
 <matplotlib.text.Text at 0x7fb9e85b62b0>,
 <matplotlib.text.Text at 0x7fb9e85c6cf8>,
 <matplotlib.text.Text at 0x7fb9e51b6ac8>,
 <matplotlib.text.Text at 0x7fb9e85b6c88>,
 <matplotlib.text.Text at 0x7fb9e85bc668>,
 <matplotlib.text.Text at 0x7fb9e853f0b8>,
 <matplotlib.text.Text at 0x7fb9e853fac8>,
 <matplotlib.text.Text at 0x7fb9e8542518>,
 <matplotlib.text.Text at 0x7fb9e8542f28>,
 <matplotlib.text.Text at 0x7fb9e8549978>,
 <matplotlib.text.Text at 0x7fb9e854b3c8>]



In [13]:

    
comptage_unite_duree = Counter(df.dc_unitedureecontrat)



In [14]:

    
comptage_unite_duree









    Out[14]:





Counter({'JO': 550339, 'MO': 3480435, 'null': 2992326})



In [15]:

    
def calcul_duree(row):
    if row['dc_unitedureecontrat'] == 'null':
        assert row['dc_typecontrat_id'] != 'CDD', row
        assert row['dc_duree_contrat_id'] == '00', row
        return 10*365
    else:
        # assert row['dc_typecontrat_id'] != 'CDI', row # CDI de 36 mois à l'index 271860
        if row['dc_unitedureecontrat'] == 'JO':
            return int(row['dc_duree_contrat_id'])
        elif row['dc_unitedureecontrat'] == 'MO':
            return int(row['dc_duree_contrat_id'])*30
        else:
            print(row)
            raise ValueError()



In [16]:

    
df['duree_jours'] = df.apply(calcul_duree, axis=1)



In [17]:

    
comptage_duree = Counter(df.duree_jours)



In [18]:

    
duree_max = max(comptage_duree.keys())
duree_max









    Out[18]:





3650



In [19]:

    
histo = [comptage_duree[i] for i in range(duree_max + 1)]
sum(histo), sum(histo[30:])









    Out[19]:





(7023100, 6569294)



In [20]:

    
sum(histo[:370])/sum(histo)









    Out[20]:





0.5423963776679813



In [21]:

    
plt.plot(histo[:370])









    Out[21]:





[<matplotlib.lines.Line2D at 0x7fb9e84e1748>]

Array : OGR x NAF



In [22]:

    
def ponderation_duree(duree_jours):
    return 1 if duree_jours >= 30 else 0



In [23]:

    
n_NAF = len(NAF_df)
n_OGR = len(OGR_df)



In [24]:

    
array = np.zeros((n_NAF, n_OGR))

set_OGR = set(OGR_df.index)
set_NAF = set(NAF_df.index)

for i, row in df.iterrows():
    (dn_nbrpostesoffertscreation, dc_rome_id, dc_appelationrome_id,
     dc_naturecontrat_id, dc_typecontrat_id, dc_unitedureecontrat,
     dc_duree_contrat_id, dc_naf2, duree_jours) = row
    
    dc_appelationrome_id = dc_appelationrome_id[-5:]
    
    if (dc_naf2 in set_NAF) and (dc_appelationrome_id in set_OGR):
        NAF_index = NAF_df.index.get_loc(dc_naf2)
        OGR_index = OGR_df.index.get_loc(dc_appelationrome_id)
            
        poids = ponderation_duree(duree_jours) * int(dn_nbrpostesoffertscreation)
        
        array[NAF_index, OGR_index] += poids

    if i % 100000 == 0:
        print(i)



In [26]:

    
with open('../array_offres_OGR.pickle', 'wb') as f:
    pickle.dump(array, f)

Array : ROME x NAF



In [27]:

    
def ponderation_duree(duree_jours):
    return 1 if duree_jours >= 30 else 0



In [28]:

    
df.columns









    Out[28]:





Index(['dn_nbrpostesoffertscreation', 'dc_rome_id', 'dc_appelationrome_id',
       'dc_naturecontrat_id', 'dc_typecontrat_id', 'dc_unitedureecontrat',
       'dc_duree_contrat_id', 'dc_naf2', 'duree_jours'],
      dtype='object')



In [29]:

    
n_NAF = len(NAF_df)
n_ROME = len(ROME_df)



In [30]:

    
array = np.zeros((n_NAF, n_ROME))


for i, row in df.iterrows():
    (dn_nbrpostesoffertscreation, dc_rome_id, dc_appelationrome_id,
     dc_naturecontrat_id, dc_typecontrat_id, dc_unitedureecontrat,
     dc_duree_contrat_id, dc_naf2, duree_jours) = row
    
    if (dc_naf2 in NAF_df.index) and (dc_rome_id in ROME_df.index):
        NAF_index = NAF_df.index.get_loc(dc_naf2)
        ROME_index = ROME_df.index.get_loc(dc_rome_id)
            
        poids = ponderation_duree(duree_jours) * int(dn_nbrpostesoffertscreation)
        
        array[NAF_index, ROME_index] += poids

    if i % 100000 == 0:
        print(i)



In [31]:

    
with open('../array_offres.pickle', 'wb') as f:
    pickle.dump(array, f)



In [ ]: