In [1]:
import bisect
import pickle
from collections import Counter
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
# Reference tables, all pipe-separated. Each frame is indexed by its code
# column (first column for ROME, 'OGR' and 'NAF' for the other two).
# Note that the NAF table is the only one not read with dtype=str.
ROME_df = pd.read_csv(
    '../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv',
    sep='|',
    dtype=str,
    index_col=0,
)
OGR_df = pd.read_csv(
    '../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv',
    sep='|',
    dtype=str,
).set_index('OGR')
NAF_df = pd.read_csv(
    '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
    sep='|',
    encoding="utf-8",
).set_index(['NAF'])
In [3]:
# Headcount bands: (band code, inclusive lower bound of the band).
# Kept as pairs so that a code and its bound cannot drift apart.
_TRANCHES_ET_SEUILS = [
    ('00', 0), ('01', 1), ('02', 3), ('03', 6), ('11', 10),
    ('12', 20), ('21', 50), ('22', 100), ('31', 200), ('32', 250),
    ('41', 500), ('42', 1000), ('51', 2000), ('52', 5000), ('53', 10000),
]
# 'NN' is an extra, last band that has no lower bound.
tranches_effectif = [code for code, _seuil in _TRANCHES_ET_SEUILS] + ['NN']
seuils_tranches_effectif = [seuil for _code, seuil in _TRANCHES_ET_SEUILS]
In [4]:
def trouve_tranche_effectif(effectif):
    """Return the headcount band that contains ``effectif``.

    Args:
        effectif: headcount as a number.

    Returns:
        A ``(index, code)`` tuple where ``index`` is the position of the band
        in ``tranches_effectif`` and ``code`` is the band code at that
        position. The band chosen is the one with the greatest lower bound in
        ``seuils_tranches_effectif`` that is <= ``effectif``.

        A negative ``effectif`` is below every bound; it is mapped to the last
        entry of ``tranches_effectif`` with its real (non-negative) index.
    """
    if effectif < 0:
        # bisect would yield -1 here, which silently wraps around to the last
        # list entry; make that fallback explicit and return a proper index.
        index = len(tranches_effectif) - 1
    else:
        index = bisect.bisect(seuils_tranches_effectif, effectif) - 1
    return index, tranches_effectif[index]
In [5]:
# Relative path of the pipe-separated input file loaded into `df`.
filename = '../LBB_ETT_ETT_20160430_20170530_20170530_162434_clean.csv'
In [6]:
# dtype=str reads every column as text. keep_default_na=False stops pandas
# from turning its default missing-value markers into NaN, so they stay as
# literal strings (the rest of the notebook compares values against 'NULL').
df = pd.read_csv(filename, sep='|', dtype=str, keep_default_na=False)
In [7]:
# Raw values of the dn_nbjcaltotalmission column, still as strings.
duree_liste = df['dn_nbjcaltotalmission'].tolist()
In [8]:
# Drop the 'NULL' placeholders and convert the remaining values to int.
durees_valides = []
for duree in duree_liste:
    if duree != 'NULL':
        durees_valides.append(int(duree))
duree_liste = durees_valides
In [9]:
# Number of rows observed for each distinct duration value.
comptage_duree = Counter(duree_liste)
In [10]:
# Largest duration value present in the data (iterating a Counter yields its keys).
duree_max = max(comptage_duree)
duree_max
Out[10]:
In [11]:
# Dense histogram: histo[d] is the number of rows whose duration equals d,
# for every d from 0 to duree_max (a Counter returns 0 for absent keys).
histo = [comptage_duree[duree] for duree in range(0, duree_max + 1)]

# Displayed: total number of rows, then number of rows with duration >= 30.
total_lignes = sum(histo)
lignes_30_et_plus = sum(histo[30:])
total_lignes, lignes_30_et_plus
Out[11]:
In [12]:
# Row count per duration value, restricted to the first 50 values (0..49).
# Explicit figure/axes with labels so the plot can be read on its own;
# the trailing ';' hides the repr of the last call in the cell output.
fig, ax = plt.subplots()
ax.plot(histo[:50])
ax.set(
    xlabel='dn_nbjcaltotalmission',
    ylabel='number of rows',
    title='Rows per dn_nbjcaltotalmission value (first 50 values)',
);
Out[12]:
In [13]:
# volume[d] = d * (number of rows with duration d), i.e. each duration value
# weighted by how many rows carry it.
volume = [histo[i] * i for i in range(duree_max + 1)]

# Same curve as a labelled figure, restricted to the first 100 values (0..99);
# the trailing ';' hides the repr of the last call in the cell output.
fig, ax = plt.subplots()
ax.plot(volume[:100])
ax.set(
    xlabel='dn_nbjcaltotalmission',
    ylabel='dn_nbjcaltotalmission x number of rows',
    title='Duration-weighted row count (first 100 values)',
);
Out[13]:
In [14]:
# Displayed: total volume, volume carried by durations >= 30, and the share
# of the total that the latter represents.
volume_total = sum(volume)
volume_30_et_plus = sum(volume[30:])
volume_total, volume_30_et_plus, float(volume_30_et_plus) / volume_total
Out[14]:
In [15]:
def ponderation_duree(nbrjourtravaille, dn_nbmission, seuil_jours=30):
    """Return the 0/1 weight of a row based on its cumulated duration.

    Args:
        nbrjourtravaille: number of days, as an int or an int-like string.
        dn_nbmission: number of missions, as an int or an int-like string.
        seuil_jours: minimum value of ``nbrjourtravaille * dn_nbmission``
            for the row to count. Defaults to 30.

    Returns:
        1 if ``int(nbrjourtravaille) * int(dn_nbmission) >= seuil_jours``,
        otherwise 0.

    Raises:
        ValueError: if either argument cannot be converted by ``int()``
            (for instance the 'NULL' placeholder).
    """
    duree_cumulee = int(nbrjourtravaille) * int(dn_nbmission)
    return 1 if duree_cumulee >= seuil_jours else 0
In [16]:
# One counter cell per (NAF code, ROME code, headcount band) combination.
n_NAF, n_ROME, n_tranches = len(NAF_df), len(ROME_df), len(tranches_effectif)
forme_comptage = (n_NAF, n_ROME, n_tranches)

# Two independent zero-initialised arrays of identical shape.
array_ROME1 = np.zeros(forme_comptage)
array_ROME2 = np.zeros(forme_comptage)
In [17]:
# Fill the (NAF, ROME, headcount band) count arrays, one input row at a time.
#
# A row is counted only when its duration is not 'NULL' and both its NAF code
# and its first ROME code exist in the reference tables. Its weight is added
# to array_ROME1 and, when the second ROME code is known too, to array_ROME2.
#
# NOTE(review): dn_nbmission is not checked against 'NULL'; if such rows
# exist, ponderation_duree raises ValueError and the loop stops half-way.
# NOTE(review): dc_trancheeffectif_id is parsed as a headcount and binned by
# trouve_tranche_effectif. If the column already holds band codes, this
# re-binning is wrong -- confirm against the data.

# Start from zero so that re-running this cell alone does not double-count.
array_ROME1.fill(0)
array_ROME2.fill(0)

# Band used when the headcount cannot be parsed as an integer.
tranche_inconnue = 'NN'
index_tranche_inconnue = tranches_effectif.index(tranche_inconnue)

# itertuples(name=None) yields plain (index, value, value, ...) tuples, which
# avoids building one Series per row. Columns are unpacked by position, so
# the input file must contain exactly these seven columns in this order.
for (i, dn_nbjcaltotalmission, dc_nafinsee700_id, dn_nbmission,
     dc_nafrefv2_id, dc_trancheeffectif_id, dc_romev3_1_id,
     dc_romev3_2_id) in df.itertuples(name=None):
    if ((dn_nbjcaltotalmission != 'NULL') and (dc_nafrefv2_id in NAF_df.index) and
            (dc_romev3_1_id in ROME_df.index)):
        NAF_index = NAF_df.index.get_loc(dc_nafrefv2_id)
        ROME_index = ROME_df.index.get_loc(dc_romev3_1_id)
        try:
            effectif = int(dc_trancheeffectif_id)
        except ValueError:
            # Unparseable headcount: fall back to the last band.
            tranche = tranche_inconnue
            tranche_index = index_tranche_inconnue
        else:
            tranche_index, tranche = trouve_tranche_effectif(effectif)
        poids = ponderation_duree(dn_nbjcaltotalmission, dn_nbmission)
        array_ROME1[NAF_index, ROME_index, tranche_index] += poids
        if dc_romev3_2_id in ROME_df.index:
            ROME2_index = ROME_df.index.get_loc(dc_romev3_2_id)
            array_ROME2[NAF_index, ROME2_index, tranche_index] += poids
    # Progress indicator, printed for every 100000th row label.
    if i % 100000 == 0:
        print(i)
In [18]:
# Persist the first-ROME-code count array next to the input data.
with open('../array_ROME1_ETT.pickle', 'wb') as fichier_sortie:
    pickle.dump(array_ROME1, fichier_sortie)
In [19]:
# Persist the second-ROME-code count array next to the input data.
with open('../array_ROME2_ETT.pickle', 'wb') as fichier_sortie:
    pickle.dump(array_ROME2, fichier_sortie)
In [ ]: