In [1]:
import pickle
from collections import defaultdict
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.cluster import AffinityPropagation
In [2]:
# Load the three pipe-separated reference tables (ROME, OGR and NAF).
# The ROME table is indexed by its first column; the OGR and NAF tables are
# re-indexed by their 'OGR' / 'NAF' columns after loading.
ROME_df = pd.read_csv(
    '../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv',
    index_col=0,
    sep='|',
    dtype=str,
)
OGR_df = pd.read_csv(
    '../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv',
    sep='|',
    dtype=str,
).set_index('OGR')
NAF_df = pd.read_csv(
    '../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv',
    sep='|',
    encoding="utf-8",
).set_index(['NAF'])
In [3]:
# Load the pickled offers array used by every later cell.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load this from a trusted source.
with open('../array_offres_OGR.pickle', 'rb') as pickle_file:
    offres_OGR = pickle.load(pickle_file)
In [4]:
# Inspect the dimensions of the loaded array (no output was saved for this cell).
offres_OGR.shape
Out[4]:
In [5]:
# Sum along axis 0 to get one total per column, then key those totals by OGR code.
# Assumes the columns of offres_OGR follow the row order of OGR_df — TODO confirm.
liste_nb_embauche_OGR = offres_OGR.sum(axis=0)
nb_embauche_by_OGR = {
    OGR: nb_embauche
    for OGR, nb_embauche in zip(OGR_df.index, liste_nb_embauche_OGR)
}
In [6]:
def get_matrix(ROME):
    """Compute the number of hires per NAF for the OGR codes of one ROME code.

    The columns of the global ``offres_OGR`` array that belong to the OGR codes
    of ``ROME`` (as listed in ``OGR_df.ROME``) are extracted. OGR codes whose
    column sums to zero are excluded from the matrix and reported separately.

    Returns:
        A tuple ``(ROME_matrix, non_empty_OGRs, empty_OGRs)``: the extracted
        columns with a positive sum, the OGR codes of those columns (same
        order), and the OGR codes whose column sums to zero.
    """
    codes = list(OGR_df[OGR_df.ROME == ROME].index)
    column_positions = [OGR_df.index.get_loc(code) for code in codes]
    all_columns = offres_OGR[:, column_positions]

    # One pass over the column totals serves both the "kept" and "dropped" masks.
    column_totals = all_columns.sum(0)
    kept = np.nonzero(column_totals > 0)[0]
    dropped = np.nonzero(column_totals == 0)[0]

    non_empty_OGRs = [codes[position] for position in kept]
    empty_OGRs = [codes[position] for position in dropped]
    return all_columns[:, kept], non_empty_OGRs, empty_OGRs
In [7]:
def similarity(A):
    """Compute the cosine-similarity matrix between the columns of ``A``.

    Each column is scaled to unit Euclidean norm, then the matrix of pairwise
    dot products between columns is returned.

    Args:
        A: 2-D array-like; each column is one item to compare (here, the
            hiring profile of one OGR code).

    Returns:
        A square float array with one row and one column per column of ``A``.
        Entries involving an all-zero column are 0 (instead of NaN).
    """
    A = np.asarray(A, dtype=float)
    norm = np.sqrt(np.power(A, 2).sum(0))
    # An all-zero column has norm 0; dividing by it would yield NaN and poison
    # every downstream argmax/clustering step. Dividing by 1 instead keeps the
    # column at zero, so its similarity with everything is 0.
    safe_norm = np.where(norm == 0, 1.0, norm)
    A_normalized = A / safe_norm
    return np.dot(A_normalized.T, A_normalized)
In [29]:
def print_clusters(clusters):
    """Format clusters of OGR codes as text.

    Each OGR code produces one line ``"<code> <label> (<hire count>)"``, using
    the label from ``OGR_df`` and the count from ``nb_embauche_by_OGR``. Every
    cluster is followed by a blank line.

    Args:
        clusters: iterable of iterables of OGR codes.

    Returns:
        The formatted text as a single string.
    """
    pieces = []
    for members in clusters:
        pieces.extend(
            '{} {} ({})\n'.format(code, OGR_df.loc[code].label, nb_embauche_by_OGR[code])
            for code in members
        )
        pieces.append('\n')
    return ''.join(pieces)
In [ ]:
# Threshold the similarity matrix: True where two items have similarity above 0.7.
# NOTE(review): `sim` is only assigned inside functions in the cells visible here;
# this cell presumably relied on leftover kernel state — confirm before re-running.
bool_sim = sim>0.7
# Raise the boolean matrix to the 100th power; presumably meant as a transitive
# closure of the "similar" relation — verify how numpy multiplies bool matrices.
equi = np.linalg.matrix_power(bool_sim, 100)
# Show the resulting matrix as an image, without pixel interpolation.
plt.imshow(equi, interpolation='none')
In [ ]:
# Group the row indexes of `equi` by the smallest column index that is truthy
# in that row; each group becomes one cluster (a list of row indexes).
# NOTE(review): `ROME_OGRs` and `equi` must already exist in the kernel;
# `ROME_OGRs` is not assigned at top level in the visible cells — confirm.
groups = defaultdict(list)
for i in range(len(ROME_OGRs)):
    repres = np.nonzero(equi[i, :])[0].min()
    groups[repres].append(i)
clusters = list(groups.values())
clusters
In [ ]:
# `clusters` holds integer positions into ROME_OGRs, whereas print_clusters looks
# each item up by OGR code (OGR_df is indexed by string codes). Translate the
# positions to codes first, otherwise the lookup raises KeyError.
print_clusters([[ROME_OGRs[i] for i in cluster] for cluster in clusters])
In [10]:
def compute_clusters_affiny_propagation(sim, random_state=0):
    """Cluster items with affinity propagation on a precomputed similarity matrix.

    Args:
        sim: square similarity matrix (one row/column per item).
        random_state: seed forwarded to ``AffinityPropagation`` so repeated runs
            give the same clusters. Ignored on scikit-learn versions whose
            ``AffinityPropagation`` does not accept this parameter.

    Returns:
        A list of clusters, each a list of row indexes of ``sim``. Clusters are
        ordered by first appearance of their label. An empty matrix yields
        ``[]`` and a 1x1 matrix yields ``[[0]]`` without running the model.
    """
    if len(sim) == 0:
        return []
    if len(sim) == 1:
        return [[0]]
    try:
        model = AffinityPropagation(affinity='precomputed', random_state=random_state)
    except TypeError:
        # Older scikit-learn releases have no `random_state` argument.
        model = AffinityPropagation(affinity='precomputed')
    labels = model.fit_predict(sim)
    clusters = defaultdict(list)
    for index, label in enumerate(labels):
        clusters[label].append(index)
    return list(clusters.values())
In [12]:
def merge_small_OGR(ROME_matrix, ROME_OGRs, min_size=500):
    """Merge every small OGR column into its most similar neighbour.

    Repeatedly takes the column with the smallest total; while that total is
    below ``min_size`` and at least two columns remain, the column is added to
    the column with the highest cosine similarity and then removed.

    Args:
        ROME_matrix: 2-D array with one column per OGR code.
        ROME_OGRs: OGR codes, in the same order as the columns.
        min_size: minimum column total for an OGR code to stay on its own
            (default 500, the value previously hard-coded).

    Returns:
        A tuple ``(matrix, OGRs, represented)``: the reduced matrix, the codes
        of its remaining columns, and a ``defaultdict(list)`` mapping each
        remaining code to all codes merged into it, directly or through a
        chain of merges. The inputs are not modified.
    """
    # Work on copies: the in-place column update and `del` below must not
    # alter the caller's matrix and list.
    ROME_matrix = np.array(ROME_matrix)
    ROME_OGRs = list(ROME_OGRs)
    represented = defaultdict(list)
    while len(ROME_OGRs) >= 2:
        # Find the smallest OGR
        totals = ROME_matrix.sum(0)
        index_merged = totals.argmin()
        if totals[index_merged] >= min_size:
            # Exit if the smallest OGR is large enough
            break
        OGR_merged = ROME_OGRs[index_merged]
        # Find the closest OGR. Only the similarities of the merged column are
        # needed, so compute that single row instead of the full matrix.
        norms = np.sqrt(np.power(ROME_matrix, 2).sum(0))
        norms[norms == 0] = 1  # avoid NaN for all-zero columns
        normalized = ROME_matrix / norms
        sim_line = np.dot(normalized[:, index_merged], normalized)
        # Mask the column itself with -inf rather than 0: with 0, argmax could
        # still pick the column itself when every other similarity is 0.
        sim_line[index_merged] = -np.inf
        index_representative = sim_line.argmax()
        OGR_representative = ROME_OGRs[index_representative]
        # Merge the two OGRs
        ROME_matrix[:, index_representative] += ROME_matrix[:, index_merged]
        ROME_matrix = np.delete(ROME_matrix, [index_merged], axis=1)
        del ROME_OGRs[index_merged]
        # Remember the merged OGR is represented by the one it was merged into,
        # and hand over everything the merged OGR was itself representing so
        # those codes stay reachable from a code that still has a column.
        represented[OGR_representative].append(OGR_merged)
        represented[OGR_representative].extend(represented.pop(OGR_merged, []))
    return ROME_matrix, ROME_OGRs, represented
In [18]:
def cluster_indexes_to_names(clusters, ROME_OGRs, represented):
    """Translate clusters of column indexes into clusters of OGR codes.

    Each index is replaced by its OGR code, immediately followed by the codes
    that code represents. Representation may be chained (a represented code may
    itself have entries in ``represented``); chains are followed so that no
    code is dropped.

    Args:
        clusters: list of clusters, each a list of indexes into ``ROME_OGRs``.
        ROME_OGRs: OGR codes, indexable by the cluster indexes.
        represented: mapping from an OGR code to the list of codes it
            represents. Any mapping with ``.get`` works; it is not modified.

    Returns:
        A list with one list of OGR codes per input cluster.
    """
    clusters_OGR = []
    for cluster in clusters:
        cluster_OGR = []
        seen = set()  # guards against cycles / repeated codes in `represented`
        for OGR_index in cluster:
            pending = [ROME_OGRs[OGR_index]]
            while pending:
                OGR = pending.pop(0)
                if OGR in seen:
                    continue
                seen.add(OGR)
                cluster_OGR.append(OGR)
                # .get avoids inserting empty entries into a defaultdict and
                # also accepts a plain dict.
                pending.extend(represented.get(OGR, ()))
        clusters_OGR.append(cluster_OGR)
    return clusters_OGR
def classify_empty_OGR(ROME_matrix, empty_OGRs, clusters, clusters_OGR):
    """Attach the OGR codes without hires to the heaviest cluster.

    The weight of a cluster is the sum of the column totals of ``ROME_matrix``
    over the column indexes it contains.

    Args:
        ROME_matrix: 2-D array with one column per clustered OGR code.
        empty_OGRs: list of OGR codes to attach.
        clusters: clusters as lists of column indexes into ``ROME_matrix``.
        clusters_OGR: the same clusters as lists of OGR codes; updated in place.

    Returns:
        ``clusters_OGR``, with ``empty_OGRs`` appended to the heaviest cluster.
        When there is no cluster at all, the empty codes (if any) form a
        cluster of their own.
    """
    if len(clusters) == 0:
        # argmax of an empty sequence raises ValueError; with nothing to attach
        # to, the codes without hires become their own cluster.
        if len(empty_OGRs) > 0:
            clusters_OGR.append(list(empty_OGRs))
        return clusters_OGR
    OGR_weights = ROME_matrix.sum(0)
    cluster_weights = [
        sum(OGR_weights[OGR_index] for OGR_index in cluster)
        for cluster in clusters
    ]
    largest_cluster = int(np.argmax(cluster_weights))
    clusters_OGR[largest_cluster] += empty_OGRs
    return clusters_OGR
def compute_clusters(ROME):
    """Run the whole clustering pipeline for one ROME code.

    Steps: extract the hiring matrix, merge the small OGR codes, cluster the
    remaining ones on their similarity matrix, translate indexes back to OGR
    codes, and finally attach the OGR codes without hires.

    Returns:
        A list of clusters, each a list of OGR codes.
    """
    matrix, OGRs_with_hires, OGRs_without_hires = get_matrix(ROME)
    matrix, remaining_OGRs, represented = merge_small_OGR(matrix, OGRs_with_hires)
    index_clusters = compute_clusters_affiny_propagation(similarity(matrix))
    named_clusters = cluster_indexes_to_names(index_clusters, remaining_OGRs, represented)
    return classify_empty_OGR(matrix, OGRs_without_hires, index_clusters, named_clusters)
In [31]:
def print_ROME(ROME):
    """Build the text report for one ROME code.

    The report is a header line with the ROME code and its label from
    ``ROME_df``, a blank line, the formatted clusters of its OGR codes, and a
    trailing blank line.

    Returns:
        The report as a single string.
    """
    header = '*** {} : {} ***\n\n'.format(ROME, ROME_df.loc[ROME].label)
    body = print_clusters(compute_clusters(ROME))
    return header + body + '\n'
In [33]:
# Preview the clustering report for a single ROME code.
print(print_ROME('D1106'))
In [34]:
# Write the clustering report of every ROME code to a single text file.
# The encoding is set explicitly: without it, open() uses the platform default,
# which can fail or produce a different file if labels contain non-ASCII
# characters.
with open('../decoupage_ROME.txt', 'w', encoding='utf-8') as f:
    for ROME in ROME_df.index:
        f.write(print_ROME(ROME))
In [ ]: