In [1]:
import pickle
from collections import defaultdict

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.cluster import AffinityPropagation

Load


In [2]:
# ROME referential: '|'-separated CSV, first column used as the index, every column read as str.
# Later cells read its `label` column.
ROME_df = pd.read_csv('../referentiels/referentiel_ROME/20150921_arboprincipale28427_ROME.csv', index_col=0, sep='|', dtype=str)
# OGR referential: '|'-separated CSV read as str and indexed by its 'OGR' column.
# Later cells read its `ROME` and `label` columns.
OGR_df = pd.read_csv('../referentiels/referentiel_OGR/20150921_arboprincipale28427_OGR.csv', sep='|', dtype=str).set_index('OGR')
# NAF referential: '|'-separated UTF-8 CSV indexed by its 'NAF' column.
NAF_df = pd.read_csv('../referentiels/referentiel_NAF/naf2008_liste_n5_nouveau_header.csv', sep='|', encoding="utf-8").set_index(['NAF'])

In [3]:
# NOTE(review): pickle.load can execute arbitrary code -- only load this file if it comes from a trusted source.
# Later cells index offres_OGR as a 2-D array whose columns follow the row order of OGR_df
# (presumably hiring counts per NAF code x OGR code -- TODO confirm against whatever produced the pickle).
with open('../array_offres_OGR.pickle', 'rb') as f:
    offres_OGR = pickle.load(f)

In [4]:
# Display the dimensions of the loaded array as a quick sanity check.
offres_OGR.shape


Out[4]:
(732, 10877)

In [5]:
# Column sums of offres_OGR: one total per column.
liste_nb_embauche_OGR = offres_OGR.sum(0)
# Map each OGR code to its total, pairing OGR_df rows with array columns by position.
# NOTE(review): zip() silently truncates if the lengths differ -- confirm the column count equals len(OGR_df).
nb_embauche_by_OGR = dict(zip(OGR_df.index, liste_nb_embauche_OGR))

Data transformation


In [6]:
def get_matrix(ROME):
    """Extract the hiring matrix restricted to the OGR codes of one ROME code.

    The columns of ``offres_OGR`` belonging to the OGR codes attached to
    ``ROME`` are selected; columns whose total is zero are dropped.

    Returns a 3-tuple:
        * the sub-matrix limited to the columns with a strictly positive total,
        * the OGR codes of those columns (same order as the columns),
        * the OGR codes whose column total is exactly zero.
    """
    ROME_OGRs = list(OGR_df[OGR_df.ROME == ROME].index)
    column_positions = [OGR_df.index.get_loc(code) for code in ROME_OGRs]
    hires = offres_OGR[:, column_positions]

    # A single pass over the column totals drives both selections.
    totals = hires.sum(0)
    kept = np.flatnonzero(totals > 0)
    dropped = np.flatnonzero(totals == 0)

    non_empty_OGRs = [ROME_OGRs[k] for k in kept]
    empty_OGRs = [ROME_OGRs[k] for k in dropped]
    return hires[:, kept], non_empty_OGRs, empty_OGRs

In [7]:
def similarity(A):
    """Compute the cosine-similarity matrix between the columns of ``A``.

    Each column is scaled to unit Euclidean norm and the pairwise dot
    products of the scaled columns are returned, so entry ``[i, j]`` is the
    cosine similarity between columns ``i`` and ``j``.

    A column made only of zeros has no direction; instead of dividing by
    zero (which would fill its row and column with NaN) it is left as zeros,
    giving it a similarity of 0 with every column, itself included.
    """
    norm = np.sqrt(np.power(A, 2).sum(0))
    # Replace zero norms by 1 so that all-zero columns stay all-zero after scaling.
    safe_norm = np.where(norm > 0, norm, 1)
    A_normalized = A / safe_norm
    return np.dot(A_normalized.T, A_normalized)

In [29]:
def print_clusters(clusters):
    """Format clusters of OGR codes as text.

    Every OGR code produces one line ``"<code> <label> (<total>)"`` using the
    label found in ``OGR_df`` and the total found in ``nb_embauche_by_OGR``.
    Each cluster is followed by a blank line. The text is returned, not printed.
    """
    blocks = []
    for cluster in clusters:
        lines = [
            '{} {} ({})\n'.format(OGR, OGR_df.loc[OGR].label, nb_embauche_by_OGR[OGR])
            for OGR in cluster
        ]
        blocks.append(''.join(lines) + '\n')
    return ''.join(blocks)

Naive clustering (using a threshold)


In [ ]:
# Build the inputs this section works on (`sim`, `ROME_OGRs`) so that the cell
# runs on a fresh kernel; D1106 is the example ROME code used further down.
ROME_matrix, ROME_OGRs, empty_OGRs = get_matrix('D1106')
sim = similarity(ROME_matrix)

# Two OGRs are linked when their similarity exceeds 0.7.
bool_sim = sim>0.7
# A high power of the link matrix connects OGRs joined through a chain of links.
# NOTE(review): assumes matrix_power on a boolean array acts as a logical product and
# that 100 steps are enough for the number of OGRs in one ROME -- confirm.
equi = np.linalg.matrix_power(bool_sim, 100)
plt.imshow(equi, interpolation='none')

In [ ]:
# Group the OGR positions by their representative: the smallest position
# flagged in their row of `equi`.
members_by_repres = defaultdict(list)
for i, _ in enumerate(ROME_OGRs):
    repres = np.flatnonzero(equi[i]).min()
    members_by_repres[repres].append(i)
clusters = list(members_by_repres.values())
clusters

In [ ]:
# `clusters` holds positions within ROME_OGRs, whereas print_clusters looks up OGR codes:
# translate positions to codes first, and print the text so that line breaks are rendered.
print(print_clusters([[ROME_OGRs[i] for i in cluster] for cluster in clusters]))

Affinity propagation


In [10]:
def compute_clusters_affiny_propagation(sim):
    """Cluster items with affinity propagation on a precomputed similarity matrix.

    ``sim`` is the square similarity matrix. The result is a list of clusters,
    each cluster being the list of row positions that received the same label.
    Inputs with zero or one item are answered directly without fitting a model.
    """
    n_items = len(sim)
    if n_items < 2:
        return [[0]] if n_items == 1 else []

    labels = AffinityPropagation(affinity='precomputed').fit_predict(sim)

    # Collect positions label by label, in order of first appearance.
    members_by_label = {}
    for position, label in enumerate(labels):
        members_by_label.setdefault(label, []).append(position)
    return list(members_by_label.values())

In [12]:
def merge_small_OGR(ROME_matrix, ROME_OGRs, min_size=500):
    """Repeatedly merge the smallest OGR column into its most similar column.

    While at least two columns remain and the smallest column total is below
    ``min_size``, that column is added to the column it is most similar to
    (according to ``similarity``) and then removed.

    Parameters:
        ROME_matrix: 2-D array with one column per OGR code.
        ROME_OGRs: OGR codes, in column order.
        min_size: column total from which an OGR is considered large enough
            to stay on its own (default 500).

    Returns a 3-tuple ``(matrix, OGRs, represented)``: the reduced matrix, the
    codes of its columns, and a mapping from each remaining code to every
    code that was merged into it, directly or through an intermediate merge.

    The inputs are not modified; the function works on copies.
    """
    # Work on copies so that the caller's array and list are left untouched.
    ROME_matrix = np.array(ROME_matrix)
    ROME_OGRs = list(ROME_OGRs)
    represented = defaultdict(list)

    while len(ROME_OGRs) >= 2:
        # Find the smallest OGR
        index_merged = ROME_matrix.sum(0).argmin()
        if ROME_matrix[:, index_merged].sum() >= min_size:
            # Exit if the smallest OGR is large enough
            break
        OGR_merged = ROME_OGRs[index_merged]

        # Find the closest OGR. The OGR itself is masked with -inf (not 0) so it
        # can never be picked, even when every other similarity is 0.
        sim_line = similarity(ROME_matrix)[index_merged]
        sim_line[index_merged] = -np.inf
        index_representative = sim_line.argmax()
        OGR_representative = ROME_OGRs[index_representative]

        # Merge the two OGRs
        ROME_matrix[:, index_representative] += ROME_matrix[:, index_merged]
        ROME_matrix = np.delete(ROME_matrix, [index_merged], axis=1)
        del ROME_OGRs[index_merged]

        # Remember the merged OGR is represented by the other one, and hand over
        # the OGRs it was itself representing so none of them gets lost.
        represented[OGR_representative].append(OGR_merged)
        represented[OGR_representative].extend(represented.pop(OGR_merged, []))

    return ROME_matrix, ROME_OGRs, represented

In [18]:
def _with_represented(OGR, represented):
    """Return ``OGR`` followed by every code it represents, directly or transitively."""
    members = [OGR]
    seen = {OGR}
    position = 0
    while position < len(members):
        # .get() neither inserts keys into a defaultdict nor fails on a plain dict.
        for merged in represented.get(members[position], ()):
            if merged not in seen:  # guards against cycles such as a code listed under itself
                seen.add(merged)
                members.append(merged)
        position += 1
    return members


def cluster_indexes_to_names(clusters, ROME_OGRs, represented):
    """Translate clusters of column positions into clusters of OGR codes.

    Parameters:
        clusters: list of clusters, each a list of positions within ``ROME_OGRs``.
        ROME_OGRs: OGR codes, indexed by position.
        represented: mapping from an OGR code to the codes merged into it.

    Each position is replaced by its OGR code followed by the codes that code
    represents. Chains are followed (a code represented by a represented
    code is included too), so no merged code is dropped. ``represented`` is
    only read, never modified.
    """
    clusters_OGR = []
    for cluster in clusters:
        cluster_OGR = []
        for OGR_index in cluster:
            cluster_OGR += _with_represented(ROME_OGRs[OGR_index], represented)
        clusters_OGR.append(cluster_OGR)
    return clusters_OGR

def classify_empty_OGR(ROME_matrix, empty_OGRs, clusters, clusters_OGR):
    """Attach the OGR codes without any data to the heaviest cluster.

    Parameters:
        ROME_matrix: 2-D array with one column per clustered OGR.
        empty_OGRs: OGR codes that were left out of the clustering.
        clusters: clusters expressed as lists of column positions.
        clusters_OGR: the same clusters expressed as lists of OGR codes.

    The weight of a cluster is the sum of the column totals of its members.
    ``empty_OGRs`` is appended to the entry of ``clusters_OGR`` matching the
    heaviest cluster. ``clusters_OGR`` is updated in place and returned.

    When there is no cluster at all, the empty OGRs (if any) form a single
    cluster of their own instead of raising on the empty weight list.
    """
    if not clusters:
        if empty_OGRs:
            clusters_OGR.append(list(empty_OGRs))
        return clusters_OGR

    OGR_weights = ROME_matrix.sum(0)
    cluster_weights = [
        sum(OGR_weights[OGR_index] for OGR_index in cluster)
        for cluster in clusters
    ]
    largest_cluster = int(np.argmax(cluster_weights))
    clusters_OGR[largest_cluster] += empty_OGRs
    return clusters_OGR


def compute_clusters(ROME):
    """Run the whole clustering pipeline for one ROME code.

    Steps: extract the matrix of the ROME's OGR codes, merge the small OGRs,
    cluster the remaining ones by affinity propagation on their similarity
    matrix, translate the clusters to OGR codes, and finally attach the OGR
    codes without data. Returns the clusters as lists of OGR codes.
    """
    matrix, OGR_codes, OGRs_without_data = get_matrix(ROME)
    matrix, OGR_codes, merged_into = merge_small_OGR(matrix, OGR_codes)

    index_clusters = compute_clusters_affiny_propagation(similarity(matrix))
    named_clusters = cluster_indexes_to_names(index_clusters, OGR_codes, merged_into)

    return classify_empty_OGR(matrix, OGRs_without_data, index_clusters, named_clusters)

Loop


In [31]:
def print_ROME(ROME):
    """Build the text report of one ROME code.

    The report is a header line with the code and its label from ``ROME_df``,
    followed by the clusters computed for that code as formatted by
    ``print_clusters``, and a trailing blank line. The text is returned, not printed.
    """
    header = '*** {} : {} ***\n\n'.format(ROME, ROME_df.loc[ROME].label)
    body = print_clusters(compute_clusters(ROME))
    return header + body + '\n'

In [33]:
# Worked example: display the clusters computed for ROME code D1106.
print(print_ROME('D1106'))


*** D1106 : Vente en alimentation ***

20560 Vendeur / Vendeuse en boulangerie-pâtisserie (45782.0)

20567 Vendeur / Vendeuse en chocolaterie (2981.0)
20568 Vendeur / Vendeuse en confiserie (811.0)

20576 Vendeur / Vendeuse en épicerie (1467.0)
20577 Vendeur / Vendeuse en épicerie fine (2868.0)
20570 Vendeur / Vendeuse en crèmerie (434.0)
38776 Vendeur / Vendeuse en produits biologiques (2067.0)
20622 Vendeur / Vendeuse en produits diététiques (1062.0)

20525 Vendeur / Vendeuse de fruits et légumes (3062.0)
16388 Marchand / Marchande de fruits et légumes (51.0)
20530 Vendeur / Vendeuse de primeurs (249.0)
20540 Vendeur / Vendeuse en alimentation générale (3395.0)
14863 Epicier / Epicière (353.0)
20558 Vendeur / Vendeuse en boucherie (1443.0)
20559 Vendeur / Vendeuse en boucherie-charcuterie (2069.0)
20627 Vendeur / Vendeuse en rôtisserie (236.0)
20564 Vendeur / Vendeuse en charcuterie (4729.0)
20612 Vendeur / Vendeuse en poissonnerie (3842.0)
20623 Vendeur / Vendeuse en produits frais (1928.0)
20645 Vendeur / Vendeuse rayon fromages (1194.0)
20646 Vendeur / Vendeuse rayon traiteur (926.0)

20635 Vendeur / Vendeuse en vins et spiritueux (1913.0)




In [34]:
# Write the report of every ROME code to a single text file.
# The labels contain accented characters, so the encoding is set explicitly
# instead of depending on the platform default.
with open('../decoupage_ROME.txt', 'w', encoding='utf-8') as f:
    for ROME in ROME_df.index:
        f.write(print_ROME(ROME))

In [ ]: