In [1]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from jupyterthemes import jtplot
import re
import unidecode

jtplot.style()

def millions(x, pos):
    return '%1.f M€' % (x*1e-6)

def custom_parser(s):
    if type(s) == str:
        return pd.datetime.strptime(s, '%d/%m/%Y')
    return pd.np.nan

Analyse des avantages


In [2]:
avantage_data = pd.read_csv('../../data/tmp/declaration_avantage_2017_07_27_04_00.csv', 
                       parse_dates=['avant_date_signature'], 
                       date_parser=custom_parser,
                       sep=';')


/Users/fmassot/Envs/RC3/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (12,13,14,18,19,25,26,27,28,29,34) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [4]:
avantage_data.head()


Out[4]:
entreprise_identifiant denomination_sociale ligne_identifiant ligne_rectification benef_categorie_code categorie benef_nom benef_prenom benef_qualite_code qualite ... benef_etablissement_codepostal benef_etablissement_ville benef_denomination_sociale benef_objet_social ligne_type avant_date_signature avant_montant_ttc avant_nature avant_convention_lie semestre
0 LFSWZPFF AbbVie 1H12015NR1792 N [PRS] Professionnel de santé REY MARIE [10] Médecin ... NaN NaN NaN NaN [A] 2015-02-16 40 REPAS NaN S1
1 LFSWZPFF AbbVie 1H12015NR2265 N [PRS] Professionnel de santé ORANCEON LIONEL [10] Médecin ... NaN NaN NaN NaN [A] 2015-03-12 35 REPAS NaN S1
2 IOUJLTOH GLOBAL D GD-3712 N [PRS] Professionnel de santé LECONTE CAROLE [40] Chirurgien-dentiste ... NaN NaN NaN NaN [A] 2016-02-04 94 REPAS NaN S1
3 IOUJLTOH GLOBAL D GD-3628 N [PRS] Professionnel de santé HENRY FRANCOIS [40] Chirurgien-dentiste ... NaN NaN NaN NaN [A] 2016-01-11 50 REPAS NaN S1
4 IOUJLTOH GLOBAL D GD-3418 N [PRS] Professionnel de santé JOUINES OLIVIER [40] Chirurgien-dentiste ... NaN NaN NaN NaN [A] 2016-01-26 20 REPAS NaN S1

5 rows × 36 columns

Montant total des avantages données aux étudiants depuis 2012


In [5]:
print("Total des avantages depuis mi 2013 :", millions(avantage_data.avant_montant_ttc.sum(), ""))
print("Nombre d'avantages :", avantage_data.avant_montant_ttc.shape[0])
print("Montant moyen d'un avantage :", avantage_data.avant_montant_ttc.mean())


Total des avantages depuis mi 2013 : 911 M€
Nombre d'avantages : 6894196
Montant moyen d'un avantage : 132.182683956

Evolution annuelle du montant des avantages


In [7]:
avantage_data_by_month = avantage_data.groupby(avantage_data.avant_date_signature.dt.to_period("M"))
avantage_data_by_quarter = avantage_data.groupby(avantage_data.avant_date_signature.dt.to_period("Q"))
avantage_data_by_year = avantage_data.groupby(avantage_data.avant_date_signature.dt.to_period("A"))
avantage_data_by_year.avant_montant_ttc.sum()


Out[7]:
avant_date_signature
2012     31882376
2013    150507607
2014    242825450
2015    258937971
2016    227139927
Freq: A-DEC, Name: avant_montant_ttc, dtype: int64

In [8]:
with sns.axes_style("whitegrid"):
    sns.set_context("talk")
    plot = avantage_data_by_year.avant_montant_ttc.sum().plot()
    formatter = mpl.ticker.FuncFormatter(millions)
    plot.yaxis.set_major_formatter(formatter)
    plot = avantage_data_by_quarter.avant_montant_ttc.sum().plot()
    formatter = mpl.ticker.FuncFormatter(millions)
    plot.yaxis.set_major_formatter(formatter)


Montant des avantages par type


In [9]:
repas_re = re.compile('(repas|restauration|restaurant|dejeuner|diner|collation|bouche|coktail|cocktail|traiteur|buffet|reception)')
transport_re = re.compile('(transport|deplacement|train)')
hebergement_re = re.compile('(hebergement|hospitalite|nuit|voyage|hotel|bergement)')
congre_re = re.compile("(inscription|congres|invitation|conferences)")
autre_re = re.compile("(autre|divers|cadeau)")
don_re = re.compile("don")

def normalize_nature(nature):
    nature = unidecode.unidecode(nature).lower()
    if repas_re.search(nature):
        return "REPAS"
    if hebergement_re.search(nature):
        return "HEBERGEMENT"
    if transport_re.search(nature):
        return "TRANSPORT"
    if congre_re.search(nature):
        return "CONGRES"
    if don_re.search(nature):
        return "DON"
    if autre_re.search(nature):
        return "AUTRE"
    
    return nature

avantage_data["normalized_avant_nature"] = avantage_data.avant_nature.apply(normalize_nature)
avant_by_nature = avantage_data.groupby('normalized_avant_nature').sum()

In [10]:
avant_by_nature.reset_index(level=0, inplace=True)
avant_by_nature.sort_values(by="avant_montant_ttc", ascending=False, inplace=True)

In [11]:
with sns.axes_style("whitegrid"):
    sns.set_context("talk")
    sns.barplot(x="avant_montant_ttc", y="normalized_avant_nature", data=avant_by_nature[:10], label="Total", color="b")



In [ ]: