In [192]:
import json
import os, sys
import pandas as pd
curdir = os.path.abspath('./..')
from localfinance.spiders.localfinance_spider import uniformize_code, convert_dom_code, convert_city
data_dir = os.path.join(curdir, 'scraped_data')
insee_filepath = os.path.join(curdir, 'data/france2013.txt')
quantiles = pd.np.arange(0.01, 1, 0.01)
def plot_quantiles(series, figsize=(12,10)):
pd.Series(quantiles, index=[series.quantile(q) for q in quantiles]).plot(figsize=figsize)
from localfinance.account_parsing import city_account
In [140]:
# Load insee code of cities
insee_df = pd.io.parsers.read_csv(insee_filepath, '\t')
In [142]:
# Load data on 2012
df = pd.DataFrame(json.load(open(os.path.join(data_dir, 'cities_2012.json'))))
In [144]:
print "Les informations suivantes ont été récupérées pour chacune des communes: \n %s"%df.columns.tolist()
In [207]:
names_mapping = pd.DataFrame([dict([(k, v.get('name', '')) for k, v in city_account.nodes.items()])])
names_mapping.transpose()
Out[207]:
In [170]:
# Sanity checks
print u"Nombre de communes dupliquées: %s"%(df['insee_code'].count() - df['insee_code'].unique().size)
print u"Numbre of communes non dupliquées: %s"%df['insee_code'].unique().size
df['dep'] = df['insee_code'].apply(lambda r: r[:3])
gp_by_dep = df.groupby('dep')
print u"Nombre de départements: %s"%gp_by_dep.dep.size().size
In [171]:
# Take only current cities (Cf. insee doc http://www.insee.fr/fr/methodes/default.asp?page=nomenclatures/cog/doc_ffrancee.htm)
insee_df['DEP'] = uniformize_code(insee_df, 'DEP')
insee_df['COM'] = uniformize_code(insee_df, 'COM')
insee_df['DEP'] = convert_dom_code(insee_df)
insee_df['COM'] = insee_df.apply(convert_city, axis=1)
insee_df['DEPCOM'] = insee_df['DEP'] + insee_df['COM']
# Remove MAYOTTE department,
current_insee_df = insee_df[(insee_df['ACTUAL'] == 1) & (insee_df['DEP'] <> '976')]
print "Nombre de commune sans le département de Mayotte au 2013/01/01 selon l'insee: %s"%current_insee_df['DEPCOM'].unique().size
Certaines communes ne présentent aucune données financières sur 2012 sur le site des collectivités locales.
Quelques exemples de page où il n'y a effectivement aucune donnée sur 2012:
On notera que les données sont bien présentes sur les années précédentes.
Ci-dessous les 20 communes sans donnée financières.
(moyenne, écart-type, min, quantile 25%, 50%, 75% et max)
On notera que les charges de personnels (staff_costs) peuvent être négatives!
In [167]:
df[['operating_revenues', 'operating_costs', 'staff_costs', 'net_profit', 'investment_ressources',
'financing_capacity', 'investments_usage', 'debt_at_end_year', 'debt_annual_costs']].describe()[1:]
Out[167]:
In [162]:
, 'property_tax_rate', 'land_property_tax_rate',
'additionnal_land_property_tax_rate', 'business_property_contribution_rate']].describe()[1:]
Out[162]:
In [174]:
codes = set(current_insee_df['DEPCOM']).symmetric_difference(df['insee_code']).intersection(current_insee_df['DEPCOM'])
com_nodata = current_insee_df[['DEPCOM', 'NCC', 'DEP']][current_insee_df['DEPCOM'].apply(lambda r: r in codes)]
com_nodata.head(n=30)
Out[174]:
In [168]:
# Prepare data for analysis
df['property_tax_value_per_person'] = df['property_tax_value']/df['population']
df['home_tax_value_per_person'] = df['home_tax_value']/df['population']
df['debt_ratio'] = df['debt_annual_costs']/df['operating_revenues']
df['staff_costs_ratio'] = df['staff_costs']/df['operating_revenues']
df['staff_costs_per_person'] = df['staff_costs']/df['population']
In [175]:
df[['property_tax_rate', 'home_tax_rate']].describe()
Out[175]:
Une disparité entre les communes qui peut être très importante: on passe de 0% à 59% pour la taxe foncière, de 0% à 61% pour la taxe d'habitation.
In [176]:
plt.figure(figsize=(12,12));
df[['property_tax_rate', 'home_tax_rate']].boxplot()
df[['property_tax_rate', 'home_tax_rate', 'name', 'insee_code']].head(n=20)
Out[176]:
In [177]:
# Biggest property tax rate
_df = df.sort(columns='property_tax_rate', ascending=False)
_df[['property_tax_rate', 'property_tax_value', 'name', 'insee_code']].head(n=20)
Out[177]:
In [178]:
# Lowest property tax
_df = df.sort(columns='property_tax_rate', ascending=True)
_df[['property_tax_rate', 'name', 'insee_code']].head(n=20)
Out[178]:
In [179]:
# Biggest home tax rate
_df = df.sort(columns='home_tax_rate', ascending=False)
_df[['home_tax_rate', 'home_tax_value', 'name', 'insee_code']].head(n=20)
Out[179]:
In [180]:
#Biggest home tax value
_df = df.sort(columns='home_tax_value', ascending=False).dropna()
_df[['home_tax_rate', 'home_tax_value', 'home_tax_value_per_person', 'name', 'insee_code']].head(n=20)
Out[180]:
In [181]:
#Biggest home tax value per person
_df = df.sort(columns='home_tax_value_per_person', ascending=False).dropna()
_df[['home_tax_rate', 'home_tax_value', 'home_tax_value_per_person', 'name', 'insee_code']].head(n=20)
Out[181]:
In [182]:
# lowest home tax rate
_df = df.sort(columns='home_tax_value', ascending=True)
_df[['home_tax_rate', 'home_tax_value', 'name', 'insee_code']].dropna().head(n=20)
Out[182]:
In [183]:
df[['debt_ratio']].describe()
Out[183]:
In [184]:
# biggest debt ratio
plt.figure(figsize=(12,12));
df[['debt_ratio']].boxplot()
_df = df.sort(columns='debt_ratio', ascending=False)
_df[['debt_ratio', 'name', 'insee_code']].head(n=20)
Out[184]:
In [185]:
# lowest debt ratio
_df = df.sort(columns='debt_ratio', ascending=True)
_df[['debt_ratio', 'name', 'insee_code']].head(n=20)
Out[185]:
In [186]:
df[['staff_costs_ratio']].describe()
Out[186]:
In [187]:
plt.figure(figsize=(12,12));
df[['staff_costs_ratio']].boxplot()
_df = df.sort(columns='staff_costs_ratio', ascending=False)
_df[['staff_costs_ratio', 'name', 'insee_code']].head(n=20)
Out[187]:
In [188]:
print df[['home_tax_rate', 'property_tax_rate', 'staff_costs_ratio', 'debt_ratio']].corr()
plot(df['property_tax_rate'], df['staff_costs_ratio'], 'o')
Out[188]:
In [191]:
df[['name', 'insee_code', 'staff_costs']].ix[df['staff_costs_ratio'].argmax()]
Out[191]:
In [ ]: