Guillaume Chaslot guillaume.chaslot@data.gouv.fr
The tax/benefit legislation on individuals in France is more than 200,000 lines long.
We believe that with this tool, you can reduce it to fewer than 100 lines that are transparent and 95% similar to the existing legislation.
For this test, we only take a population of people of all ages, with 0 to 5 children and no salary, who are married, widowed ("veuf"), in a civil union ("pacsé"), divorced or single ("célibataire"), and we simulate their "aides sociales" (social benefits).
Within a few minutes, we got a tax reform for "aides sociales" for people with no salary that is:
In [1]:
from utils import show_histogram, percent_diff, scatter_plot, multi_scatter
from econometrics import gini, draw_ginis
import matplotlib.pyplot as plt
# import bqplot.pyplot as plt
import pandas as pd
import numpy as np
import random
import qgrid
import copy
from reformators import Excalibur
# NOTE(review): presumably (re)installs qgrid's notebook front-end assets so
# that qgrid.show_grid can render below -- confirm against the qgrid version in use.
qgrid.nbinstall(overwrite=True)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
def show_last_reform_results(revdisp, simulated_reform):
    """Display a one-row qgrid summary of the most recent reform.

    Besides its two arguments, this reads the notebook-level globals
    ``cost``, ``error`` and ``pissed`` (unpacked from
    ``reformator.suggest_reform``), so that cell must have run first.

    Args:
        revdisp: current values, passed to ``gini`` for the 'Old Gini' cell.
        simulated_reform: reformed values, passed to ``gini`` for 'New Gini'.
    """
    # A positive cost is reported as a cost, anything else as a (sign-flipped) saving.
    if cost > 0:
        display_cost_name = 'Total Cost in Euros'
        display_cost_value = int(cost)
    else:
        display_cost_name = 'Total Savings in Euros'
        display_cost_value = -int(cost)

    old_gini = gini(revdisp)
    new_gini = gini(simulated_reform)

    # Divide as floats and round: Python 2's integer ``/`` floors, which
    # turned e.g. -5 / 12 into -1 and biased negative averages downward.
    monthly_change = int(round(error / 12.0))

    result_frame = pd.DataFrame({
        display_cost_name: [display_cost_value],
        'Average change / family / month in Euros': [monthly_change],
        'People losing money': str(100 * pissed) + '%',
        'Old Gini': old_gini,
        'New Gini': new_gini,
    })
    result_frame.set_index(display_cost_name, inplace=True)
    qgrid.show_grid(result_frame)
In [2]:
# Build the reform engine, naming 'revdisp' as the target variable and
# 'taxable_variable' as the taxable one.
reformator = Excalibur(
    target_variable='revdisp',
    taxable_variable='taxable_variable',
)
In [3]:
# Load the OpenFisca results stored under the identifier '1aj-1bj-f-2000'.
# NOTE(review): the identifier presumably names the varied boxes (1AJ, 1BJ, F)
# and a sample size -- confirm against the data files.
reformator.load_openfisca_results('1aj-1bj-f-2000')
# Disabled: removing unlikely cases of very old parents caused by our approximate population.
# reformator.filter_only_likely_population()
# Disabled: keeping only people with no revenue for this test.
# reformator.filter_only_no_revenu()
In [4]:
def age_dec_1(family):
    """Return the age computed by ``age_from_box`` for box '0DA' (first declarant)."""
    first_declarant_box = '0DA'
    return age_from_box(family, first_declarant_box)
def age_dec_2(family):
    """Return the age for box '0DB' (second declarant), or None when the box is absent."""
    if '0DB' not in family:
        return None
    return age_from_box(family, '0DB')
def if_married(family):
    """Tell whether the family record contains the 'M' key (presumably the 'married' box)."""
    has_married_box = 'M' in family
    return has_married_box
def if_pacse(family):
    """Tell whether the family record contains the 'O' key (presumably the civil-union box)."""
    has_pacs_box = 'O' in family
    return has_pacs_box
def if_veuf(family):
    """Tell whether the family record contains the 'V' key (presumably the 'widowed' box)."""
    has_widowed_box = 'V' in family
    return has_widowed_box
def if_divorce(family):
    """Tell whether the family record contains the 'D' key (presumably the 'divorced' box)."""
    has_divorced_box = 'D' in family
    return has_divorced_box
def if_two_people(family):
    """Tell whether a second declarant is present, i.e. the record has a '0DB' key."""
    has_second_declarant = '0DB' in family
    return has_second_declarant
def revenu_1(family):
    """Return the value stored in box '1AJ'; raises KeyError when the box is missing."""
    first_income = family['1AJ']
    return first_income
def taxable_variable(family):
    """Return box '1AJ' plus box '1BJ' when '1BJ' is present, else '1AJ' alone.

    Raises KeyError when '1AJ' is missing.
    """
    total = family['1AJ']
    if '1BJ' in family:
        total = total + family['1BJ']
    return total
def revenu_2(family):
    """Return the value stored in box '1BJ', or False when the box is absent.

    NOTE(review): the other optional concepts in this file return None when
    their box is missing; the False here is kept as-is -- confirm it is intended.
    """
    return family['1BJ'] if '1BJ' in family else False
def age_from_box(family, birthdate_box, reference_year=2014):
    """Return the age implied by the birth year stored in ``birthdate_box``.

    Args:
        family: mapping of box names to values.
        birthdate_box: key whose value is a birth year (e.g. '0DA').
        reference_year: year for which the age is computed. Defaults to 2014,
            the value that used to be hard-coded.

    Returns:
        ``reference_year`` minus the stored birth year, or 0 when the box
        is missing from ``family``.
    """
    # A missing box falls back to the reference year itself, i.e. an age of 0.
    return reference_year - family.get(birthdate_box, reference_year)
def if_both_declarant_parent_below_24(family):
    """Tell whether every declarant is under 24 and the family has children.

    True only when the first declarant's age is not >= 24, the second
    declarant (if box '0DB' is present) is not >= 24 either, and box 'F'
    is present with a non-zero value.
    """
    declarant_too_old = age_dec_1(family) >= 24
    codeclarant_too_old = '0DB' in family and age_dec_2(family) >= 24
    childless = 'F' not in family or family['F'] == 0
    if declarant_too_old or codeclarant_too_old or childless:
        return False
    return True
def per_child(family):
    """Return the value of box 'F' (used as the number of children), or None when absent."""
    return family.get('F')
def per_child_parent_isole(family):
    """Return box 'F' for single-declarant families only.

    Gives None when a second declarant is present (box '0DB') or when
    box 'F' is missing.
    """
    return None if '0DB' in family else family.get('F')
def if_parent_isole_moins_20k(family):
    """Tell whether this is a single parent whose box '1AJ' is below 20000.

    Requires box 'F' >= 1, no '0DB' box, and ``family['1AJ'] < 20000``.
    Box '1AJ' is only read once the first two conditions hold.
    """
    has_child = 'F' in family and family['F'] >= 1
    single_declarant = '0DB' not in family
    if not (has_child and single_declarant):
        return False
    return family['1AJ'] < 20000
def if_enfant_unique(family):
    """Tell whether box 'F' is present and equal to 1 (exactly one child)."""
    return family.get('F') == 1
def if_deux_enfants_ou_plus(family):
    """Tell whether box 'F' is present and at least 2 (two children or more)."""
    return family.get('F', 0) >= 2
def per_child_after_2(family):
    """Return the number of children beyond the second one.

    Gives ``family['F'] - 2`` when box 'F' is at least 2 (so 0 for exactly
    two children), and None when 'F' is missing or below 2.
    """
    children = family.get('F', 0)
    return children - 2 if children >= 2 else None
def if_declarant_above_65(family):
    """Tell whether the first declarant (box '0DA') is 65 or older."""
    declarant_age = age_from_box(family, '0DA')
    return declarant_age >= 65
def if_codeclarant_above_65(family):
    """Tell whether a second declarant (box '0DB') exists and is 65 or older.

    Returns False when box '0DB' is absent.
    """
    # Inclusive threshold, matching if_declarant_above_65 and
    # per_declarant_above_65; this check used a strict '>' and therefore
    # disagreed with them for a co-declarant aged exactly 65.
    return '0DB' in family and age_from_box(family, '0DB') >= 65
def per_declarant_above_65(family):
    """Count (0, 1 or 2) the declarants whose age from box '0DB' / '0DA' is at least 65."""
    return sum(int(age_from_box(family, box) >= 65) for box in ('0DB', '0DA'))
def per_declarant_above_24(family):
    """Count (0, 1 or 2) the declarants whose age from box '0DB' / '0DA' is at least 24."""
    return sum(int(age_from_box(family, box) >= 24) for box in ('0DB', '0DA'))
def if_one_declarant_above_24(family):
    """Tell whether at least one declarant (box '0DA' or '0DB') is 24 or older."""
    if age_from_box(family, '0DA') >= 24:
        return True
    return '0DB' in family and age_from_box(family, '0DB') >= 24
def if_one_declarant_above_24_or_has_children(family):
    """Tell whether a declarant is 24 or older, or box 'F' shows at least one child."""
    if if_one_declarant_above_24(family):
        return True
    return 'F' in family and family['F'] >= 1
def if_earns_10k(family):
    """Tell whether ``taxable_variable(family)`` is strictly above 10000."""
    threshold = 10000
    return taxable_variable(family) > threshold
def if_earns_30k(family):
    """Tell whether ``taxable_variable(family)`` is strictly above 30000."""
    threshold = 30000
    return taxable_variable(family) > threshold
def if_earns_40k(family):
    """Tell whether ``taxable_variable(family)`` is strictly above 40000."""
    threshold = 40000
    return taxable_variable(family) > threshold
def base(family):
    """Return True for every family; ``family`` is not inspected."""
    always_applies = True
    return always_applies
# Register every concept under the name of the function that computes it,
# in the same order as before.
for _concept in (
        base,
        age_dec_1,
        age_dec_2,
        taxable_variable,
        revenu_1,
        revenu_2,
        per_child,
        per_child_parent_isole,
        per_declarant_above_24,
        per_declarant_above_65,
        per_child_after_2,
        if_earns_10k,
        if_earns_30k,
        if_earns_40k,
        if_one_declarant_above_24,
        if_one_declarant_above_24_or_has_children,
        if_both_declarant_parent_below_24,
        if_declarant_above_65,
        if_codeclarant_above_65,
        if_enfant_unique,
        if_deux_enfants_ou_plus,
        if_two_people,
        if_parent_isole_moins_20k,
):
    reformator.add_concept(_concept.__name__, _concept)

reformator.summarize_population()
In [5]:
# Current 'revdisp' value of every family in the loaded population.
revdisp = [household['revdisp'] for household in reformator._population]
show_histogram(revdisp, 'Distribution of revenu disponible')
In [6]:
# Concepts handed to suggest_reform as `parameters`.
reform_parameters = [
    'per_child',
    'if_declarant_above_65',
    'if_codeclarant_above_65',
    'if_one_declarant_above_24_or_has_children',
    # 'if_parent_isole_moins_20k',
]

# Concepts handed to suggest_reform as `tax_rate_parameters`.
reform_tax_rate_parameters = [
    'base',
    'if_deux_enfants_ou_plus',
    'if_earns_30k',
    'if_earns_40k',
]

# Disabled option, kept for reference:
# tax_threshold_parameters=[
#     'base',
#     # 'if_enfant_unique',
#     # 'if_deux_enfants_ou_plus',
# ],

# The five results are read as notebook-level globals by later cells.
simulated_reform, error, cost, final_parameters, pissed = reformator.suggest_reform(
    parameters=reform_parameters,
    tax_rate_parameters=reform_tax_rate_parameters,
    max_cost=0,
    min_saving=0)
In [7]:
# Summary table for the reform computed above (also reads the globals cost, error and pissed).
show_last_reform_results(revdisp, simulated_reform)
In [8]:
# Plot the Gini comparison between the current values and the simulated reform.
draw_ginis(revdisp, simulated_reform)
In [9]:
# Parenthesized single-argument form: prints the same text under Python 2
# and, unlike the print statement, is also valid Python 3.
print(repr(final_parameters))
def show_coefficients(final_parameters, current_type):
    """Display, as a qgrid table, the fitted values of one parameter type.

    Args:
        final_parameters: iterable of dicts carrying 'type', 'value' and
            'variable' keys.
        current_type: only entries whose 'type' equals this are shown; it
            also names the value column (``current_type + ' coef'``).
    """
    selected = [param for param in final_parameters if param['type'] == current_type]
    table = pd.DataFrame({
        'Variables': [param['variable'] for param in selected],
        current_type + ' coef': [param['value'] for param in selected],
    })
    table.set_index('Variables', inplace=True)
    qgrid.show_grid(table)
# One table per parameter type; 'tax_threshold' is currently disabled.
for _coefficient_type in ('base_revenu', 'tax_rate'):
    show_coefficients(final_parameters, _coefficient_type)
# show_coefficients(final_parameters, 'tax_threshold')
In [10]:
# Work on a deep copy so the reformator's own population is left untouched,
# then attach each family's simulated value under the 'reform' key.
new_pop = copy.deepcopy(reformator._population)
for i, household in enumerate(new_pop):
    household['reform'] = simulated_reform[i]
def plot_for_population(pop):
    """Scatter-plot reformed and original 'revdisp' against 'taxable_variable'.

    Args:
        pop: families (dicts) carrying the 'taxable_variable', 'revdisp'
            and 'reform' keys.
    """
    def column(name):
        # Values of one key across the whole population, in population order.
        return [household[name] for household in pop]

    taxable = column('taxable_variable')
    original = column('revdisp')
    reformed = column('reform')
    series = [
        {'x': taxable, 'y': reformed, 'label': 'Reform', 'color': 'blue'},
        {'x': taxable, 'y': original, 'label': 'Original', 'color': 'red'},
    ]
    multi_scatter('Revenu disponible for different people before / after reforme',
                  'Revenu initial', 'Revenu disponible', series)
In [11]:
# NOTE(review): the variable name says "one child" but the filter keeps
# families with per_child == 3 -- confirm which one is intended.
# Both declarants must also be under 65 (a missing age counts as 0).
un_enfant_pop = [
    household for household in new_pop
    if household.get('per_child', 0) == 3
    and household.get('age_dec_1', 0) < 65
    and household.get('age_dec_2', 0) < 65
]
plot_for_population(un_enfant_pop)
In [14]:
# Show the summary table again (same inputs as the earlier call; reads the globals cost, error and pissed).
show_last_reform_results(revdisp, simulated_reform)
In [15]:
# simulated_reform_tree, error_tree, cost_tree, final_parameters_tree = reformator.suggest_reform_tree(
# parameters=[
# 'per_child',
# 'if_one_declarant_above_24_or_has_children',
# 'age_dec_1',
# 'age_dec_2'
# ],
# max_cost=0,
# min_saving=0,
# image_file='./enfants_age',
# max_depth=5,
# min_samples_leaf=20
# )
In [16]:
# show_last_reform_results()
# from IPython.display import Image
# Image(filename='./enfants_age.png')
In [17]:
# Overlay the current and reformed distributions on a shared set of bins.
xmin = 0
xmax = 60000
nb_buckets = 35
# np.linspace returns the bin *edges*: nb_buckets buckets need
# nb_buckets + 1 edges (passing nb_buckets produced one bucket too few).
bins = np.linspace(xmin, xmax, nb_buckets + 1)
plt.hist(revdisp, bins, alpha=0.5, label='current')
plt.hist(simulated_reform, bins, alpha=0.5, label='reform')
plt.legend(loc='upper right')
plt.show()
In [18]:
# Per-family change: simulated reform minus current value, in population order.
difference = [reformed - revdisp[i] for i, reformed in enumerate(simulated_reform)]
show_histogram(difference, 'Changes in revenu')
In [19]:
# Per-family change as 100 * percent_diff(reformed, current).
percentage_difference = [
    100 * percent_diff(reformed, revdisp[i])
    for i, reformed in enumerate(simulated_reform)
]
# The title now states the unit: it used to be identical to the title of the
# absolute-difference histogram, which made the two plots indistinguishable.
show_histogram(percentage_difference, 'Changes in revenu (%)')
In [20]:
# Reform impact against the 'per_child' value (0 when a family has none).
nb_children = [household.get('per_child', 0) for household in reformator._population]
scatter_plot(nb_children, difference, 'Children', 'Difference reform - current', alpha=0.05)
In [21]:
# Reform impact against the 'age_dec_1' value (0 when a family has none).
age_dec1 = [household.get('age_dec_1', 0) for household in reformator._population]
scatter_plot(age_dec1, difference, 'Age declarant 1', 'Difference reform - current', alpha=0.1)
In [22]:
# Per-family table of raw inputs, sorted from the largest gain to the largest
# loss under the reform. reverse=True keeps the sort stable, so ties stay in
# population order exactly as with a negated key.
order = sorted(range(len(simulated_reform)),
               key=lambda k: simulated_reform[k] - revdisp[k],
               reverse=True)

# Union of the raw input keys over all families; 'year' is left out.
possible_keys = set()
for i in order:
    possible_keys.update(key for key in reformator._raw_population[i] if key != 'year')

# The 'difference' column always exists, so set_index below also works on an
# empty population.
data = {'difference': [int(simulated_reform[i] - revdisp[i]) for i in order]}

for key in possible_keys:
    column = []
    for i in order:
        # Families lacking this key show an empty cell.
        value = reformator._raw_population[i].get(key, '')
        # isinstance also catches float subclasses such as numpy.float64, which
        # an exact type comparison missed; NaN/inf are left untouched because
        # int() cannot convert them.
        if isinstance(value, float) and np.isfinite(value):
            value = int(value)
        column.append(value)
    data[key] = column

# Reformed value of each family, in the same order as the other columns.
data['reform'] = [int(simulated_reform[i]) for i in order]

df = pd.DataFrame(data=data)
df.set_index('difference', inplace=True)
qgrid.show_grid(df)
In [23]:
# res, error, cost, final_parameters = reformator.suggest_reform(parameters=[
# 'if_one_declarant_above_24',
# 'if_declarant_above_64',
# 'if_codeclarant_above_64',
# 'if_both_declarant_parent_below_24',
# 'if_two_people',
# 'if_enfant_unique',
# 'if_deux_enfants_ou_plus',
# 'nb_enfants_after_2',
# ])
In [24]:
# coefficients = list(map(lambda x: x['value'], final_parameters)); variables = list(map(lambda x: x['variable'], final_parameters))
# result_frame = pd.DataFrame({'Variables': variables, 'Coefficient': coefficients})
# result_frame.set_index('Variables', inplace=True)
# qgrid.show_grid(result_frame)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: