Guillaume Chaslot guillaume.chaslot@data.gouv.fr
There are several simulators for the same tax (e.g., irpp), so it is important to compare them in order to detect bugs. This notebook proposes a visualization of these comparisons.
In [1]:
from compare_simulators import CalculatorComparator
from population_simulator import CerfaPopulationSimulator
from utils import multi_scatter
import matplotlib.pyplot as plt
import pandas as pd
import qgrid
import copy
%matplotlib inline
In [2]:
# Create the comparator and load existing results
comp = CalculatorComparator()
# Load comparisons that were already run
comp.load_results_from_json('1aj-1bj-f-2000')
# If you want to run tests comparing OpenFisca / M / Online, run:
# 'python simulate_comparators', whose arguments are the following:
# --tests   The number of income declarations ("declarations de revenus") to simulate
# --ir      Only compute the "impot sur le revenu"
# --save    Save the results to the given json file
# --load    Load test results from the given json file
# --linear  Generate the population from a linear distribution instead of a
#           gaussian one.
# If you want to change the simulated population, you can do so in CerfaPopulationSimulator
In [3]:
# Combined salary of each test case: declarant 1 (1AJ) plus declarant 2 (1BJ)
salaire1 = comp.get_variable_from_testcases('1AJ')
salaire2 = comp.get_variable_from_testcases('1BJ')
salaire = [salaire1[i] + salaire2[i] for i in range(len(salaire1))]

# OpenFisca outputs: disposable income and income tax
revdisp = comp.get_variable_from_openfisca('revdisp')
impot_openfisca = comp.get_variable_from_openfisca('irpp')

# Income tax according to the two other calculators
impot_m = comp.get_variable_from_m('IRN')
impot_online = comp.get_variable_from_online('IINET')
In [4]:
# One scatter series per calculator, all plotted against the combined salary
multi_scatter(
    'Tax for different people / calculators',
    'Revenu',
    u'Impôt sur le revenu',
    [
        dict(x=salaire, y=impot_openfisca, label='Openfisca', color='red'),
        dict(x=salaire, y=impot_m, label='M', color='green'),
        dict(x=salaire, y=impot_online, label='Online', color='yellow'),
    ])
In [5]:
# TOOL 1 to find discrepancies: differences
# Signed gap, per test case, between each calculator and the online simulator.
diff_openfisca = [tax_i - online_i for tax_i, online_i in zip(impot_openfisca, impot_online)]
diff_m = [tax_i - online_i for tax_i, online_i in zip(impot_m, impot_online)]

# The plot below shows the absolute value of those gaps:
#   x       = combined salary (1AJ + 1BJ)
#   y_red   = |openfisca - online|
#   y_green = |m - online|

####################################
# Filtering dimensions of interest #
####################################
# Optional filter (disabled): zero out the test cases that should be ignored.
# for i in range(0, len(salaire1)):
#     # Removing veufs and children
#     if ('V' in comp.testcases[i]):
#     # or ('F' in comp.testcases[i] and comp.testcases[i]['F'] > 0)):
#     # if ('F' in comp.testcases[i] and comp.testcases[i]['F'] != 2):
#         diff_m[i] = 0
#         # diff_openfisca[i] = -1000

multi_scatter(
    'Difference with online simulator',
    'Revenu',
    u'Impôt sur le revenu',
    [
        dict(x=salaire, y=[abs(gap) for gap in diff_openfisca], label='OpenFisca', color='red'),
        dict(x=salaire, y=[abs(gap) for gap in diff_m], label='M', color='green'),
    ])
Out[5]:
In [6]:
# Computing the probable outlier:
# if two calculators agree and one differs, the one that differs is probably the buggy one
def commun(a, b, c):
    """Return the value among ``a``, ``b`` and ``c`` that is closest to another one.

    For each value, the distance to its nearest neighbour among the two others
    is computed; the value with the smallest such distance is returned. When
    two of the three values are equal, one of them is therefore returned.
    Ties are resolved in the order ``a``, ``b``, ``c``.

    Args:
        a, b, c: numbers that support subtraction and ``abs``.

    Returns:
        One of ``a``, ``b`` or ``c``.

    Raises:
        AssertionError: if none of the three comparisons below holds, which
            happens when the distances are not comparable (e.g. a NaN input).
    """
    gap_ab = abs(a - b)
    gap_ac = abs(a - c)
    gap_bc = abs(c - b)

    # Distance from each value to its nearest neighbour
    nearest_a = min(gap_ab, gap_ac)
    nearest_b = min(gap_ab, gap_bc)
    nearest_c = min(gap_bc, gap_ac)

    if nearest_a <= nearest_b and nearest_a <= nearest_c:
        return a
    if nearest_b <= nearest_a and nearest_b <= nearest_c:
        return b
    if nearest_c <= nearest_a and nearest_c <= nearest_b:
        return c

    # We do not handle yet the hypothetical case of a circular reference.
    # Raise explicitly rather than `assert False`: assert statements are removed
    # under `python -O`, which would make this function silently return None.
    raise AssertionError('Circular condition, please define behavior here')
# Consensus tax for every test case: the value on which two calculators (nearly) agree
probable_true_tax = [
    commun(impot_m[i], impot_online[i], impot_openfisca[i])
    for i in range(len(impot_m))
]
In [7]:
# Scatter of the consensus tax against salary
#   x = salaire
#   y = probable_true_tax
plt.scatter(salaire, probable_true_tax, c=len(probable_true_tax) * ['red'])
Out[7]:
In [8]:
# Absolute gap between OpenFisca and the consensus tax, per test case
error_openfisca = [
    abs(tax_i - ref_i)
    for tax_i, ref_i in zip(impot_openfisca, probable_true_tax)
]
plt.title('Errors of Openfisca')
plt.scatter(salaire, error_openfisca)
Out[8]:
In [9]:
# Absolute gap between the M calculator and the consensus tax, per test case
error_m = [
    abs(tax_i - ref_i)
    for tax_i, ref_i in zip(impot_m, probable_true_tax)
]
plt.title('Errors of M')
plt.scatter(salaire, error_m, c=len(impot_m) * ['green'])
Out[9]:
In [10]:
# Absolute gap between the online calculator and the consensus tax, per test case
error_online = [
    abs(tax_i - ref_i)
    for tax_i, ref_i in zip(impot_online, probable_true_tax)
]
plt.title('Possible errors for the online calculator')
plt.scatter(salaire, error_online, c=len(impot_online) * ['yellow'])
Out[10]:
In [11]:
# TOOL 3 to find discrepancies: showing worst testcases
def get_highest_discrepencies(my_list):
    """Return the test cases ordered from the largest to the smallest error.

    Reads the notebook-level ``comp``, ``impot_openfisca``, ``impot_m`` and
    ``impot_online``; ``my_list`` must be aligned with them (same indices).

    Args:
        my_list: one error value per test case (e.g. ``error_openfisca``).

    Returns:
        A list with one dict per entry of ``my_list``, sorted by decreasing
        error. Each dict is a deep copy of the matching ``comp.testcases``
        entry, extended with the keys ``'Diff openfisca: '``, ``'Diff m: '``,
        ``'Impot m: '``, ``'Impot openfisca: '`` and ``'Impot online: '``.
    """
    # Indices of the test cases, worst error first
    index_errors = sorted(range(len(my_list)), key=lambda k: my_list[k], reverse=True)

    display_data = []
    # Iterate over the ranking itself, so the result always has exactly
    # len(my_list) rows, independently of the length of any other list.
    for idx in index_errors:
        # Deep copy so that the stored test case is not polluted by display keys
        new_datum = copy.deepcopy(comp.testcases[idx])
        new_datum['Diff openfisca: '] = abs(int(impot_openfisca[idx]) - int(impot_online[idx]))
        new_datum['Diff m: '] = abs(int(impot_m[idx]) - int(impot_online[idx]))
        new_datum['Impot m: '] = int(impot_m[idx])
        new_datum['Impot openfisca: '] = int(impot_openfisca[idx])
        new_datum['Impot online: '] = int(impot_online[idx])
        display_data.append(new_datum)
    return display_data
In [12]:
# Test cases ranked by OpenFisca error, shown in an interactive grid
worst_testcases = get_highest_discrepencies(error_openfisca)
df = pd.DataFrame(data=worst_testcases).set_index('Diff openfisca: ')
qgrid.show_grid(df)
In [13]:
# Test cases ranked by M error, shown in an interactive grid
worst_testcases = get_highest_discrepencies(error_m)
df = pd.DataFrame(data=worst_testcases).set_index('Diff m: ')
qgrid.show_grid(df)
In [14]:
# Consensus tax as a percentage of salary, per test case.
# The 0.01 added to the salary avoids dividing by zero when the salary is 0.
probable_true_marginal_rate = [
    100 * probable_true_tax[i] / (salaire[i] + 0.01)
    for i in range(len(salaire))
]
plt.title('True tax paid / each person of the population')
plt.scatter(salaire, probable_true_marginal_rate, c=len(impot_online) * ['yellow'])
Out[14]:
In [15]:
# TOOL 4 to find discrepancies: weighting input variables by error
# For instance, we could find that people born in 1936 with a salary of 19k have a discrepancy in M
#
# For every input variable, compare its plain average over the population with its
# average weighted by the OpenFisca error: a large difference hints that the
# variable is correlated with the discrepancies.
combined_total = {}      # per variable: sum of value * error, then error-weighted average
combined_total_ref = {}  # per variable: sum of value, then plain average
combined_diff = {}       # per variable: weighted average - plain average
combined_weights = 0     # sum of the errors used as weights

for i in range(len(salaire)):
    for var in comp.testcases[i]:
        combined_total[var] = combined_total.get(var, 0) + comp.testcases[i][var] * error_openfisca[i]
        combined_total_ref[var] = combined_total_ref.get(var, 0) + comp.testcases[i][var]
    combined_weights += error_openfisca[i]

if combined_weights == 0:
    # No test case, or no error at all: an error-weighted average is undefined
    # (the division below would raise ZeroDivisionError).
    print('No OpenFisca discrepancy to weight by')
else:
    for var in combined_total:
        # float() forces a true division: under Python 2, int / int truncates,
        # which silently rounded the averages down.
        combined_total[var] = combined_total[var] / float(combined_weights)
        combined_total_ref[var] = combined_total_ref[var] / float(len(salaire))
        combined_diff[var] = combined_total[var] - combined_total_ref[var]
    print('Reference average: ' + repr(combined_total_ref))
    print('Weighted by bug average: ' + repr(combined_total))
    print('Difference: ' + repr(combined_diff))
In [16]:
# Disposable income (OpenFisca 'revdisp') against salary
plt.scatter(salaire, revdisp, c=len(impot_online) * ['yellow'])
Out[16]:
In [17]:
# Print the test cases whose disposable income exceeds the salary by more than 18000
# and that have both a 'C' and an 'F' entry, with 'F' below 4.
for i in range(len(salaire)):
    if not (revdisp[i] > salaire[i] + 18000):
        continue
    if 'C' not in comp.testcases[i] or 'F' not in comp.testcases[i]:
        continue
    if comp.testcases[i]['F'] < 4:
        print(revdisp[i])
        print(comp.testcases[i])
In [18]:
# Government help: disposable income minus salary, restricted to the test cases
# without an 'F' entry (presumably households without children - TODO confirm).
balance_x = []
balance_y = []
for i in range(len(salaire)):
    if 'F' not in comp.testcases[i]:
        balance_x.append(salaire[i])
        balance_y.append(revdisp[i] - salaire[i])

# A single colour name applies to every point. A per-point colour list must have
# exactly as many entries as points, and only a filtered subset is plotted here.
plt.scatter(balance_x, balance_y, c='magenta')
Out[18]:
In [19]:
# Run a single hand-written test case through one calculator.
# The comparator instance of this notebook is named `comp`.
# NOTE(review): simulate_m / simulate_of / simulate_online are assumed to be
# methods of CalculatorComparator - confirm against compare_simulators.
# comp.simulate_m([{u'C': 1, u'1AJ': 0, u'0DA': 1992, u'year': 2014}], {})
test = comp.simulate_of([{u'F': 2, u'M': 1, u'year': 2014, u'1BJ': 0, u'0DB': 1950, u'0DA': 1981, u'1AJ': 0}])
# comp.simulate_online([{u'C': 1, u'1AJ': 0, u'0DA': 1992, u'year': 2014}],{})
In [ ]:
In [ ]:
In [ ]: