In [9]:
import sys
import os
# Make the project's modules importable from the notebook's working directory
sys.path.append(os.getcwd())
In [3]:
from common.bootstrap import collect_sources
from pandas import DataFrame

# Gather the field mappings of every source into a single DataFrame
mappings = DataFrame()
for source in collect_sources():
    mappings = mappings.append(source.fields_mapping, ignore_index=True)
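As a side note, appending row by row copies the frame on every iteration; building it in one pass with concat is usually faster. A minimal sketch, assuming each fields_mapping is a list of records, as the append above suggests:
In [ ]:
import pandas as pd
# One-pass alternative to the append loop above (sketch, not executed)
mappings_alt = pd.concat(
    [DataFrame(source.fields_mapping) for source in collect_sources()],
    ignore_index=True,
)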
In [11]:
mappings.info()
In [5]:
# Count how many source fields map to each fiscal field
fiscal_fields = mappings.groupby('maps_to').size().sort_values(ascending=False)
fiscal_fields.index.name = 'Fiscal field'
DataFrame(fiscal_fields, columns=['Count'])
Out[5]:
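As a cross-check, value_counts gives the same tally (it skips missing maps_to values, just like the groupby above):
In [ ]:
mappings['maps_to'].value_counts()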
In [7]:
from bokeh.charts import Bar, output_notebook, show
output_notebook()
# Bar chart of how often each fiscal field is mapped
p = Bar(fiscal_fields, legend=None, plot_width=900)
show(p)
In [32]:
empty_mappings = mappings[mappings['maps_to'].isnull()]
print('There are {} fields out of {} that have not been mapped...'.format(len(empty_mappings), len(mappings)))
print('In other words, that\'s {0:.0f}%.'.format(len(empty_mappings) / len(mappings) * 100))
How many of these empty mappings do not have a translation?
In [37]:
nb_without_translation = len(empty_mappings[empty_mappings['translates_to'].isnull()])
print('{} of the unmapped fields are also missing a translation.'.format(nb_without_translation))
After removing those without a translation, the set of fields without a mapping is...
In [39]:
set(empty_mappings['translates_to'][empty_mappings['translates_to'].notnull()].values)
Out[39]:
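To see where these translated-but-unmapped fields come from, a quick sketch grouping them by dataset:
In [ ]:
# Datasets contributing unmapped fields that nevertheless have a translation
translated = empty_mappings[empty_mappings['translates_to'].notnull()]
translated.groupby('pipeline_id').size().sort_values(ascending=False)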
In [59]:
minimum_fields = [
'beneficiary_name',
'project_name',
'total_amount',
'fund_acronym',
'beneficiary_nuts_region',
'program_name'
]
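A quick sanity check that these names actually occur as mapping targets; any typo in the list would silently yield False in the membership tests below:
In [ ]:
# Minimum fields that never appear in 'maps_to' (should ideally be empty)
sorted(set(minimum_fields) - set(mappings['maps_to'].dropna().unique()))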
In [66]:
pipeline_ids = mappings['pipeline_id'].unique()
print('There are currently {} datasets'.format(len(pipeline_ids)))
In [77]:
minimum_requirements = []
minimum_requirements_counter = {}
for pipeline_id in pipeline_ids:
    pipeline = mappings[mappings['pipeline_id'] == pipeline_id]
    dataset = {'pipeline_id': pipeline_id}
    counter = 0
    for field in minimum_fields:
        # Flag whether this dataset maps the required fiscal field
        has_field = field in pipeline['maps_to'].values
        dataset.update({field: has_field})
        counter = counter + 1 if has_field else counter
    minimum_requirements.append(dataset)
    minimum_requirements_counter.update({pipeline_id: counter})
minimum_requirements = DataFrame(minimum_requirements)
minimum_requirements_counter
Out[77]:
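The same counts can be derived without an explicit loop; a minimal vectorised sketch using the columns already in mappings:
In [ ]:
# Number of distinct minimum fields mapped per dataset, loop-free
met = mappings[mappings['maps_to'].isin(minimum_fields)]
met.groupby('pipeline_id')['maps_to'].nunique().reindex(pipeline_ids, fill_value=0)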
Now, how complete is each dataset? In other words, how many fields does each dataset have?
In [52]:
# Total number of fields recorded for each dataset
datasets = mappings.groupby('pipeline_id').size()
In [57]:
datasets.sort_values(ascending=False)
Out[57]:
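The same Bar helper used earlier can put these counts in a chart; a sketch reusing the imports from above:
In [ ]:
p = Bar(datasets.sort_values(ascending=False), legend=None, plot_width=900)
show(p)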