In [9]:
    
import sys
import os

# Make the project root importable so `common.bootstrap` resolves below.
sys.path.append(os.getcwd())
    
In [3]:
    
import pandas as pd
from pandas import DataFrame
from common.bootstrap import collect_sources

# Gather every source's field mapping and build the table in one concat,
# which is cheaper than appending to the frame on each iteration.
mappings = pd.concat(
    [DataFrame(source.fields_mapping) for source in collect_sources()],
    ignore_index=True,
)
    
In [11]:
    
mappings.info()
    
    
In [5]:
    
fiscal_fields = mappings.groupby('maps_to').size().sort_values(ascending=False)
fiscal_fields = fiscal_fields.reindex(fiscal_fields.index.rename('Fiscal field'))
DataFrame(fiscal_fields, columns=['Count'])
    
    Out[5]:
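A shorter equivalent, for what it's worth: value_counts does the group-count-and-sort in one step. A sketch against the same mappings frame:
In [ ]:
    
counts = mappings['maps_to'].value_counts().rename('Count')
counts.index.name = 'Fiscal field'
DataFrame(counts)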
In [7]:
    
from bokeh.charts import Bar, output_notebook, show

output_notebook()
p = Bar(fiscal_fields, legend=None, plot_width=900)
show(p)
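Note that bokeh.charts has since been removed from Bokeh. On a recent release (3.x, where plot_width became width), roughly the same chart can be drawn with figure and vbar; a minimal sketch, assuming fiscal_fields is the Series computed above:
In [ ]:
    
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

output_notebook()

# Categorical x-axis with one bar per fiscal field, tallest first.
fields = list(fiscal_fields.index)
p = figure(x_range=fields, width=900, height=400)
p.vbar(x=fields, top=fiscal_fields.values, width=0.8)
p.xaxis.major_label_orientation = 1.2  # tilt the long field names
show(p)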
    
    
    
    
In [32]:
    
empty_mappings = mappings[mappings['maps_to'].isnull()]
print('There are {} fields out of {} that have not been mapped...'.format(len(empty_mappings), len(mappings)))
print('In other words, that\'s {0:.0f}%.'.format(len(empty_mappings) / len(mappings) * 100))
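The same ratio falls out of isnull().mean() in one line, if you prefer; a sketch over the same frame:
In [ ]:
    
print('{:.0f}% of fields are unmapped.'.format(mappings['maps_to'].isnull().mean() * 100))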
    
    
How many of these empty mappings do not have a translation?
In [37]:
    
nb_without_translation = len(empty_mappings[empty_mappings['translates_to'].isnull()])
print('There are {} unmapped fields that are also missing a translation.'.format(nb_without_translation))
    
    
After removing those without a translation, the remaining unmapped fields (identified by their translations) are...
In [39]:
    
set(empty_mappings['translates_to'][empty_mappings['translates_to'].notnull()].values)
    
    Out[39]:
In [59]:
    
minimum_fields = [
    'beneficary_name', 
    'project_name', 
    'total_amount', 
    'fund_acronym', 
    'beneficiary_nuts_region', 
    'program_name'
]
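One thing worth checking: a typo in this list would silently count as a missing field, since the comparison below is an exact string match. A quick sanity check (a sketch; anything it prints never occurs as a mapping target):
In [ ]:
    
unknown = set(minimum_fields) - set(mappings['maps_to'].dropna().unique())
print('Minimum fields never seen in maps_to: {}'.format(unknown or 'none'))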
    
In [66]:
    
pipeline_ids = mappings['pipeline_id'].unique()
print('There are currently {} datasets'.format(len(pipeline_ids)))
    
    
In [77]:
    
minimum_requirements = []
minimum_requirements_counter = {}

for pipeline_id in pipeline_ids:
    pipeline = mappings[mappings['pipeline_id'] == pipeline_id]
    dataset = {'pipeline_id': pipeline_id}
    counter = 0
    # Flag which minimum fields this dataset maps to, and count the hits.
    for field in minimum_fields:
        has_field = field in pipeline['maps_to'].values
        dataset[field] = has_field
        counter = counter + 1 if has_field else counter
    minimum_requirements.append(dataset)
    minimum_requirements_counter[pipeline_id] = counter

minimum_requirements = DataFrame(minimum_requirements)
minimum_requirements_counter
    
    Out[77]:
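The same counts can be had without the explicit loop; a sketch of an equivalent computation with isin and groupby (pipelines that map none of the minimum fields are filled in with 0):
In [ ]:
    
hits = mappings[mappings['maps_to'].isin(minimum_fields)]
counts = (hits.groupby('pipeline_id')['maps_to']
              .nunique()
              .reindex(pipeline_ids, fill_value=0))
counts.sort_values(ascending=False)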
Now, how complete is each dataset? In other words, how many fields does each dataset have?
In [52]:
    
datasets = mappings.groupby('pipeline_id').size()
    
In [57]:
    
datasets.sort_values(ascending=False)
    
    Out[57]:
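Putting the two measures side by side makes the comparison easier to read; a sketch that joins the raw field count with the minimum-requirements score from above:
In [ ]:
    
from pandas import Series

summary = DataFrame({
    'total_fields': datasets,
    'minimum_fields_met': Series(minimum_requirements_counter),
})
summary.sort_values('minimum_fields_met', ascending=False)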