In [1]:
import os
import pandas as pd
import pandas_profiling as pd_profiling
import altair as alt
import matplotlib.pyplot as plt
#%matplotlib ipympl
In [2]:
def read_field_type(x):
    """
    Map a data-dictionary type label to the Python type used when reading a column.

    Makes it easier to read the dataframes with the corresponding column types.

    Parameters
    ----------
    x : label taken from the dictionary's type column (e.g. 'String', 'Integer', 'Long').

    Returns
    -------
    type
        ``int`` for 'Integer' and 'Long'; ``str`` for everything else, including 'String'.
    """
    return int if x in ('Integer', 'Long') else str
In [3]:
# Data dictionary for the revenue/spend dataset: a semicolon-separated, Latin-1 encoded CSV.
# Its 'Field name' and 'Field type' columns are used further down to build the dtype mapping.
pub_revenue_spend_names = pd.read_csv(
    os.path.join(
        'data',
        'Anual_revenue and spend_2009-2017 by Programs (DICCIONARY).csv',
    ),
    encoding='latin-1',
    sep=';',
)
In [4]:
# Display the data dictionary loaded from the DICCIONARY CSV.
pub_revenue_spend_names
Out[4]:
In [ ]:
# Displays the data dictionary again; the same expression already appears in an earlier cell.
pub_revenue_spend_names
In [5]:
# Load the revenue/spend table (semicolon-separated, UTF-8), typing each column
# according to the data dictionary: field name -> Python type from read_field_type.
pub_revenue_spend = pd.read_csv(
    os.path.join(
        'data',
        'Anual_revenue and spend_2009-2017 by Programs (millions) v3.csv',
    ),
    sep=';',
    encoding='utf-8',
    dtype={
        field_name: read_field_type(field_type)
        for field_name, field_type in zip(
            pub_revenue_spend_names['Field name'],
            pub_revenue_spend_names['Field type'],
        )
    },
)
In [12]:
# Scratch list of lower-case field labels; the expression is not assigned, so it is only displayed.
# NOTE(review): 'partida' is listed twice — confirm whether the duplicate is intended.
['partida', 'partida', 'capitulo', 'programa', 'subtitulo', 'item', 'asignacion', 'tipo', 'presupuesto', ]
Out[12]:
In [6]:
# Preview the first five rows of the loaded revenue/spend table.
pub_revenue_spend.head(5)
Out[6]:
In [8]:
# Summary statistics for every column; include='all' covers non-numeric columns as well.
pub_revenue_spend.describe(include='all')
Out[8]:
In [7]:
# Distinct values present in the TIPO column.
pub_revenue_spend['TIPO'].unique()
Out[7]:
In [8]:
# Number of distinct SUBTÍTULO values (nunique ignores missing values by default).
pub_revenue_spend['SUBTÍTULO'].nunique()
Out[8]:
In [9]:
# Number of distinct SUBTÍTULO values within each TIPO group.
pub_revenue_spend.groupby(['TIPO'])['SUBTÍTULO'].nunique()
Out[9]:
In [10]:
# For each TIPO value, list the distinct SUBTÍTULO values found in its rows.
for name, group in pub_revenue_spend.groupby('TIPO'):
    # A single print with sep='\n' emits the same text as three consecutive prints.
    print(
        'tipo: {}\n'.format(name),
        '\t{}'.format(group['SUBTÍTULO'].unique()),
        '\n\n',
        sep='\n',
    )
In [11]:
# Rows whose SUBTÍTULO value is missing.
pub_revenue_spend[pub_revenue_spend['SUBTÍTULO'].isnull()]
Out[11]:
In [ ]:
# TODO(review): unfinished cell. The original line `pub_revenue_spend.query)` was a
# SyntaxError (stray closing parenthesis and no filter expression), so it never ran.
# Supply the intended boolean expression before enabling, in the form:
#     pub_revenue_spend.query("<boolean expression over column names>")
In [ ]: