Datathon Mayo 2018 "Haciendo el gasto fiscal visible para el público"

Gasto Neto Mensualizado por año de las Instituciones Públicas

Lectura de Datos



In [1]:

    
import os
import sys
import numpy as np
import pandas as pd
import datetime as dt
# import pandas_profiling as pd_profiling
import altair as alt

alt.data_transformers.enable('json')
# alt.data_transformers.enable('default', max_rows=10000)









    Out[1]:





DataTransformerRegistry.enable('json')



In [2]:

    
def read_field_type(x):
    '''
    Para facilitar la lectura de los dataframes con los tipos de columna correspondientes.
    '''
    if x in ['String']:
        return str
    elif x in ['Integer', 'Long']:
        return int
    else:
        return str



In [3]:

    
pub_spend_names = pd.read_csv(os.path.join('data', 
                              'dataset_1_monthly_spend_2009-2017_DICTIONARY.csv'), 
                              sep=';', 
                              encoding='utf-8')



In [4]:

    
pub_spend_names









    Out[4]:







  
    
      
      Field name
      Field type
      Field size
      Field format
      Range
      Description
      Example
    
  
  
    
      0
      Periodo
      Integer
      15
      #
      2009 to 2017
      Year of the spend
      2009
    
    
      1
      Nombre Partida
      String
      63
      NaN
      28 categories
      Ministeries
      Ministerio de Educación
    
    
      2
      Nombre Capitulo
      String
      78
      NaN
      219 categories
      Public Services
      Fondo Nacional de Salud
    
    
      3
      Nombre Programa
      String
      78
      NaN
      363 categories
      Programs
      Fondo Nacional de Salud
    
    
      4
      Nombre Subtitulo
      String
      37
      NaN
      10 categories
      First level of the budget classification
      TRANSFERENCIAS CORRIENTES
    
    
      5
      Nombre Item
      String
      54
      NaN
      43 categories
      Second level of the budget classification
      Al Sector Privado
    
    
      6
      Nombre Asignación
      String
      250
      NaN
      4372 categories
      Third level of the budget classification
      Subsidios Fondo Solidario de Vivienda
    
    
      7
      Nombre SubAsignación
      String
      80
      NaN
      1355 categories
      Last level of the budget classification
      Asistencia Técnica
    
    
      8
      Mes
      String
      10
      NaN
      12 categories
      Month
      enero
    
    
      9
      Monto
      Integer
      15
      #
      (-49702217) to 357607483
      Amount, this data was update by the inflation ...
      357607483



In [5]:

    
pub_spend_names.iloc[9, 5]









    Out[5]:





"Amount, this data was update by the inflation factors, and it's expressed in thousands of pesos"



In [6]:

    
pub_spend = pd.read_csv(os.path.join('data', 
                              'dataset_1_monthly_spend_2009-2017.csv'),
                 sep=';', 
                 encoding='utf-8',
                 dtype=dict(zip(pub_spend_names['Field name'], pub_spend_names['Field type'].apply(read_field_type))))



In [7]:

    
pub_spend.head(5)









    Out[7]:







  
    
      
      Periodo
      Nombre Partida
      Nombre Capitulo
      Nombre Programa
      Nombre Subtitulo
      Nombre Item
      Nombre Asignación
      Nombre SubAsignación
      Mes
      Monto_sum
    
  
  
    
      0
      2009
      Congreso Nacional
      Biblioteca Del Congreso
      Biblioteca Del Congreso
      INICIATIVAS DE INVERSIÓN
      Proyectos
      Gastos Administrativos
      NaN
      Abril
      0
    
    
      1
      2009
      Congreso Nacional
      Biblioteca Del Congreso
      Biblioteca Del Congreso
      INICIATIVAS DE INVERSIÓN
      Proyectos
      Gastos Administrativos
      NaN
      Agosto
      0
    
    
      2
      2009
      Congreso Nacional
      Biblioteca Del Congreso
      Biblioteca Del Congreso
      INICIATIVAS DE INVERSIÓN
      Proyectos
      Gastos Administrativos
      NaN
      Diciembre
      0
    
    
      3
      2009
      Congreso Nacional
      Biblioteca Del Congreso
      Biblioteca Del Congreso
      INICIATIVAS DE INVERSIÓN
      Proyectos
      Gastos Administrativos
      NaN
      Enero
      0
    
    
      4
      2009
      Congreso Nacional
      Biblioteca Del Congreso
      Biblioteca Del Congreso
      INICIATIVAS DE INVERSIÓN
      Proyectos
      Gastos Administrativos
      NaN
      Febrero
      0



In [8]:

    
pub_spend.columns = ['periodo', 'partida', 'capitulo', 'programa', 'subtitulo', 'item', 'asignacion', 'subasignacion', 'mes', 'gasto']



In [9]:

    
pub_spend.shape









    Out[9]:





(3110448, 10)



In [10]:

    
pub_spend = pub_spend[pub_spend['gasto'] != 0]



In [11]:

    
pub_spend.shape









    Out[11]:





(2107310, 10)



In [12]:

    
def mbSize(obj):
    return round(sys.getsizeof(obj) / 1024.0 / 1024.0, 2)

mbSize(pub_spend)









    Out[12]:





1425.51

Data Transformation

Gasto anual



In [13]:

    
es_month_dict = {'enero': 1,
                 'febrero': 2,
                 'marzo': 3, 
                 'abril': 4,
                 'mayo': 5,
                 'junio': 6,
                 'julio': 7,
                 'agosto': 8,
                 'septiembre': 9,
                 'octubre': 10,
                 'noviembre': 11,
                 'diciembre': 12
                }

pub_spend['month'] = pub_spend['mes'].str.lower().map(es_month_dict)
pub_spend['day'] = 15
pub_spend['date'] = pd.to_datetime(pub_spend[['day', 'month', 'periodo']].rename(columns={'periodo': 'year'}))
pub_spend.drop(['periodo', 'mes', 'month', 'day'], axis=1, inplace=True)
pub_spend['gasto'] = pub_spend['gasto'] / 1000.0

Aggregation data and JSON creation



In [14]:

    
ps_ministry = pub_spend.groupby(['date', 'partida'])['gasto'].sum().reset_index()
ps_ministry_dict = alt.to_json(ps_ministry, prefix='ps_ministry')
print('Memoria utilizada: {} mb'.format(mbSize(ps_ministry)))









    



Memoria utilizada: 0.54 mb



In [15]:

    
ps_chapter = pub_spend.groupby(['date', 'capitulo'])['gasto'].sum().reset_index()
ps_chapter_dict = alt.to_json(ps_chapter, prefix='ps_chapter')
print('Memoria utilizada: {} mb'.format(mbSize(ps_chapter)))









    



Memoria utilizada: 4.8 mb



In [16]:

    
ps_ministry_chapter = pub_spend.groupby(['date', 'partida', 'capitulo'])['gasto'].sum().reset_index()
ps_ministry_chapter_dict = alt.to_json(ps_ministry_chapter, prefix='ps_ministry_chapter')
print('Memoria utilizada: {} mb'.format(mbSize(ps_ministry_chapter)))









    



Memoria utilizada: 8.42 mb



In [17]:

    
ps_ministry_chapter_subtitle = pub_spend.groupby(['date', 'partida', 'capitulo', 'subtitulo'])['gasto'].sum().reset_index()
ps_ministry_chapter_subtitle_dict = alt.to_json(ps_ministry_chapter_subtitle, prefix='ps_ministry_chapter_subtitle')
print('Memoria utilizada: {} mb'.format(mbSize(ps_ministry_chapter_subtitle)))









    



Memoria utilizada: 53.25 mb



In [18]:

    
# ps_ministry_chapter_subtitle_item = pub_spend.groupby(['date', 'partida', 'capitulo', 'subtitulo', 'item'])['gasto'].sum().reset_index()
# ps_ministry_chapter_subtitle_item_dict = alt.to_json(ps_ministry_chapter_subtitle_item, prefix='ps_ministry_chapter_subtitle_item')
# print('Memoria utilizada: {} mb'.format(mbSize(ps_ministry_chapter_subtitle_item)))

Gasto Anual



In [19]:

    
pub_spend_base = alt.Chart().encode(
    x=alt.X('date:T', 
        timeUnit='year',
        axis=alt.Axis(title='Año'),
           ),
    y=alt.Y('sum(gasto):Q', 
            axis=alt.Axis(format="($,.2r", 
                          title='Millones de Pesos Chilenos')),
    tooltip=alt.Tooltip('sum(gasto):Q', 
                        format="($,.2r")
)

pub_spend_chart = alt.layer(
    pub_spend_base.mark_point(),
    pub_spend_base.mark_line(),
#     pub_spend_base.mark_text(dy=-10, dx=-5).encode(text=alt.Text('sum(gasto):Q', format="$.2s")), 
    data=ps_ministry_dict['url']
).properties(
    title='Gasto Anual',
    width=600, 
    height=400 
)



In [20]:

    
pub_spend_chart.save(os.path.join('charts', 'gastoAnual.json'))
pub_spend_chart









    Out[20]:

Gasto Año-Mes



In [21]:

    
pub_spend_monthly_chart = alt.layer(
    pub_spend_base.mark_point().encode(
        x=alt.X('date:T', 
            timeUnit='yearmonth',
            axis=alt.Axis(title='Mes-Año')
            )
        ),
    pub_spend_base.mark_line().encode(
        x=alt.X('date:T', 
            timeUnit='yearmonth',
            axis=alt.Axis(title='Mes-Año')
            )
        ),
#     pub_spend_base.mark_text(dy=-10, dx=-5).encode(text=alt.Text('sum(gasto):Q', format="$.2s")), 
    data=ps_ministry_dict['url']
).properties(
    title='Gasto Mensual',
    width=600, 
    height=400 
)



In [22]:

    
pub_spend_monthly_chart.save(os.path.join('charts', 'gastoMensual.json'))
pub_spend_monthly_chart









    Out[22]:

Gasto Promedio Anual por Partida



In [23]:

    
from altair import datum



In [24]:

    
gasto_promedio_anual_partida = ps_ministry_chapter.groupby(['partida', pd.Grouper(key='date', freq='1Y')])['gasto'].sum().groupby(level=0).mean().sort_values(ascending=False).reset_index()
gasto_promedio_anual_partida_dict = alt.to_json(gasto_promedio_anual_partida, prefix='gasto_promedio_anual_partida')
print('Memoria utilizada: {} mb'.format(mbSize(gasto_promedio_anual_partida)))









    



Memoria utilizada: 0.01 mb



In [25]:

    
ministry_mean_pub_spend = alt.Chart(gasto_promedio_anual_partida_dict['url']).mark_bar().encode(
    x=alt.X('sum(gasto):Q', 
            axis=alt.Axis(format="($,.2r", 
                          title='Millones de Pesos Chilenos')
           ),
    y=alt.Y('partida:N', 
            axis=alt.Axis(title='Partida')
           ),
    tooltip=alt.Tooltip('sum(gasto):Q', 
                        format="($,.2r"
                       )
).properties(
    title='Gasto Promedio Anual por Partida',
    width=700, 
    height=700 
)



In [26]:

    
ministry_mean_pub_spend.save(os.path.join('charts', 'gastoPromedioAnualPartida.json'))
ministry_mean_pub_spend









    Out[26]:

Gasto Anual por Partida



In [27]:

    
# Base mark and encoding
pub_spend_ministry_base_layer = alt.Chart().encode(
    x=alt.X('date:T', 
        timeUnit='year',
        axis=alt.Axis(title='Año')
           ),
    y=alt.Y('sum(gasto):Q', 
            axis=alt.Axis(format="($,.2r", 
                          title='Millones de Pesos Chilenos')
           ),
    tooltip='partida:N'
)

# Highlight
pub_spend_ministry_highlight = alt.selection(type='single', on='mouseover',
                          fields=['partida'], nearest=True)

# Layered chart
pub_spend_ministry_layer = alt.layer(
    pub_spend_ministry_base_layer.mark_line(strokeWidth=2.5).encode(
        color=alt.Color('partida:N', 
                        scale=None, 
                        title=None
                       ),
        size=alt.condition(~pub_spend_ministry_highlight, 
                           alt.value(1), 
                           alt.value(3)
                          ),
    ),
    pub_spend_ministry_base_layer.mark_line(strokeWidth=2.5).encode(
        color=alt.Color('partida:N', 
                        title='Partida'
                       ),
        size=alt.condition(~pub_spend_ministry_highlight, 
                           alt.value(1), 
                           alt.value(3)
                          ),
    ).transform_filter(
        (datum.partida == gasto_promedio_anual_partida.iloc[0, 0]) | \
        (datum.partida == gasto_promedio_anual_partida.iloc[1, 0]) | \
        (datum.partida == gasto_promedio_anual_partida.iloc[2, 0])
    ),
    pub_spend_ministry_base_layer.mark_circle().encode(
        opacity=alt.value(0)
    ).properties(
        selection=pub_spend_ministry_highlight,
    ),
    data=ps_ministry_dict['url']
).properties(
    title='Gasto Anual por Partida',
    width=600, 
    height=400 
).encode(tooltip='partida:N')



In [28]:

    
pub_spend_ministry_layer.save(os.path.join('charts', 'gastoAnualPorPartida.json'))
pub_spend_ministry_layer









    Out[28]:

Gasto Anual por Capitulo



In [29]:

    
gasto_promedio_anual_partida_capitulo = ps_ministry_chapter.groupby(['capitulo', pd.Grouper(key='date', freq='1Y')])['gasto'].sum().groupby(level=0).mean().sort_values(ascending=False).reset_index()



In [30]:

    
# Base mark and encoding
pub_spend_ministry_chapter_base_layer = alt.Chart().encode(
    x=alt.X('date:T', 
        timeUnit='year',
        axis=alt.Axis(title='Año')
           ),
    y=alt.Y('sum(gasto):Q', 
            axis=alt.Axis(format="($,.2r", 
                          title='Millones de Pesos Chilenos')
           ),
    tooltip='capitulo:N'
)

# Highlight
pub_spend_ministry_chapter_highlight = alt.selection(type='single', on='mouseover',
                          fields=['capitulo'], nearest=True)

# Layered chart
pub_spend_ministry_chapter_layer = alt.layer(
    pub_spend_ministry_chapter_base_layer.mark_line(strokeWidth=2.5).encode(
        color=alt.Color('capitulo:N', 
                        scale=None, 
                        title=None
                       ),
        size=alt.condition(~pub_spend_ministry_chapter_highlight, 
                           alt.value(1), 
                           alt.value(3)
                          ),
    ),
    pub_spend_ministry_chapter_base_layer.mark_line(strokeWidth=2.5).encode(
        color=alt.Color('capitulo:N', 
                        title='Capítulo'
                       ),
        size=alt.condition(~pub_spend_ministry_chapter_highlight, 
                           alt.value(1), 
                           alt.value(3)
                          ),
    ).transform_filter(
        (datum.capitulo == gasto_promedio_anual_partida_capitulo.iloc[0, 0]) | \
        (datum.capitulo == gasto_promedio_anual_partida_capitulo.iloc[1, 0]) | \
        (datum.capitulo == gasto_promedio_anual_partida_capitulo.iloc[2, 0]) | \
        (datum.capitulo == gasto_promedio_anual_partida_capitulo.iloc[3, 0])
    ),
    pub_spend_ministry_chapter_base_layer.mark_circle().encode(
        opacity=alt.value(0)
    ).properties(
        selection=pub_spend_ministry_chapter_highlight,
    ),
    data=ps_ministry_chapter_dict['url']
).properties(
    title='Gasto Anual por Capítulo',
    width=600, 
    height=400 
).encode(tooltip='capitulo:N')



In [31]:

    
pub_spend_ministry_chapter_layer.save(os.path.join('charts', 'gastoAnualCapitulo.json'))
pub_spend_ministry_chapter_layer









    Out[31]:

Por subtítulo



In [32]:

    
pub_spend_agg2_how = pub_spend.groupby(['partida', 'subtitulo', 'item', 'date'])['gasto'].sum().reset_index()



In [33]:

    
pub_spend_agg2_how_dict = alt.to_json(pub_spend_agg2_how, prefix='pub_spend_agg2_how')



In [34]:

    
ps_ministry_chapter_subtitle_dict









    Out[34]:





{'url': 'ps_ministry_chapter_subtitle-4c578cec-4681-4e80-bc79-134927cb693f.json',
 'format': {'type': 'json'}}



In [35]:

    
# Base mark and encoding
pub_spend_subtitle_base_layer = alt.Chart().encode(
    x=alt.X('date:T', 
        timeUnit='year',
        axis=alt.Axis(title='Año')
           ),
    y=alt.Y('sum(gasto):Q', 
            axis=alt.Axis(format="($,.2r", 
                          title='Millones de Pesos Chilenos')
           ),
    tooltip='subtitulo:N'
)

# Highlight
pub_spend_subtitle_highlight = alt.selection(type='single', on='mouseover',
                          fields=['subtitulo'], nearest=True)

# Layered chart
pub_spend_subtitle_layer = alt.layer(
    pub_spend_subtitle_base_layer.mark_line(strokeWidth=2.5).encode(
        color=alt.Color('subtitulo:N', 
                        title='Subtítulo'
                       ),
        size=alt.condition(~pub_spend_subtitle_highlight, 
                           alt.value(1), 
                           alt.value(3)
                          )
    ),
    pub_spend_subtitle_base_layer.mark_circle().encode(
        opacity=alt.value(0)
    ).properties(
        selection=pub_spend_subtitle_highlight,
    ),
    data=ps_ministry_chapter_subtitle_dict['url']
).properties(
    title='Gasto Anual por Subtítulo',
    width=600, 
    height=400 
).encode(tooltip='subtitulo:N')



In [36]:

    
pub_spend_subtitle_layer.save(os.path.join('charts', 'gastoAnualSubtitulo.json'))
pub_spend_subtitle_layer









    Out[36]:

Predicción



In [37]:

    
import itertools
import warnings
import statsmodels.api as sm
import matplotlib.pyplot as plt



In [38]:

    
ts = pub_spend.groupby(pd.Grouper(key='date', freq='1M'))['gasto'].sum().reset_index().set_index('date')



In [39]:

    
ts.head()









    Out[39]:







  
    
      
      gasto
    
    
      date
      
    
  
  
    
      2009-01-31
      1983148.607
    
    
      2009-02-28
      1884621.232
    
    
      2009-03-31
      2494693.436
    
    
      2009-04-30
      2182175.027
    
    
      2009-05-31
      2232954.923



In [40]:

    
np.random.seed(42)



In [41]:

    
ts = ts['gasto'].resample('MS').mean()
ts = ts.fillna(ts.bfill())



In [42]:

    
ts.plot(figsize=(15, 6))
plt.show()



In [43]:

    
# Define the p, d and q parameters to take any value between 0 and 2
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))



In [44]:

    
# Generate all different combinations of seasonal p, q and q triplets
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))









    



Examples of parameter combinations for Seasonal ARIMA...
SARIMAX: (0, 0, 1) x (0, 0, 1, 12)
SARIMAX: (0, 0, 1) x (0, 1, 0, 12)
SARIMAX: (0, 1, 0) x (0, 1, 1, 12)
SARIMAX: (0, 1, 0) x (1, 0, 0, 12)



In [45]:

    
warnings.filterwarnings("ignore") # specify to ignore warning messages

info = []
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(ts,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)

            results = mod.fit()
            

            print('ARIMA{}x{}12 - AIC:{} - BIC:{}'.format(param, param_seasonal, results.aic, results.bic))
            info.append((param, param_seasonal,results.aic, results.bic)) 
        
        except:
            continue









    



ARIMA(0, 0, 0)x(0, 0, 0, 12)12 - AIC:3495.34343728014 - BIC:3498.016266114602
ARIMA(0, 0, 0)x(0, 0, 1, 12)12 - AIC:3052.1684000990253 - BIC:3057.2761538822265
ARIMA(0, 0, 0)x(0, 1, 0, 12)12 - AIC:2619.863554464298 - BIC:2622.4174313558983
ARIMA(0, 0, 0)x(0, 1, 1, 12)12 - AIC:2272.5670504849236 - BIC:2277.404731700517
ARIMA(0, 0, 0)x(1, 0, 0, 12)12 - AIC:2597.208459227472 - BIC:2602.3371556104075
ARIMA(0, 0, 0)x(1, 0, 1, 12)12 - AIC:2579.827979603313 - BIC:2587.4896102781145
ARIMA(0, 0, 0)x(1, 1, 0, 12)12 - AIC:2291.6207559038116 - BIC:2296.4823895014983
ARIMA(0, 0, 0)x(1, 1, 1, 12)12 - AIC:2243.360194057181 - BIC:2250.616715880571
ARIMA(0, 0, 1)x(0, 0, 0, 12)12 - AIC:3401.0623493478543 - BIC:3406.3892275360786
ARIMA(0, 0, 1)x(0, 0, 1, 12)12 - AIC:2995.2162538938783 - BIC:3002.846138240688
ARIMA(0, 0, 1)x(0, 1, 0, 12)12 - AIC:2583.7364991919235 - BIC:2588.8230887564637
ARIMA(0, 0, 1)x(0, 1, 1, 12)12 - AIC:2238.3758076167555 - BIC:2245.5959653585483
ARIMA(0, 0, 1)x(1, 0, 0, 12)12 - AIC:3045.6509546324583 - BIC:3053.3439992068616
ARIMA(0, 0, 1)x(1, 0, 1, 12)12 - AIC:2984.3272771801517 - BIC:2994.5004563092316
ARIMA(0, 0, 1)x(1, 1, 0, 12)12 - AIC:2292.089884218097 - BIC:2299.382334614627
ARIMA(0, 0, 1)x(1, 1, 1, 12)12 - AIC:2227.4335677667414 - BIC:2237.0604447557985
ARIMA(0, 1, 0)x(0, 0, 0, 12)12 - AIC:3208.138807932766 - BIC:3210.8022470268784
ARIMA(0, 1, 0)x(0, 0, 1, 12)12 - AIC:2788.8697041380415 - BIC:2793.9562937025817
ARIMA(0, 1, 0)x(0, 1, 0, 12)12 - AIC:2606.806568402998 - BIC:2609.349863185268
ARIMA(0, 1, 0)x(0, 1, 1, 12)12 - AIC:2237.9613332109134 - BIC:2242.7747717054417
ARIMA(0, 1, 0)x(1, 0, 0, 12)12 - AIC:2633.989419139482 - BIC:2639.097172922683
ARIMA(0, 1, 0)x(1, 0, 1, 12)12 - AIC:2596.503980176457 - BIC:2604.1338645232668
ARIMA(0, 1, 0)x(1, 1, 0, 12)12 - AIC:2264.3838360506493 - BIC:2269.2215172662427
ARIMA(0, 1, 0)x(1, 1, 1, 12)12 - AIC:2239.7332033416533 - BIC:2246.953361083446
ARIMA(0, 1, 1)x(0, 0, 0, 12)12 - AIC:3113.703496515069 - BIC:3119.0114172153844
ARIMA(0, 1, 1)x(0, 0, 1, 12)12 - AIC:2716.4888009279525 - BIC:2724.0865994074125
ARIMA(0, 1, 1)x(0, 1, 0, 12)12 - AIC:2544.19247559856 - BIC:2549.2576745848664
ARIMA(0, 1, 1)x(0, 1, 1, 12)12 - AIC:2182.7358794660777 - BIC:2189.919226930095
ARIMA(0, 1, 1)x(1, 0, 0, 12)12 - AIC:2754.6248643320664 - BIC:2762.286495006868
ARIMA(0, 1, 1)x(1, 0, 1, 12)12 - AIC:2695.7538441888164 - BIC:2705.8842421614295
ARIMA(0, 1, 1)x(1, 1, 0, 12)12 - AIC:2238.093098004001 - BIC:2245.349619827391
ARIMA(0, 1, 1)x(1, 1, 1, 12)12 - AIC:2184.7310919395786 - BIC:2194.3088885582683
ARIMA(1, 0, 0)x(0, 0, 0, 12)12 - AIC:3238.2054868661658 - BIC:3243.5511445350894
ARIMA(1, 0, 0)x(0, 0, 1, 12)12 - AIC:2817.2554984681733 - BIC:2824.9171291429748
ARIMA(1, 0, 0)x(0, 1, 0, 12)12 - AIC:2602.9923067464597 - BIC:2608.100060529661
ARIMA(1, 0, 0)x(0, 1, 1, 12)12 - AIC:2252.1234887241462 - BIC:2259.380010547536
ARIMA(1, 0, 0)x(1, 0, 0, 12)12 - AIC:2786.349369333366 - BIC:2794.0110000081677
ARIMA(1, 0, 0)x(1, 0, 1, 12)12 - AIC:2788.3029520612286 - BIC:2798.518459627631
ARIMA(1, 0, 0)x(1, 1, 0, 12)12 - AIC:2252.1055884760463 - BIC:2259.362110299436
ARIMA(1, 0, 0)x(1, 1, 1, 12)12 - AIC:2246.584418938277 - BIC:2256.2597813694633
ARIMA(1, 0, 1)x(0, 0, 0, 12)12 - AIC:3134.9620129403124 - BIC:3142.9523302226485
ARIMA(1, 0, 1)x(0, 0, 1, 12)12 - AIC:2740.703696124097 - BIC:2750.876875253177
ARIMA(1, 0, 1)x(0, 1, 0, 12)12 - AIC:2570.770399262279 - BIC:2578.400283609089
ARIMA(1, 0, 1)x(0, 1, 1, 12)12 - AIC:2209.518715904097 - BIC:2219.145592893154
ARIMA(1, 0, 1)x(1, 0, 0, 12)12 - AIC:2748.5409559048726 - BIC:2758.756463471275
ARIMA(1, 0, 1)x(1, 0, 1, 12)12 - AIC:2721.4964859507663 - BIC:2734.2129598621163
ARIMA(1, 0, 1)x(1, 1, 0, 12)12 - AIC:2237.805049199972 - BIC:2247.4804116311584
ARIMA(1, 0, 1)x(1, 1, 1, 12)12 - AIC:2211.462677144406 - BIC:2223.496273380727
ARIMA(1, 1, 0)x(0, 0, 0, 12)12 - AIC:3187.2056378396946 - BIC:3192.532516027919
ARIMA(1, 1, 0)x(0, 0, 1, 12)12 - AIC:2770.431142318353 - BIC:2778.061026665163
ARIMA(1, 1, 0)x(0, 1, 0, 12)12 - AIC:2582.5513705170415 - BIC:2587.6379600815817
ARIMA(1, 1, 0)x(0, 1, 1, 12)12 - AIC:2225.2412984022653 - BIC:2232.461456144058
ARIMA(1, 1, 0)x(1, 0, 0, 12)12 - AIC:2741.7794082681835 - BIC:2749.4092926149933
ARIMA(1, 1, 0)x(1, 0, 1, 12)12 - AIC:2742.431406700555 - BIC:2752.6045858296347
ARIMA(1, 1, 0)x(1, 1, 0, 12)12 - AIC:2225.2824992000637 - BIC:2232.5026569418565
ARIMA(1, 1, 0)x(1, 1, 1, 12)12 - AIC:2226.814346984326 - BIC:2236.4412239733833
ARIMA(1, 1, 1)x(0, 0, 0, 12)12 - AIC:3114.889998824019 - BIC:3122.8518798744917
ARIMA(1, 1, 1)x(0, 0, 1, 12)12 - AIC:2718.4835293601923 - BIC:2728.6139273328054
ARIMA(1, 1, 1)x(0, 1, 0, 12)12 - AIC:2544.5700909637067 - BIC:2552.1678894431666
ARIMA(1, 1, 1)x(0, 1, 1, 12)12 - AIC:2184.7272679396497 - BIC:2194.3050645583394
ARIMA(1, 1, 1)x(1, 0, 0, 12)12 - AIC:2726.4004640134144 - BIC:2736.5736431424943
ARIMA(1, 1, 1)x(1, 0, 1, 12)12 - AIC:2697.6489530805857 - BIC:2710.311950546352
ARIMA(1, 1, 1)x(1, 1, 0, 12)12 - AIC:2211.542441576318 - BIC:2221.1693185653753
ARIMA(1, 1, 1)x(1, 1, 1, 12)12 - AIC:2186.723503819883 - BIC:2198.695749593245



In [46]:

    
df_info = pd.DataFrame(info,columns = ["order","seasonal_order","AIC","BIC"])
df_info.head()









    Out[46]:







  
    
      
      order
      seasonal_order
      AIC
      BIC
    
  
  
    
      0
      (0, 0, 0)
      (0, 0, 0, 12)
      3495.343437
      3498.016266
    
    
      1
      (0, 0, 0)
      (0, 0, 1, 12)
      3052.168400
      3057.276154
    
    
      2
      (0, 0, 0)
      (0, 1, 0, 12)
      2619.863554
      2622.417431
    
    
      3
      (0, 0, 0)
      (0, 1, 1, 12)
      2272.567050
      2277.404732
    
    
      4
      (0, 0, 0)
      (1, 0, 0, 12)
      2597.208459
      2602.337156



In [47]:

    
mask = df_info.AIC == df_info.AIC.min()
df_info[mask]









    Out[47]:







  
    
      
      order
      seasonal_order
      AIC
      BIC
    
  
  
    
      27
      (0, 1, 1)
      (0, 1, 1, 12)
      2182.735879
      2189.919227



In [48]:

    
mask = df_info.BIC == df_info.BIC.min()
df_info[mask]









    Out[48]:







  
    
      
      order
      seasonal_order
      AIC
      BIC
    
  
  
    
      27
      (0, 1, 1)
      (0, 1, 1, 12)
      2182.735879
      2189.919227



In [49]:

    
mod = sm.tsa.statespace.SARIMAX(ts,
                                order=(0, 1, 1),
                                seasonal_order=(0, 1, 1, 12),
                                enforce_stationarity=False,
                                enforce_invertibility=False)

results = mod.fit()

print(results.summary().tables[1])









    



==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ma.L1         -0.8115      0.078    -10.408      0.000      -0.964      -0.659
ma.S.L12      -0.0070      0.078     -0.090      0.928      -0.160       0.146
sigma2      4.205e+10   1.71e-13   2.47e+23      0.000     4.2e+10     4.2e+10
==============================================================================



In [50]:

    
results.plot_diagnostics(figsize=(15, 12))
plt.show()



In [51]:

    
pred = results.get_prediction(start=ts.index[70], dynamic=False)
pred_ci = pred.conf_int()



In [52]:

    
ax = ts['2009':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7)

ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)

ax.set_xlabel('Date')
ax.set_ylabel('Gastos')
plt.legend()

plt.show()



In [53]:

    
ts_forecasted = pred.predicted_mean
ts_truth = ts['1998-01-01':]

# Compute the mean square error
mse = ((ts_forecasted - ts_truth) ** 2).mean()
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))









    



The Mean Squared Error of our forecasts is 29316257114.87



In [54]:

    
pred_dynamic = results.get_prediction(start=ts.index[70], dynamic=True, full_results=True)
pred_dynamic_ci = pred_dynamic.conf_int()



In [55]:

    
ax = ts['2009':].plot(label='observed', figsize=(20, 15))
pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax)

ax.fill_between(pred_dynamic_ci.index,
                pred_dynamic_ci.iloc[:, 0],
                pred_dynamic_ci.iloc[:, 1], color='k', alpha=.25)

ax.fill_betweenx(ax.get_ylim(),ts.index[70], ts.index[-1],
                 alpha=.1, zorder=-1)

ax.set_xlabel('Date')
ax.set_ylabel('Gastos')

plt.legend()
plt.show()



In [56]:

    
# Get forecast 500 steps ahead in future
pred_uc = results.get_forecast(steps=48)

# Get confidence intervals of forecasts
pred_ci = pred_uc.conf_int()



In [57]:

    
ax = ts['2009':].plot(label='observed', figsize=(20, 15))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.25)
ax.set_xlabel('Date')
ax.set_ylabel('Gastos')

plt.legend()
plt.show()



In [58]:

    
df_temporal = ts.reset_index()



In [59]:

    
df_predict = pred_ci.assign(predict=pred_uc.predicted_mean)
df_predict = df_predict.reset_index()
df_predict.loc[48] = [df_temporal.loc[107,'date'],
                      df_temporal.loc[107,'gasto'],
                      df_temporal.loc[107,'gasto'],
                      df_temporal.loc[107,'gasto']]



In [60]:

    
df_predict.rename(columns={'index':'date'},inplace=True)
df_predict['date'] = df_predict['date'].astype('datetime64[ns]')

Graficar



In [61]:

    
df_predict_dict = alt.to_json(df_predict, prefix='forecastSpend.json')



In [62]:

    
line_predict = alt.Chart(df_predict_dict).mark_line(strokeDash = [3, 3]).encode(
    x=alt.X('date:T', 
            timeUnit='yearmonth',
            axis=alt.Axis(title='Mes-Año')
            ),
    y='predict:Q'
)

confidence_predict = alt.Chart(df_predict_dict).mark_area(opacity = 0.3).encode(
 x=alt.X('date:T', 
            timeUnit='yearmonth',
            axis=alt.Axis(title='Mes-Año')
            ),
 y  = 'lower gasto:Q',
 y2 = 'upper gasto:Q' ) 

spend_line = alt.Chart(ps_ministry_dict['url']).mark_line().encode(
    x=alt.X('date:T', 
        timeUnit='yearmonth',
        axis=alt.Axis(title='Mes-Año'),
           ),
    y=alt.Y('sum(gasto):Q', axis=alt.Axis(format="($,.2r", title='Millones de Pesos Chilenos'))
)



In [63]:

    
forecast_ps_monthly = alt.layer(
    spend_line,
    confidence_predict,
    line_predict
).properties(
    title='Predicción de Gasto Mensual',
    width=600, 
    height=400 
)



In [64]:

    
forecast_ps_monthly.save(os.path.join('charts', 'gastoMensualPred.json'))
forecast_ps_monthly









    Out[64]:

Gastos Diciembre



In [65]:

    
gasto_promedio_anual_partida = pub_spend.groupby(['partida', pd.Grouper(key='date', freq='1m')])['gasto'].sum()
df_partida = pd.DataFrame({'gasto' : gasto_promedio_anual_partida}).reset_index()
df_partida.head()









    Out[65]:







  
    
      
      partida
      date
      gasto
    
  
  
    
      0
      Congreso Nacional
      2009-01-31
      11967.407
    
    
      1
      Congreso Nacional
      2009-02-28
      4863.919
    
    
      2
      Congreso Nacional
      2009-03-31
      9854.629
    
    
      3
      Congreso Nacional
      2009-04-30
      7156.741
    
    
      4
      Congreso Nacional
      2009-05-31
      7937.524



In [ ]:



In [66]:

    
df_partida['date'] = df_partida['date'].astype(str)
df_partida["year"]  = df_partida["date"].apply(lambda x:x.split('-')[0] )
df_partida["month"] = df_partida["date"].apply(lambda x:x.split('-')[1] )
df_partida["day"]   = df_partida["date"].apply(lambda x:x.split('-')[2] )



In [67]:

    
partida_list = np.unique(df_partida['partida']).tolist()
year_list = np.unique(df_partida['year']).tolist()
aux_list = []
for partida in partida_list:
    df_p = df_partida[df_partida['partida'] == partida]
    for year in year_list:
        aux_df = df_p[df_p['year']==year]
        aux_df['percentage'] = aux_df['gasto']/aux_df['gasto'].sum()
        aux_list = aux_list + aux_df['percentage'].tolist()



In [68]:

    
# solo mes 12
df_partida['percentage_respect_year'] = aux_list
df_partida_12 = df_partida[df_partida['month'] == '12']



In [69]:

    
df_partida_12.head()









    Out[69]:







  
    
      
      partida
      date
      gasto
      year
      month
      day
      percentage_respect_year
    
  
  
    
      11
      Congreso Nacional
      2009-12-31
      13279.862
      2009
      12
      31
      0.125660
    
    
      23
      Congreso Nacional
      2010-12-31
      12769.237
      2010
      12
      31
      0.117450
    
    
      35
      Congreso Nacional
      2011-12-31
      11524.817
      2011
      12
      31
      0.108105
    
    
      47
      Congreso Nacional
      2012-12-31
      14729.433
      2012
      12
      31
      0.125755
    
    
      59
      Congreso Nacional
      2013-12-31
      11763.431
      2013
      12
      31
      0.098658



In [70]:

    
df_partida_12.groupby('partida')['percentage_respect_year'].mean()









    Out[70]:





partida
Congreso Nacional                                                  0.111395
Contraloría General De La República                                0.167376
Ministerio De Agricultura                                          0.176739
Ministerio De Bienes Nacionales                                    0.196740
Ministerio De Defensa Nacional                                     0.126188
Ministerio De Desarrollo Social                                    0.193309
Ministerio De Economía, Fomento Y Turismo                          0.246730
Ministerio De Educación                                            0.133075
Ministerio De Energía                                              0.227916
Ministerio De Hacienda                                             0.164237
Ministerio De Justicia Y Derechos Humanos                          0.132851
Ministerio De La Mujer Y La Equidad De Género                      0.144067
Ministerio De Minería                                              0.200201
Ministerio De Obras Públicas                                       0.195053
Ministerio De Relaciones Exteriores                                0.246546
Ministerio De Salud                                                0.130303
Ministerio De Transportes Y Telecomunicaciones                     0.247827
Ministerio De Vivienda Y Urbanismo                                 0.134981
Ministerio Del Deporte                                             0.223436
Ministerio Del Interior Y Seguridad Pública                        0.163366
Ministerio Del Medio Ambiente                                      0.268032
Ministerio Del Trabajo Y Previsión Social                          0.100445
Ministerio Público                                                 0.119967
Ministerio Secretaría General De Gobierno                          0.218487
Ministerio Secretaría General De La Presidencia De La República    0.149298
Poder Judicial                                                     0.153120
Presidencia De La República                                        0.166237
Servicio Electoral                                                 0.444782
Tesoro Público                                                     0.109093
Name: percentage_respect_year, dtype: float64



In [71]:

    
partida_12 = alt.Chart(df_partida_12).mark_bar().encode(
    x=alt.X('mean(percentage_respect_year):Q', axis=alt.Axis(format=".0%", title='porcentaje')),
    y=alt.Y('partida:N', axis=alt.Axis(title='Partida')),
    tooltip=alt.Tooltip('mean(percentage_respect_year):Q', format=".2%")
).properties(
    title='Gasto promedio del mes de diciembre (porcentual respecto a cada año)',
    width=700, 
    height=700 
)



In [72]:

    
partida_12 = alt.Chart(df_partida_12).mark_bar().encode(
    x=alt.X('mean(percentage_respect_year):Q', axis=alt.Axis(format=".0%", title='porcentaje')),
    y=alt.Y('partida:N', axis=alt.Axis(title='Partida')),
    tooltip=alt.Tooltip('mean(percentage_respect_year):Q', format=".2%")
).properties(
    title='Gasto promedio del mes de diciembre (porcentual respecto a cada año)',
    width=700, 
    height=700 
)



In [73]:

    
df_partida_12_dict = alt.to_json(df_partida_12, prefix='procentaje_dec.json')



In [74]:

    
data2 = pd.DataFrame([{"ThresholdValue": 1.0/12.0, "Threshold": "porcentaje esperado"}])



In [75]:

    
bar1 = alt.Chart(df_partida_12_dict['url']).mark_bar().encode(
    x=alt.X('mean(percentage_respect_year):Q', axis=alt.Axis(format=".0%", title='Porcentaje Gasto Anual Promedio')),
    y=alt.Y('partida:N', axis=alt.Axis(title='Partida')),
    tooltip=alt.Tooltip('mean(percentage_respect_year):Q', format=".2%")
).properties(
    title='Gasto Porcentual Promedio de Diciembre por Partida',
    width=700, 
    height=700 
)

bar2 = alt.Chart(df_partida_12_dict['url']).mark_bar(color="#e45755").encode(
    x='mean(percentage_respect_year):Q',
    x2='baseline:Q',
    y='partida:N',
    tooltip=alt.Tooltip('mean(percentage_respect_year):Q', format=".2%")
).transform_calculate(
    "baseline", "0.0833"
)

rule = alt.Chart(data2).mark_rule().encode(
    x='ThresholdValue:Q'
)

text = alt.Chart(data2).mark_text(
    align='center', dx=215, dy=-5
).encode(
    alt.X('ThresholdValue:Q', axis=alt.Axis(title='porcentaje')),
    text=alt.value('porcentaje esperado')
)

december_perc_chart = alt.layer(
    bar1,
    bar2,
    rule
)



In [76]:

    
december_perc_chart.save(os.path.join('charts', 'gastoPromedioDec.json'))
december_perc_chart









    Out[76]:

	Field name	Field type	Field size	Field format	Range	Description	Example
0	Periodo	Integer	15	#	2009 to 2017	Year of the spend	2009
1	Nombre Partida	String	63	NaN	28 categories	Ministeries	Ministerio de Educación
2	Nombre Capitulo	String	78	NaN	219 categories	Public Services	Fondo Nacional de Salud
3	Nombre Programa	String	78	NaN	363 categories	Programs	Fondo Nacional de Salud
4	Nombre Subtitulo	String	37	NaN	10 categories	First level of the budget classification	TRANSFERENCIAS CORRIENTES
5	Nombre Item	String	54	NaN	43 categories	Second level of the budget classification	Al Sector Privado
6	Nombre Asignación	String	250	NaN	4372 categories	Third level of the budget classification	Subsidios Fondo Solidario de Vivienda
7	Nombre SubAsignación	String	80	NaN	1355 categories	Last level of the budget classification	Asistencia Técnica
8	Mes	String	10	NaN	12 categories	Month	enero
9	Monto	Integer	15	#	(-49702217) to 357607483	Amount, this data was update by the inflation ...	357607483

	Periodo	Nombre Partida	Nombre Capitulo	Nombre Programa	Nombre Subtitulo	Nombre Item	Nombre Asignación	Nombre SubAsignación	Mes
0	2009	Congreso Nacional	Biblioteca Del Congreso	Biblioteca Del Congreso	INICIATIVAS DE INVERSIÓN	Proyectos	Gastos Administrativos	NaN	Abril
1	2009	Congreso Nacional	Biblioteca Del Congreso	Biblioteca Del Congreso	INICIATIVAS DE INVERSIÓN	Proyectos	Gastos Administrativos	NaN	Agosto
2	2009	Congreso Nacional	Biblioteca Del Congreso	Biblioteca Del Congreso	INICIATIVAS DE INVERSIÓN	Proyectos	Gastos Administrativos	NaN	Diciembre
3	2009	Congreso Nacional	Biblioteca Del Congreso	Biblioteca Del Congreso	INICIATIVAS DE INVERSIÓN	Proyectos	Gastos Administrativos	NaN	Enero
4	2009	Congreso Nacional	Biblioteca Del Congreso	Biblioteca Del Congreso	INICIATIVAS DE INVERSIÓN	Proyectos	Gastos Administrativos	NaN	Febrero

	gasto
date
2009-01-31	1983148.607
2009-02-28	1884621.232
2009-03-31	2494693.436
2009-04-30	2182175.027
2009-05-31	2232954.923

	order	seasonal_order	AIC	BIC
0	(0, 0, 0)	(0, 0, 0, 12)	3495.343437	3498.016266
1	(0, 0, 0)	(0, 0, 1, 12)	3052.168400	3057.276154
2	(0, 0, 0)	(0, 1, 0, 12)	2619.863554	2622.417431
3	(0, 0, 0)	(0, 1, 1, 12)	2272.567050	2277.404732
4	(0, 0, 0)	(1, 0, 0, 12)	2597.208459	2602.337156

	partida	date	gasto
0	Congreso Nacional	2009-01-31	11967.407
1	Congreso Nacional	2009-02-28	4863.919
2	Congreso Nacional	2009-03-31	9854.629
3	Congreso Nacional	2009-04-30	7156.741
4	Congreso Nacional	2009-05-31	7937.524

	partida	date	gasto	year	month	day	percentage_respect_year
11	Congreso Nacional	2009-12-31	13279.862	2009	12	31	0.125660
23	Congreso Nacional	2010-12-31	12769.237	2010	12	31	0.117450
35	Congreso Nacional	2011-12-31	11524.817	2011	12	31	0.108105
47	Congreso Nacional	2012-12-31	14729.433	2012	12	31	0.125755
59	Congreso Nacional	2013-12-31	11763.431	2013	12	31	0.098658