In [188]:
import os
import pandas as pd
import scipy.stats as stats
import scipy.optimize as opt
import numpy as np

In [246]:
raw_data_path = 'C:\\Users\\EEU227\\Documents\\Projects\\Inflation\\SPF\\data'
out_data = 'C:\\Users\\EEU227\\Documents\\Projects\\Inflation\\SPF\\clean_data\\'

In [228]:
#writer = pd.ExcelWriter(out_data + 'SPF-aggregate-moments.xlsx')
#years = [2015]
years = [year for year in range(2001,2017)]
quarters = [1,2,3,4]
months = ['Dec','Mar','Jun','Sep']
dfList = []

In [229]:
def Hist_std(h):
    if not all(h==0):
        if len(h) == 9:
            bins =np.array([-1.0,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.5])
#                df = df[['source','target','id','point','[-1.0,-0.1]',
#                         '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
#                         '[3.0,3.4]','[3.5,4.5]']]                
        elif len(h) == 10:
#               df = df[['source','target','id','point','[-1.0,-0.1]',
#                         '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
#                         '[3.0,3.4]','[3.5,3.9]','[4.0,5.0]']]             
            bins =np.array([-1.0,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0]) 
        elif len(h) == 11:
#               df = df[['source','target','id','point','[-1.0,-0.1]',
#                         '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
#                         '[3.0,3.4]','[3.5,3.9]','[4.0,5.0]']]                  
#           
            bins =np.array([-1.0,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.5])             
        elif len(h)== 14:
#                df = df[['source','target','id','point','[-3.0,-2.1]','[-2.0,-1.6]','[-1.5,-1.1]',
#                         '[-1.0,-0.6]','[-0.5,-0.1]','[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]',
#                         '[2.0,2.4]','[2.5,2.9]','[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]   
            bins =np.array([-3.0,-2.0,-1.5,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])                   
        elif len(h)== 13:
            bins =np.array([-3-0,-2.0,-1.5,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])              
        elif len(h)== 12:
#                df = df[['source','target','id','point','[-2.0,-1.1]','[-1.0,-0.6]','[-0.5,-0.1]',
#                         '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
#                         '[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]            
            bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])  
            #
        orig_hist = np.array(h).astype(float)
        norm_hist = orig_hist/float(np.sum(orig_hist))
        mid_points = (bins[1:] + bins[:-1])/2
        MeanCrude = np.dot(norm_hist,mid_points)
        VarCrude  = np.dot(norm_hist,(mid_points-MeanCrude)**2)
        bin_widths = np.diff(bins)
        BinWidth = bin_widths.mean()
        VarSheppard = VarCrude - (BinWidth**2)/12       #variance, Sheppard's correction
        if VarSheppard>0:
            hist_std = np.sqrt(VarSheppard)
        else:
            hist_std = np.sqrt(VarCrude)
    else:
        hist_std = float('nan')
    return hist_std

In [230]:
for year in years:
    for q in quarters:
        f = str(year) + 'Q' + str(q)
        fname = f + '.csv'
        if os.path.isfile(raw_data_path + '\\' + fname):
            raw_df = pd.read_csv(raw_data_path + '\\' + fname,header = 1)
            # find the row where the growth expectations start
            dum = raw_df[raw_df['TARGET_PERIOD'] == 'GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP'].index[0]
            mask_columns = ~raw_df.columns.str.contains('Unnamed')
            df = raw_df.iloc[0:dum-1,mask_columns]    
            df['source'] = str(year) + '-Q' + str(q)
            #print(fname,len(df.columns))

            if len(df.columns) == 13:
                df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                       'T0_0':'[-1.0,-0.1]','F0_0T0_4':'[0.0,0.4]',
                                       'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
                                       'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
                                       'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
                                       'F3_5':'[3.5,4.5]'})
                
                df = df[['source','target','id','point','[-1.0,-0.1]',
                         '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
                         '[3.0,3.4]','[3.5,4.5]']]    
            elif len(df.columns) == 14:
                df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                       'T0_0':'[-1.0,-0.1]','F0_0T0_4':'[0.0,0.4]',
                                       'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
                                       'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
                                       'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
                                       'F3_5T3_9':'[3.5,3.9]','F4_0':'[4.0,5.0]'})
                
                df = df[['source','target','id','point','[-1.0,-0.1]',
                         '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
                         '[3.0,3.4]','[3.5,3.9]','[4.0,5.0]']]      
                
            elif len(df.columns) == 18:
                df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                       'TN2_0':'[-3.0,-2.1]','FN2_0TN1_6':'[-2.0,-1.6]',
                                       'FN1_5TN1_1':'[-1.5,-1.1]','FN1_0TN0_6':'[-1.0,-0.6]',
                                       'FN0_5TN0_1':'[-0.5,-0.1]','F0_0T0_4':'[0.0,0.4]',
                                       'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
                                       'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
                                       'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
                                       'F3_5T3_9':'[3.5,3.9]','F4_0':'[4.0,5.0]'})
                
                df = df[['source','target','id','point','[-3.0,-2.1]','[-2.0,-1.6]','[-1.5,-1.1]',
                         '[-1.0,-0.6]','[-0.5,-0.1]','[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]',
                         '[2.0,2.4]','[2.5,2.9]','[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]                 
                 
            else: # len(df.columns) == 16:
                df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                       'TN1_0':'[-2.0,-1.1]','FN1_0TN0_6':'[-1.0,-0.6]',
                                       'FN0_5TN0_1':'[-0.5,-0.1]','F0_0T0_4':'[0.0,0.4]',
                                       'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
                                       'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
                                       'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
                                       'F3_5T3_9':'[3.5,3.9]','F4_0':'[4.0,5.0]'})
                
                df = df[['source','target','id','point','[-2.0,-1.1]','[-1.0,-0.6]','[-0.5,-0.1]',
                         '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
                         '[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]
              
            # remove rows where point is missing
            maskNaN = df.point.isnull()
            df = df[~maskNaN]                    
            df.fillna(0,inplace = True)
            for colname in df.columns[3:]:
                df[colname] = df[colname].astype('float')
            mask = df.columns.str.contains(',')
            #print(fname,len(mask[mask==True]))
#            df['GA_std'] = df.iloc[:,mask].apply(fit_normal_to_hist,axis=1)
#            df['ZL_std'] = df.iloc[:,mask].apply(ZL_std,axis=1)
            df['Hist_std'] = df.iloc[:,mask].apply(Hist_std,axis=1)
            df = df[['source','target','id','point','Hist_std']]
            df['id'] = df['id'].astype('int')
            df['point'] = df['point'].astype('float32')
            #df.to_excel(writer,f,index=False)             
            # create time series representation of the targets
            mask_t0 = str(year)
            mask_t1 = str(year+1)
            mask_t2 = str(year+2)
            if q<3:
                mask_t4or5 = str(year+4)
            else:
                mask_t4or5 = str(year+5)
            if q==1:
                mask_Rt1 = str(year) + months[q-1]
                mask_Rt2 = str(year+1) + months[q-1]
            else:
                mask_Rt1 = str(year+1) + months[q-1]
                mask_Rt2 = str(year+2) + months[q-1]
                #
            df.loc[df.loc[:,'target'] == mask_t0,'targetNew'] = 't'
            df.loc[df.loc[:,'target'] == mask_t1,'targetNew'] = 't+1'
            df.loc[df.loc[:,'target'] == mask_t2,'targetNew'] = 't+2'
            df.loc[df.loc[:,'target'] == mask_t4or5,'targetNew'] = 't+4'
            df.loc[df.loc[:,'target'] == mask_Rt1,'targetNew'] = 'roll 1'
            df.loc[df.loc[:,'target'] == mask_Rt2,'targetNew'] = 'roll 2'
            
            dfList.append(df) 
#writer.save()

In [231]:
dfAll = pd.concat(dfList,axis=0)

In [232]:
# Define the aggregation calculations
aggregations = {
    'point': { # 
        'std': 'std',  # 
    },
    'Hist_std': {
        'mean': 'mean',
    }
}

In [248]:
grouped = dfAll.groupby(['source','targetNew']).agg(aggregations)

In [249]:
grouped.head()


Out[249]:
Hist_std point
mean std
source targetNew
2001-Q1 roll 1 0.318479 0.308394
roll 2 0.358096 0.250939
t 0.296765 0.224281
t+1 0.334156 0.227422
t+4 0.399493 0.262188

In [250]:
grouped['AggStd']= grouped['point']['std'] + grouped['Hist_std']['mean']

In [251]:
aggStdLR = grouped.xs('t+4',level=1).sortlevel(0,axis=1)

In [241]:
aggStdLR.to_csv('aggStdLR.csv')

In [252]:
writer = pd.ExcelWriter(out_data + 'SPF_aggregate_uncertainty.xlsx')

In [253]:
aggStdLR.to_excel(writer)

In [254]:
writer.save()

In [ ]: