In [136]:

    
import xlwings



In [137]:

    
xlwings.__version__









    Out[137]:





'0.7.0'



In [116]:

    
import os
import pandas as pd
import scipy.stats as stats
import scipy.optimize as opt
import numpy as np



In [117]:

    
raw_data_path = 'C:\\Users\\EEU227\\Documents\\Projects\\Inflation\\SPF\\data'
out_data = 'C:\\Users\\EEU227\\Documents\\Projects\\Inflation\\SPF\\CleanData\\'

def fit_normal_to_hist(h): if not all(h==0): bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])
orig_hist = np.array(h).astype(float) norm_hist = orig_hist/float(np.sum(orig_hist)) mid_points = (bins[1:] + bins[:-1])/2 popt,pcov = opt.curve_fit(lambda x,mu,sig: stats.norm.pdf(x,mu,sig), mid_points,norm_hist) else: popt = [float('nan'),float('nan')] return popt[1]

def ZL_std(h): intervals =[[-2.0,-1.1],[-1.0,-0.6],[-0.5,-0.1],[0.0,0.4],[0.5,0.9],[1.0,1.4], [1.5,1.9],[2.0,2.4],[2.5,2.9],[3.0,3.4],[3.5,3.9],[4.0,5.0]] if not all(h==0): sum_i1 = 0 sum_i2 = 0 for i in range(1,len(h)): p = h[i]/100 v1,v2 = intervals[i] sum_i1 += p(v23 - v13)/(3(v2-v1)) sum_i2 += p(v22 - v12)/(2(v2-v1)) zl_std = np.sqrt(sum_i1 - sum_i2**2) else: zl_std = float('nan')
return zl_std



In [118]:

    
def Hist_std(h):
    if not all(h==0):
        bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])  
        orig_hist = np.array(h).astype(float)
        norm_hist = orig_hist/float(np.sum(orig_hist))
        mid_points = (bins[1:] + bins[:-1])/2
        MeanCrude = np.dot(norm_hist,mid_points)
        VarCrude  = np.dot(norm_hist,(mid_points-MeanCrude)**2)
        bin_widths = np.diff(bins)
        BinWidth = bin_widths.mean()
        VarSheppard = VarCrude - (BinWidth**2)/12       #variance, Sheppard's correction
        if VarSheppard>0:
            hist_std = np.sqrt(VarSheppard)
        else:
            hist_std = np.sqrt(VarCrude)
    else:
        hist_std = float('nan')
    return hist_std



In [119]:

    
years = [year for year in range(2010,2017)]



In [120]:

    
years









    Out[120]:





[2010, 2011, 2012, 2013, 2014, 2015, 2016]



In [124]:

    
#writer = pd.ExcelWriter(out_data + 'SPF-aggregate-moments.xlsx')
#years = [2015]
years = [year for year in range(2010,2017)]
quarters = [1,2,3,4]
months = ['Dec','Mar','Jun','Sep']
dfList = []



In [125]:

    
for year in years:
    for q in quarters:
        f = str(year) + 'Q' + str(q)
        fname = f + '.csv'
        if os.path.isfile(raw_data_path + '\\' + fname):
            raw_df = pd.read_csv(raw_data_path + '\\' + fname,header = 1)
            # find the row where the growth expectations start
            dum = raw_df[raw_df['TARGET_PERIOD'] == 'GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP'].index[0]
            mask_columns = ~raw_df.columns.str.contains('Unnamed')
            df = raw_df.iloc[0:dum-1,mask_columns]    
            df['source'] = str(year) + '-Q' + str(q)
            df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                   'TN1_0':'[-2.0,-1.1]','FN1_0TN0_6':'[-1.0,-0.6]',
                                   'FN0_5TN0_1':'[-0.5,-0.1]','F0_0T0_4':'[0.0,0.4]',
                                   'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
                                   'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
                                   'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
                                   'F3_5T3_9':'[3.5,3.9]','F4_0':'[4.0,5.0]'})
            df = df[['source','target','id','point','[-2.0,-1.1]','[-1.0,-0.6]','[-0.5,-0.1]',
                     '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
                     '[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]
            # remove rows where point is missing
            maskNaN = df.point.isnull()
            df = df[~maskNaN]                    
            df.fillna(0,inplace = True)
            for colname in df.columns[3:]:
                df[colname] = df[colname].astype('float')
            mask = df.columns.str.contains(',')
#            df['GA_std'] = df.iloc[:,mask].apply(fit_normal_to_hist,axis=1)
#            df['ZL_std'] = df.iloc[:,mask].apply(ZL_std,axis=1)
            df['Hist_std'] = df.iloc[:,mask].apply(Hist_std,axis=1)
            df = df[['source','target','id','point','Hist_std']]
            df['id'] = df['id'].astype('int')
            df['point'] = df['point'].astype('float32')
            #df.to_excel(writer,f,index=False)             
            # create time series representation of the targets
            mask_t0 = str(year)
            mask_t1 = str(year+1)
            mask_t2 = str(year+2)
            if q<3:
                mask_t4or5 = str(year+4)
            else:
                mask_t4or5 = str(year+5)
            if q==1:
                mask_Rt1 = str(year) + months[q-1]
                mask_Rt2 = str(year+1) + months[q-1]
            else:
                mask_Rt1 = str(year+1) + months[q-1]
                mask_Rt2 = str(year+2) + months[q-1]
                #
            df.loc[df.loc[:,'target'] == mask_t0,'targetNew'] = 't'
            df.loc[df.loc[:,'target'] == mask_t1,'targetNew'] = 't+1'
            df.loc[df.loc[:,'target'] == mask_t2,'targetNew'] = 't+2'
            df.loc[df.loc[:,'target'] == mask_t4or5,'targetNew'] = 't+4'
            df.loc[df.loc[:,'target'] == mask_Rt1,'targetNew'] = 'roll 1'
            df.loc[df.loc[:,'target'] == mask_Rt2,'targetNew'] = 'roll 2'
            
            dfList.append(df) 
#writer.save()



In [126]:

    
# Define the aggregation calculations
aggregations = {
    'point': { # 
        'std': 'std',  # 
    },
    'Hist_std': {
        'mean': 'mean',
    }
}



In [127]:

    
dfAll = pd.concat(dfList,axis=0)



In [128]:

    
dfAll.head()



In [129]:

    
dfAll.tail()



In [130]:

    
grouped = dfAll.groupby(['source','targetNew']).agg(aggregations)



In [131]:

    
grouped.head()



In [132]:

    
grouped['AggVar']= grouped['point']['std'] + grouped['Hist_std']['mean']



In [133]:

    
grouped.head(10)



In [134]:

    
dfAll.loc[:,'targetNew'].unique()









    Out[134]:





array(['t', 'roll 1', 't+1', 'roll 2', 't+4', 't+2'], dtype=object)



In [135]:

    
grouped.xs('t+4',level=1).sortlevel(0,axis=1)



In [ ]:

    
for h in dfAll.loc[:,'targetNew'].unique():
    df_h = grouped.xs(h,level=1).sortlevel(0,axis=1)

	source	target	id	point	Hist_std	targetNew
335	2016-Q1	2020	102	1.4	0.329421	t+4
336	2016-Q1	2020	103	1.8	0.690219	t+4
337	2016-Q1	2020	105	2.2	0.107905	t+4
339	2016-Q1	2020	112	2.0	0.627809	t+4
340	2016-Q1	2020	114	1.5	0.844241	t+4

		point	Hist_std
		std	mean
source	targetNew
2010-Q1	roll 1	0.379926	0.570146
	roll 2	0.386152	0.628710
	t	0.210075	0.478055
	t+1	0.412391	0.568893
	t+4	0.223462	0.655118

		point	Hist_std	AggVar
		std	mean
source	targetNew
2010-Q1	roll 1	0.379926	0.570146	0.950073
	roll 2	0.386152	0.628710	1.014862
	t	0.210075	0.478055	0.688130
	t+1	0.412391	0.568893	0.981285
	t+4	0.223462	0.655118	0.878579
2010-Q2	roll 1	0.321718	0.517251	0.838969
	roll 2	0.339143	0.572631	0.911774
	t	0.215482	0.422376	0.637857
	t+1	0.343113	0.515379	0.858492
	t+4	0.170838	0.626043	0.796881

	point	Hist_std	AggVar
	std	mean
source
2010-Q1	0.223462	0.655118	0.878579
2010-Q2	0.170838	0.626043	0.796881
2010-Q3	0.272371	0.631932	0.904302
2010-Q4	0.289806	0.665188	0.954994
2011-Q1	0.255767	0.596972	0.852739
2011-Q2	0.202176	0.621294	0.823470
2011-Q3	0.214857	0.633878	0.848735
2011-Q4	0.267817	0.624849	0.892666
2012-Q1	0.197640	0.648206	0.845846
2012-Q2	0.320977	0.651167	0.972144
2012-Q3	0.222922	0.648010	0.870933
2012-Q4	0.268997	0.673533	0.942530
2013-Q1	0.216292	0.642344	0.858635
2013-Q2	0.206506	0.655308	0.861814
2013-Q3	0.238225	0.634926	0.873151
2013-Q4	0.254614	0.627572	0.882185
2014-Q1	0.201399	0.642977	0.844377
2014-Q2	0.187420	0.649761	0.837181
2014-Q3	0.276752	0.619263	0.896015
2014-Q4	0.181649	0.635252	0.816901
2015-Q1	0.211796	0.626370	0.838165
2015-Q2	0.179172	0.666846	0.846018
2015-Q3	0.207662	0.641744	0.849407
2015-Q4	0.195837	0.638468	0.834305
2016-Q1	0.223398	0.627756	0.851153

	source	target	id	point	Hist_std	targetNew
0	2010-Q1	2010	1	1.0	0.623333	t
1	2010-Q1	2010	2	1.4	0.611673	t
2	2010-Q1	2010	4	1.4	0.451130	t
3	2010-Q1	2010	7	1.0	NaN	t
4	2010-Q1	2010	8	1.3	0.575429	t