In [1]:

    
import os
import pandas as pd
import scipy.stats as stats
import scipy.optimize as opt
import numpy as np



In [2]:

    
raw_data_path = 'C:\\Users\\EEU227\\Documents\\IPython Notebooks\\DEEdata\\SPF_individual_forecasts'



In [3]:

    
if not os.path.isdir(raw_data_path + '\\CleanData'):
    os.mkdir(raw_data_path + '\\CleanData')



In [4]:

    
out_data = raw_data_path + '\\CleanData\\'
out_data









    Out[4]:





'C:\\Users\\EEU227\\Documents\\IPython Notebooks\\DEEdata\\SPF_individual_forecasts\\CleanData\\'



In [264]:

    
years = [2014]
quarters = [4]
for year in years:
    for q in quarters:
        f = str(year) + 'Q' + str(q)
        fname = f + '.csv'
        if os.path.isfile(raw_data_path + '\\' + fname):
            raw_df = pd.read_csv(raw_data_path + '\\' + fname,header = True)
            # find the row where the growth expectations start
            dum = raw_df[raw_df['TARGET_PERIOD'] == 'GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP'].index[0]
            mask_columns = ~raw_df.columns.str.contains('Unnamed')
            df = raw_df.iloc[0:dum-1,mask_columns]    
            df['source'] = str(year) + '-Q' + str(q)
            df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                   'TN1_0':'[-2.0,-1.1]','FN1_0TN0_6':'[-1.0,-0.6]',
                                   'FN0_5TN0_1':'[-0.5,-0.1]','F0_0T0_4':'[0.0,0.4]',
                                   'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
                                   'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
                                   'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
                                   'F3_5T3_9':'[3.5,3.9]','F4_0':'[4.0,5.0]'})
            df = df[['source','target','id','point','[-2.0,-1.1]','[-1.0,-0.6]','[-0.5,-0.1]',
                     '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
                     '[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]
            df.fillna(0,inplace = True)
            for colname in df.columns[3:]:
                df[colname] = df[colname].astype('float')

intervals =[[-2.0,-1.1],[-1.0,-0.6],[-0.5,-0.1],[0.0,0.4],[0.5,0.9],[1.0,1.4],
       [1.5,1.9],[2.0,2.4],[2.5,2.9],[3.0,3.4],[3.5,3.9],[4.0,5.0]]



In [265]:

    
def fit_normal_to_hist(h):
    if not all(h==0):
        bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])  
        orig_hist = np.array(h).astype(float)
        norm_hist = orig_hist/float(sum(orig_hist))
        mid_points = (bins[1:] + bins[:-1])/2
        popt,pcov = opt.curve_fit(lambda x,mu,sig: stats.norm.pdf(x,mu,sig), mid_points,norm_hist)
    else:
        popt = [float('nan'),float('nan')]
    return popt[1]



In [266]:

    
def ZL_std(h):
    intervals =[[-2.0,-1.1],[-1.0,-0.6],[-0.5,-0.1],[0.0,0.4],[0.5,0.9],[1.0,1.4],
           [1.5,1.9],[2.0,2.4],[2.5,2.9],[3.0,3.4],[3.5,3.9],[4.0,5.0]]
    if not all(h==0):
        sum_i1 = 0
        sum_i2 = 0
        for i in range(1,len(h)):
            p = h[i]/100
            v1,v2 = intervals[i]
            sum_i1 += p*(v2**3 - v1**3)/(3*(v2-v1))
            sum_i2 += p*(v2**2 - v1**2)/(2*(v2-v1))
            zl_std = np.sqrt(sum_i1 - sum_i2**2)
    else:
            zl_std = float('nan')    
    return zl_std



In [267]:

    
def Hist_std(h):
    if not all(h==0):
        bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])  
        orig_hist = np.array(h).astype(float)
        norm_hist = orig_hist/float(sum(orig_hist))
        mid_points = (bins[1:] + bins[:-1])/2
        MeanCrude = np.dot(norm_hist,mid_points)
        VarCrude  = np.dot(norm_hist,(mid_points-MeanCrude)**2)
        bin_widths = np.diff(bins)
        BinWidth = bin_widths.mean()
        VarSheppard = VarCrude - (BinWidth**2)/12       #variance, Sheppard's correction
        hist_std = np.sqrt(VarSheppard)
    else:
        hist_std = float('nan')
    return hist_std

bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])
orig_hist = np.array(h).astype(float) norm_hist = orig_hist/float(sum(orig_hist)) mid_points = (bins[1:] + bins[:-1])/2 bin_widths = np.diff(bins)

MeanCrude = np.dot(norm_hist,mid_points) MeanCrude

VarCrude = np.dot(norm_hist,(mid_points-MeanCrude)**2) VarCrude

total = 0 for i in range(0,len(norm_hist)): total += norm_hist[i]*mid_points[i] total

for i in range(0,len(norm_hist)): tt[i] = (mid_points[i] - MeanCrude)**2 tt

VarSheppard = VarCrude - (BinWidth**2)/12
VarSheppard

MeanCrude = sum(Prob . CatMid); %crude mean and variance VarCrude = sum(Prob . (CatMid-MeanCrude).^2);

BinWidth = mean(diff(CatBounds(:))); VarSheppard = VarCrude - BinWidth^2/12; %variance, Sheppard's correction

def Entr(h): if not all(h==0): bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])
orig_hist = np.array(h).astype(float) norm_hist = orig_hist/float(sum(orig_hist))
bin_widths = np.diff(bins)

    np.dot(h,np.log(h/bin_widths))
    bin_density = h.astype(float) / np.dot(bin_widths, h)
    popt
else:
    popt = [float('nan'),float('nan')]
return popt[1]



In [268]:

    
mask = df.columns.str.contains(',')
mask









    Out[268]:





array([False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True], dtype=bool)



In [269]:

    
df.columns[mask]









    Out[269]:





Index(['[-2.0,-1.1]', '[-1.0,-0.6]', '[-0.5,-0.1]', '[0.0,0.4]', '[0.5,0.9]',
       '[1.0,1.4]', '[1.5,1.9]', '[2.0,2.4]', '[2.5,2.9]', '[3.0,3.4]',
       '[3.5,3.9]', '[4.0,5.0]'],
      dtype='object')

h = df.iloc[10,mask] bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0]) orig_hist = np.array(h).astype(float) norm_hist = orig_hist/float(sum(orig_hist)) mid_points = (bins[1:] + bins[:-1])/2 popt,pcov = opt.curve_fit(lambda x,mu,sig: stats.norm.pdf(x,mu,sig), mid_points,norm_hist) popt

for i in range(0,10):
print(i,fit_normal_to_hist(df.iloc[i,mask].astype(float)))



In [270]:

    
df['GA_std'] = df.iloc[:,mask].apply(fit_normal_to_hist,axis=1)
df['ZL_std'] = df.iloc[:,mask].apply(ZL_std,axis=1)
df['Hist_std'] = df.iloc[:,mask].apply(Hist_std,axis=1)



In [271]:

    
df.head(10)









    Out[271]:






  
    
      
      source
      target
      id
      point
      [-2.0,-1.1]
      [-1.0,-0.6]
      [-0.5,-0.1]
      [0.0,0.4]
      [0.5,0.9]
      [1.0,1.4]
      [1.5,1.9]
      [2.0,2.4]
      [2.5,2.9]
      [3.0,3.4]
      [3.5,3.9]
      [4.0,5.0]
      GA_std
      ZL_std
      Hist_std
    
  
  
    
      0
      2014-Q4
      2014
      1
      0.4
      0
      0.0
      0
      50
      50
      0.0
      0
      0
      0
      0
      0
      0
      0.541179
      0.275379
      0.184780
    
    
      1
      2014-Q4
      2014
      2
      0.5
      0
      0.0
      0
      40
      60
      0.0
      0
      0
      0
      0
      0
      0
      0.524163
      0.270801
      0.177886
    
    
      2
      2014-Q4
      2014
      3
      0.5
      0
      2.5
      5
      45
      45
      2.5
      0
      0
      0
      0
      0
      0
      0.687504
      0.385141
      0.326563
    
    
      3
      2014-Q4
      2014
      6
      0.5
      0
      0.0
      0
      10
      90
      0.0
      0
      0
      0
      0
      0
      0
      0.337515
      0.189297
      NaN
    
    
      4
      2014-Q4
      2014
      7
      0.8
      0
      0.0
      0
      0
      0
      0.0
      0
      0
      0
      0
      0
      0
      NaN
      NaN
      NaN
    
    
      5
      2014-Q4
      2014
      14
      0.4
      0
      0.0
      0
      0
      0
      0.0
      0
      0
      0
      0
      0
      0
      NaN
      NaN
      NaN
    
    
      6
      2014-Q4
      2014
      15
      0.5
      0
      0.0
      0
      10
      90
      0.0
      0
      0
      0
      0
      0
      0
      0.337515
      0.189297
      NaN
    
    
      7
      2014-Q4
      2014
      16
      0.6
      1
      2.0
      3
      25
      45
      16.0
      5
      2
      1
      0
      0
      0
      1.109188
      0.574853
      0.577938
    
    
      8
      2014-Q4
      2014
      20
      0.5
      0
      0.0
      0
      35
      50
      15.0
      0
      0
      0
      0
      0
      0
      0.693979
      0.358236
      0.294353
    
    
      9
      2014-Q4
      2014
      23
      0.5
      0
      0.0
      0
      25
      70
      5.0
      0
      0
      0
      0
      0
      0
      0.490474
      0.279881
      0.191425



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
dfList = []
writer = pd.ExcelWriter(out_data + 'PointForecasts.xlsx')

years = [2014,2015,2016]
quarters = [1,2,3,4]
for year in years:
    for q in quarters:
        f = str(year) + 'Q' + str(q)
        fname = f + '.csv'
        if os.path.isfile(raw_data_path + '\\' + fname):
            raw_df = pd.read_csv(raw_data_path + '\\' + fname,header = True)
            # find the row where the growth expectations start
            dum = raw_df[raw_df['TARGET_PERIOD'] == 'GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP'].index[0]
            mask_columns = ~raw_df.columns.str.contains('Unnamed')
            df = raw_df.iloc[0:dum-1,[0,1,2]]    
            df['source'] = str(year) + '-Q' + str(q)
            df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point'})
            df = df[['source','target','id','point']]
            df['id'] = df['id'].astype('int')
            df['point'] = df['point'].astype('float32')
            df.to_excel(writer,f,index=False)
            dfList.append(df)  
writer.save()

	source	target	id	point	[-2.0,-1.1]	[-1.0,-0.6]	[-0.5,-0.1]	[0.0,0.4]	[0.5,0.9]	[1.0,1.4]	[1.5,1.9]	[2.0,2.4]	[2.5,2.9]	GA_std	ZL_std	Hist_std
0	2014-Q4	2014	1	0.4	0	0.0	0	50	50	0.0	0	0	0	0.541179	0.275379	0.184780
1	2014-Q4	2014	2	0.5	0	0.0	0	40	60	0.0	0	0	0	0.524163	0.270801	0.177886
2	2014-Q4	2014	3	0.5	0	2.5	5	45	45	2.5	0	0	0	0.687504	0.385141	0.326563
3	2014-Q4	2014	6	0.5	0	0.0	0	10	90	0.0	0	0	0	0.337515	0.189297	NaN
4	2014-Q4	2014	7	0.8	0	0.0	0	0	0	0.0	0	0	0	NaN	NaN	NaN
5	2014-Q4	2014	14	0.4	0	0.0	0	0	0	0.0	0	0	0	NaN	NaN	NaN
6	2014-Q4	2014	15	0.5	0	0.0	0	10	90	0.0	0	0	0	0.337515	0.189297	NaN
7	2014-Q4	2014	16	0.6	1	2.0	3	25	45	16.0	5	2	1	1.109188	0.574853	0.577938
8	2014-Q4	2014	20	0.5	0	0.0	0	35	50	15.0	0	0	0	0.693979	0.358236	0.294353
9	2014-Q4	2014	23	0.5	0	0.0	0	25	70	5.0	0	0	0	0.490474	0.279881	0.191425