In [1]:
import os
import pandas as pd
import scipy.stats as stats
import scipy.optimize as opt
import numpy as np
In [2]:
# Root folder holding the ECB SPF individual-forecast CSV files (one per survey round).
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR
# so the notebook runs on other machines.
raw_data_path = 'C:\\Users\\EEU227\\Documents\\IPython Notebooks\\DEEdata\\SPF_individual_forecasts'
In [3]:
# Create the output folder for cleaned data if it does not exist yet.
if not os.path.isdir(raw_data_path + '\\CleanData'):
os.mkdir(raw_data_path + '\\CleanData')
In [4]:
# Destination folder (with trailing separator) for all cleaned/exported files.
out_data = raw_data_path + '\\CleanData\\'
out_data
Out[4]:
In [264]:
years = [2014]
quarters = [4]
for year in years:
for q in quarters:
f = str(year) + 'Q' + str(q)
fname = f + '.csv'
if os.path.isfile(raw_data_path + '\\' + fname):
raw_df = pd.read_csv(raw_data_path + '\\' + fname,header = True)
# find the row where the growth expectations start
dum = raw_df[raw_df['TARGET_PERIOD'] == 'GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP'].index[0]
mask_columns = ~raw_df.columns.str.contains('Unnamed')
df = raw_df.iloc[0:dum-1,mask_columns]
df['source'] = str(year) + '-Q' + str(q)
df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
'TN1_0':'[-2.0,-1.1]','FN1_0TN0_6':'[-1.0,-0.6]',
'FN0_5TN0_1':'[-0.5,-0.1]','F0_0T0_4':'[0.0,0.4]',
'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
'F3_5T3_9':'[3.5,3.9]','F4_0':'[4.0,5.0]'})
df = df[['source','target','id','point','[-2.0,-1.1]','[-1.0,-0.6]','[-0.5,-0.1]',
'[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
'[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]
df.fillna(0,inplace = True)
for colname in df.columns[3:]:
df[colname] = df[colname].astype('float')
intervals =[[-2.0,-1.1],[-1.0,-0.6],[-0.5,-0.1],[0.0,0.4],[0.5,0.9],[1.0,1.4],
[1.5,1.9],[2.0,2.4],[2.5,2.9],[3.0,3.4],[3.5,3.9],[4.0,5.0]]
In [265]:
def fit_normal_to_hist(h):
if not all(h==0):
bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])
orig_hist = np.array(h).astype(float)
norm_hist = orig_hist/float(sum(orig_hist))
mid_points = (bins[1:] + bins[:-1])/2
popt,pcov = opt.curve_fit(lambda x,mu,sig: stats.norm.pdf(x,mu,sig), mid_points,norm_hist)
else:
popt = [float('nan'),float('nan')]
return popt[1]
In [266]:
def ZL_std(h):
intervals =[[-2.0,-1.1],[-1.0,-0.6],[-0.5,-0.1],[0.0,0.4],[0.5,0.9],[1.0,1.4],
[1.5,1.9],[2.0,2.4],[2.5,2.9],[3.0,3.4],[3.5,3.9],[4.0,5.0]]
if not all(h==0):
sum_i1 = 0
sum_i2 = 0
for i in range(1,len(h)):
p = h[i]/100
v1,v2 = intervals[i]
sum_i1 += p*(v2**3 - v1**3)/(3*(v2-v1))
sum_i2 += p*(v2**2 - v1**2)/(2*(v2-v1))
zl_std = np.sqrt(sum_i1 - sum_i2**2)
else:
zl_std = float('nan')
return zl_std
In [267]:
def Hist_std(h):
if not all(h==0):
bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])
orig_hist = np.array(h).astype(float)
norm_hist = orig_hist/float(sum(orig_hist))
mid_points = (bins[1:] + bins[:-1])/2
MeanCrude = np.dot(norm_hist,mid_points)
VarCrude = np.dot(norm_hist,(mid_points-MeanCrude)**2)
bin_widths = np.diff(bins)
BinWidth = bin_widths.mean()
VarSheppard = VarCrude - (BinWidth**2)/12 #variance, Sheppard's correction
hist_std = np.sqrt(VarSheppard)
else:
hist_std = float('nan')
return hist_std
# ---------------------------------------------------------------------------
# NOTE(review): the lines below were exploratory scratch work left in the
# notebook. They referenced undefined names (h, tt, BinWidth, Prob, CatMid,
# CatBounds), jammed several statements onto single lines (invalid Python),
# and included pasted MATLAB reference code. Commented out to keep the file
# runnable; preserved verbatim for reference — safe to delete.
# ---------------------------------------------------------------------------
# bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])
# orig_hist = np.array(h).astype(float)
# norm_hist = orig_hist/float(sum(orig_hist))
# mid_points = (bins[1:] + bins[:-1])/2
# bin_widths = np.diff(bins)
# MeanCrude = np.dot(norm_hist,mid_points) MeanCrude
# VarCrude = np.dot(norm_hist,(mid_points-MeanCrude)**2) VarCrude
# total = 0 for i in range(0,len(norm_hist)): total += norm_hist[i]*mid_points[i] total
# for i in range(0,len(norm_hist)): tt[i] = (mid_points[i] - MeanCrude)**2 tt
# VarSheppard = VarCrude - (BinWidth**2)/12
# VarSheppard
# (pasted MATLAB:)
# MeanCrude = sum(Prob . CatMid); %crude mean and variance VarCrude = sum(Prob . (CatMid-MeanCrude).^2);
# BinWidth = mean(diff(CatBounds(:))); VarSheppard = VarCrude - BinWidth^2/12; %variance, Sheppard's correction
def Entr(h):
    """Differential entropy (in nats) of the piecewise-uniform density implied
    by the 12-bin histogram, or NaN when the histogram is all zeros.

    NOTE(review): the original body was unfinished — it computed intermediates,
    discarded them, and returned `popt[1]` with `popt` undefined in the
    non-empty branch (guaranteed NameError). This implements the entropy the
    intermediate lines were evidently building:
        -sum_i p_i * log(p_i / width_i)  over bins with positive mass,
    where p_i are normalised bin probabilities and width_i the bin widths.
    Confirm this matches the intended definition before relying on it.
    """
    h = np.asarray(h, dtype=float)
    if not h.any():
        return float('nan')
    bins = np.array([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 5.0])
    norm_hist = h / h.sum()
    bin_widths = np.diff(bins)
    # Density within bin i is p_i / width_i; empty bins contribute zero
    # (lim p->0 of p*log(p) = 0), so exclude them to avoid log(0).
    pos = norm_hist > 0
    return float(-np.sum(norm_hist[pos] * np.log(norm_hist[pos] / bin_widths[pos])))
In [268]:
mask = df.columns.str.contains(',')
mask
Out[268]:
In [269]:
df.columns[mask]
Out[269]:
h = df.iloc[10,mask] bins =np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0]) orig_hist = np.array(h).astype(float) norm_hist = orig_hist/float(sum(orig_hist)) mid_points = (bins[1:] + bins[:-1])/2 popt,pcov = opt.curve_fit(lambda x,mu,sig: stats.norm.pdf(x,mu,sig), mid_points,norm_hist) popt
for i in range(0,10):
print(i,fit_normal_to_hist(df.iloc[i,mask].astype(float)))
In [270]:
# Attach the three alternative uncertainty measures per respondent, each
# computed row-wise from the probability-bin columns selected by `mask`:
# GA_std (fitted-normal), ZL_std (piecewise-uniform), Hist_std (Sheppard).
df['GA_std'] = df.iloc[:,mask].apply(fit_normal_to_hist,axis=1)
df['ZL_std'] = df.iloc[:,mask].apply(ZL_std,axis=1)
df['Hist_std'] = df.iloc[:,mask].apply(Hist_std,axis=1)
In [271]:
# Inspect the first ten rows with the new columns.
df.head(10)
Out[271]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Export the point forecasts of every available survey round (2014-2016) to a
# single Excel workbook, one sheet per round, and collect the frames in dfList.
dfList = []
writer = pd.ExcelWriter(out_data + 'PointForecasts.xlsx')
years = [2014, 2015, 2016]
quarters = [1, 2, 3, 4]
for year in years:
    for q in quarters:
        f = str(year) + 'Q' + str(q)
        fname = f + '.csv'
        if os.path.isfile(raw_data_path + '\\' + fname):
            # NOTE(review): the original passed header=True, which old pandas
            # silently interpreted as header=1 and modern pandas rejects
            # outright; header=1 makes the implicit behaviour explicit.
            raw_df = pd.read_csv(raw_data_path + '\\' + fname, header=1)
            # find the row where the growth expectations start
            dum = raw_df[raw_df['TARGET_PERIOD'] == 'GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP'].index[0]
            # First three columns: target period, forecaster id, point forecast.
            # .copy() avoids SettingWithCopy warnings on the assignments below.
            df = raw_df.iloc[0:dum-1, [0, 1, 2]].copy()
            df['source'] = str(year) + '-Q' + str(q)
            df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point'})
            df = df[['source','target','id','point']]
            df['id'] = df['id'].astype('int')
            df['point'] = df['point'].astype('float32')
            df.to_excel(writer, f, index=False)
            dfList.append(df)
# NOTE(review): ExcelWriter.save() is removed in pandas >= 2.0; use
# writer.close() (or a `with` block) if the pandas version is upgraded.
writer.save()