In [1]:
"""
This NB is used to develop the .py script that makes csv files with
individual uncertainty in ../output/indiv_uncert
"""
import os
import pandas as pd
import scipy.stats as stats
import scipy.optimize as opt
import numpy as np
In [42]:
input_dir = "../clean_data/"
series = 'HICP'  # alternatively 'UN'
year = '2014'
quarter = 'Q4'
input_file = series + year + quarter + '.csv'
fname = input_dir + input_file
print(input_file)
In [43]:
if not os.path.isfile(fname):
    print(input_file + ' does not exist in ' + input_dir)
else:
    print("it's all good!")
In [44]:
df = pd.read_csv(fname)
df.head()
Out[44]:
In [45]:
df.columns[5:]
Out[45]:
In [ ]:
if df.shape[1] == 17:    # HICP: assumed 5 leading id columns + 12 bins (cf. df.iloc[0,5:] below)
    df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                            'TN1_0':'[-2.0,-1.1]',
                            'FN1_0TN0_6':'[-1.0,-0.6]',
                            'FN0_5TN0_1':'[-0.5,-0.1]',
                            'F0_0T0_4':'[0.0,0.4]',
                            'F0_5T0_9':'[0.5,0.9]',
                            'F1_0T1_4':'[1.0,1.4]',
                            'F1_5T1_9':'[1.5,1.9]',
                            'F2_0T2_4':'[2.0,2.4]',
                            'F2_5T2_9':'[2.5,2.9]',
                            'F3_0T3_4':'[3.0,3.4]',
                            'F3_5T3_9':'[3.5,3.9]',
                            'F4_0':'[4.0,5.0]'})
elif df.shape[1] == 23:  # UNEMP (19 bins)
    df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                            'T6_5':'[5.5,6.4]',
                            'F6_5T6_9':'[6.5,6.9]',
                            'F7_0T7_4':'[7.0,7.4]',
                            'F7_5T7_9':'[7.5,7.9]',
                            'F8_0T8_4':'[8.0,8.4]',
                            'F8_5T8_9':'[8.5,8.9]',
                            'F9_0T9_4':'[9.0,9.4]',
                            'F9_5T9_9':'[9.5,9.9]',
                            'F10_0T10_4':'[10.0,10.4]',
                            'F10_5T10_9':'[10.5,10.9]',
                            'F11_0T11_4':'[11.0,11.4]',
                            'F11_5T11_9':'[11.5,11.9]',
                            'F12_0T12_4':'[12.0,12.4]',
                            'F12_5T12_9':'[12.5,12.9]',
                            'F13_0T13_4':'[13.0,13.4]',
                            'F13_5T13_9':'[13.5,13.9]',
                            'F14_0T14_4':'[14.0,14.4]',
                            'F14_5T14_9':'[14.5,14.9]',
                            'F15_0':'[15.0,15.9]'})
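In [ ]:
# Quick check (illustrative, not part of the pipeline): after the rename,
# every histogram column label should contain a comma, which is what the
# column selection further below relies on.
print([c for c in df.columns if ',' in c])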
In [46]:
def fit_normal_to_hist(h):
    """Fit a normal pdf to the histogram mid-points and return the fitted sigma."""
    if len(h) == 12:      # HICP bin edges
        bins = np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])
    elif len(h) == 19:    # UNEMP bin edges
        bins = np.array([5.5,6.5,7.0,7.5,8.0,8.5,9.0,9.5,10.0,10.5,11.0,11.5,12.0,
                         12.5,13.0,13.5,14.0,14.5,15.0,16.0])
    if not all(h == 0):
        orig_hist = np.array(h).astype(float)
        norm_hist = orig_hist/float(np.sum(orig_hist))   # normalise to probabilities
        mid_points = (bins[1:] + bins[:-1])/2
        popt, pcov = opt.curve_fit(lambda x, mu, sig: stats.norm.pdf(x, mu, sig),
                                   mid_points, norm_hist)
    else:
        popt = [float('nan'), float('nan')]              # empty histogram -> NaN
    return popt[1]                                       # fitted standard deviation
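In [ ]:
# Illustrative sanity check (not part of the pipeline): an all-zero histogram
# should give NaN, while a unimodal one should give a finite fitted sigma.
print(fit_normal_to_hist(np.zeros(12)))
print(fit_normal_to_hist(np.array([0,0,0,5,20,35,25,10,5,0,0,0], dtype=float)))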
In [47]:
def ZL_std(h):
    """ZL std: treat the histogram as a mixture of uniforms over the intervals."""
    if len(h) == 12:      # HICP intervals
        intervals = [[-2.0,-1.1],[-1.0,-0.6],[-0.5,-0.1],[0.0,0.4],[0.5,0.9],
                     [1.0,1.4],[1.5,1.9],[2.0,2.4],[2.5,2.9],[3.0,3.4],
                     [3.5,3.9],[4.0,5.0]]
    elif len(h) == 19:    # UNEMP intervals
        intervals = [[5.5,6.4],[6.5,6.9],[7.0,7.4],[7.5,7.9],[8.0,8.4],[8.5,8.9],
                     [9.0,9.4],[9.5,9.9],[10.0,10.4],[10.5,10.9],[11.0,11.4],
                     [11.5,11.9],[12.0,12.4],[12.5,12.9],[13.0,13.4],[13.5,13.9],
                     [14.0,14.4],[14.5,14.9],[15.0,15.9]]
    if not all(h == 0):
        sum_i1 = 0                    # accumulates E[X^2]
        sum_i2 = 0                    # accumulates E[X]
        for i in range(len(h)):       # every bin contributes
            p = h[i]/100              # probabilities are reported in percent
            v1, v2 = intervals[i]
            sum_i1 += p*(v2**3 - v1**3)/(3*(v2 - v1))
            sum_i2 += p*(v2**2 - v1**2)/(2*(v2 - v1))
        zl_std = np.sqrt(sum_i1 - sum_i2**2)
    else:
        zl_std = float('nan')
    return zl_std
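In [ ]:
# Illustrative check: with all mass in a single interval, ZL_std reduces to
# the std of a uniform on that interval, i.e. width/sqrt(12). For [0.0,0.4]
# that is about 0.115.
h_check = np.zeros(12)
h_check[3] = 100.0          # 100% in the [0.0,0.4] bin
print(ZL_std(h_check), 0.4/np.sqrt(12))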
In [48]:
def Hist_std(h):
    """Grouped-data std with Sheppard's correction for binning."""
    if not all(h == 0):
        if len(h) == 12:      # HICP bin edges
            bins = np.array([-2.0,-1.0,-0.5,0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,5.0])
        elif len(h) == 19:    # UNEMP bin edges
            bins = np.array([5.5,6.5,7.0,7.5,8.0,8.5,9.0,9.5,10.0,10.5,11.0,11.5,
                             12.0,12.5,13.0,13.5,14.0,14.5,15.0,16.0])
        orig_hist = np.array(h).astype(float)
        norm_hist = orig_hist/float(np.sum(orig_hist))
        mid_points = (bins[1:] + bins[:-1])/2
        MeanCrude = np.dot(norm_hist, mid_points)
        VarCrude = np.dot(norm_hist, (mid_points - MeanCrude)**2)
        bin_widths = np.diff(bins)
        BinWidth = bin_widths.mean()                 # average width (bins are unequal)
        VarSheppard = VarCrude - (BinWidth**2)/12    # variance, Sheppard's correction
        if VarSheppard > 0:
            hist_std = np.sqrt(VarSheppard)
        else:
            hist_std = np.sqrt(VarCrude)             # fall back if correction overshoots
    else:
        hist_std = float('nan')
    return hist_std
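In [ ]:
# Illustrative comparison on one synthetic histogram: the three uncertainty
# measures should all be finite and broadly similar in magnitude.
h_check = np.array([0,0,0,5,20,35,25,10,5,0,0,0], dtype=float)
print(fit_normal_to_hist(h_check), ZL_std(h_check), Hist_std(h_check))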
In [49]:
h = df.iloc[0,5:].values
h
Out[49]:
In [28]:
yy = fit_normal_to_hist(h)
In [50]:
tt = ZL_std(h)
tt
Out[50]:
In [30]:
ss = Hist_std(h)
ss
Out[30]:
In [ ]:
df.iloc[:,5:].head()
In [ ]:
df.iloc[0,5:]
In [53]:
hist_cols = df.columns[df.columns.str.contains(',')]  # histogram interval columns; label-based, so it stays valid as columns are added below
hist_cols
Out[53]:
In [9]:
df['mean'] = df[hist_cols].apply(np.mean, axis=1)
In [54]:
df['GA_std'] = df[hist_cols].apply(fit_normal_to_hist, axis=1)  # sigma of fitted normal
In [55]:
df
Out[55]:
In [56]:
df['ZL_std'] = df[hist_cols].apply(ZL_std, axis=1)
In [57]:
df['Hist_std'] = df[hist_cols].apply(Hist_std, axis=1)
In [58]:
df.head()
Out[58]:
In [ ]:
df.to_csv('temp.csv')
In [37]:
# Define the aggregation calculations
aggregations = {
    'point': {
        '1.mean': 'mean',
        '2.std': 'std',
        '3.IQR': lambda x: x.quantile(.75) - x.quantile(.25)
    },
    'GA_std': {
        # 'mean': 'mean',
        'median': 'median'
    },
    'ZL_std': {
        # 'mean': 'mean',
        'median': 'median'
    },
    'Hist_std': {
        # 'mean': 'mean',
        'median': 'median'
    }
}
In [39]:
grouped = df.groupby(['source','newTarget']).agg(aggregations)
grouped = grouped.rename(columns={'point':'a. point','ZL_std':'b. ZL_std',
                                  'GA_std':'c. GA_std','Hist_std':'d. Hist_std'})
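In [ ]:
# Portability note (sketch): nested renaming dicts in .agg() were removed in
# pandas 1.0. On newer pandas the same statistics can be computed with named
# aggregation (this yields flat column labels rather than a MultiIndex):
# grouped = df.groupby(['source','newTarget']).agg(
#     point_mean=('point','mean'),
#     point_std=('point','std'),
#     point_IQR=('point', lambda x: x.quantile(.75) - x.quantile(.25)),
#     GA_std_median=('GA_std','median'),
#     ZL_std_median=('ZL_std','median'),
#     Hist_std_median=('Hist_std','median'))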
In [40]:
df.head()
Out[40]:
In [ ]:
out_data = '../output/indiv_uncert/'  # output folder per the docstring above
writer = pd.ExcelWriter(out_data + 'SPF_time_series.xlsx')
In [ ]:
for h in df.loc[:,'newTarget'].unique():   # same key as the groupby above
    df_h = grouped.xs(h, level=1).sort_index(level=0, axis=1)
    # print(df_h)
    df_h.to_excel(writer, h)
writer.save()
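In [ ]:
# Sketch for newer pandas (>= 2.0), where writer.save() no longer exists and
# ExcelWriter is best used as a context manager:
# with pd.ExcelWriter(out_data + 'SPF_time_series.xlsx') as writer:
#     for h in df.loc[:,'newTarget'].unique():
#         grouped.xs(h, level=1).sort_index(level=0, axis=1).to_excel(writer, sheet_name=h)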