develop-clean-HICP



In [85]:
# this notebook is to develop and test a .py script (function) that takes raw data csv files and makes new and clean
# csv files only for inflation (GDP)

In [98]:
import csv
import pandas as pd
import numpy as np
import os

In [107]:
data_dir = "../data/"
year = 2016
quarter = 1
months = ['Dec','Mar','Jun','Sep']
# raw_file = "2016Q1.csv"

In [108]:
raw_file = str(year) + 'Q' + str(quarter) + '.csv'
output_file = 'HICP' + raw_file
print(raw_file)
print(output_file)


2016Q1.csv
HICP2016Q1.csv

In [109]:
fname = data_dir + raw_file
print(fname)

not os.path.isfile(fname)


../data/2016Q1.csv
Out[109]:
False

In [110]:
fname


Out[110]:
'../data/2016Q1.csv'

In [111]:
if not os.path.isfile(fname):
    print('File ' + fname + 'is not available')
else:
    raw_df = pd.read_csv(fname,header = 1)
raw_df.head()

# find the row where the growth expectations start
dum = raw_df[raw_df['TARGET_PERIOD'] == 'GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP'].index[0]
mask_columns = ~raw_df.columns.str.contains('Unnamed')
df = raw_df.iloc[0:dum-1,mask_columns]    
df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                   'TN1_0':'[-2.0,-1.1]','FN1_0TN0_6':'[-1.0,-0.6]',
                                   'FN0_5TN0_1':'[-0.5,-0.1]','F0_0T0_4':'[0.0,0.4]',
                                   'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
                                   'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
                                   'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
                                   'F3_5T3_9':'[3.5,3.9]','F4_0':'[4.0,5.0]'})

# remove rows where point is missing
maskNaN = df.point.isnull()
df = df[~maskNaN]                    
df.fillna(0,inplace = True)
for colname in df.columns[3:]:
    df[colname] = df[colname].astype('float')

# create a new target column 
mask_t0 = str(year)
mask_t1 = str(year+1)
mask_t2 = str(year+2)
if quarter<3:
    mask_t4or5 = str(year+4)
else:
    mask_t4or5 = str(year+5)
if quarter==1:
    mask_Rt1 = str(year) + months[quarter-1]
    mask_Rt2 = str(year+1) + months[quarter-1]
else:
    mask_Rt1 = str(year+1) + months[quarter-1]
    mask_Rt2 = str(year+2) + months[quarter-1]
                #
df.loc[df.loc[:,'target'] == mask_t0,'targetNew'] = 't'
df.loc[df.loc[:,'target'] == mask_t1,'targetNew'] = 't+1'
df.loc[df.loc[:,'target'] == mask_t2,'targetNew'] = 't+2'
df.loc[df.loc[:,'target'] == mask_t4or5,'targetNew'] = 't+4'
df.loc[df.loc[:,'target'] == mask_Rt1,'targetNew'] = 'roll 1'
df.loc[df.loc[:,'target'] == mask_Rt2,'targetNew'] = 'roll 2'

df['source'] = str(year) + '-Q' + str(quarter)

df = df[['source','target','targetNew','id','point','[-2.0,-1.1]','[-1.0,-0.6]','[-0.5,-0.1]',
         '[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
         '[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]

In [112]:
df.head()


Out[112]:
source target targetNew id point [-2.0,-1.1] [-1.0,-0.6] [-0.5,-0.1] [0.0,0.4] [0.5,0.9] [1.0,1.4] [1.5,1.9] [2.0,2.4] [2.5,2.9] [3.0,3.4] [3.5,3.9] [4.0,5.0]
0 2016-Q1 2016 t 1 .3 0 0 20 60 20 0 0 0 0 0 0 0
1 2016-Q1 2016 t 2 .3 0 2 8 43 37 8 2 0 0 0 0 0
2 2016-Q1 2016 t 4 .9 0 5 10 25 45 10 5 0 0 0 0 0
3 2016-Q1 2016 t 5 .9 0 0 5 20 30 30 10 5 0 0 0 0
4 2016-Q1 2016 t 6 .6 0 0 5 25 45 25 0 0 0 0 0 0

In [116]:
df.to_csv('tmp.csv',index=False)

In [118]:
tt = pd.read_csv('tmp.csv')
tt


Out[118]:
source target targetNew id point [-2.0,-1.1] [-1.0,-0.6] [-0.5,-0.1] [0.0,0.4] [0.5,0.9] [1.0,1.4] [1.5,1.9] [2.0,2.4] [2.5,2.9] [3.0,3.4] [3.5,3.9] [4.0,5.0]
0 2016-Q1 2016 t 1 0.3000 0.000000 0.000000 20.000000 60.000000 20.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
1 2016-Q1 2016 t 2 0.3000 0.000000 2.000000 8.000000 43.000000 37.000000 8.000000 2.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
2 2016-Q1 2016 t 4 0.9000 0.000000 5.000000 10.000000 25.000000 45.000000 10.000000 5.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
3 2016-Q1 2016 t 5 0.9000 0.000000 0.000000 5.000000 20.000000 30.000000 30.000000 10.000000 5.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
4 2016-Q1 2016 t 6 0.6000 0.000000 0.000000 5.000000 25.000000 45.000000 25.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
5 2016-Q1 2016 t 14 0.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
6 2016-Q1 2016 t 15 1.2000 0.000000 0.000000 5.000000 5.000000 15.000000 40.000000 30.000000 5.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
7 2016-Q1 2016 t 16 0.7000 1.000000 2.000000 7.000000 15.000000 18.000000 16.000000 14.000000 11.000000 9.000000 4.000000 2.000000e+00 1.000000e+00
8 2016-Q1 2016 t 20 0.7000 0.000000 0.000000 5.000000 25.000000 45.000000 25.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
9 2016-Q1 2016 t 22 0.7000 0.000000 0.000000 5.000000 25.000000 40.000000 25.000000 5.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
10 2016-Q1 2016 t 23 0.6000 0.000000 1.000000 10.000000 25.000000 30.000000 23.000000 10.000000 1.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
11 2016-Q1 2016 t 24 0.9400 0.000000 0.000000 5.000000 18.000000 30.000000 25.000000 22.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
12 2016-Q1 2016 t 26 0.3200 0.000000 0.000000 20.000000 45.000000 30.000000 5.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
13 2016-Q1 2016 t 29 0.3000 0.000000 0.000000 20.000000 60.000000 20.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
14 2016-Q1 2016 t 31 1.1000 0.000000 0.000000 5.000000 10.000000 35.000000 40.000000 10.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
15 2016-Q1 2016 t 32 0.9000 0.001294 0.064923 1.704751 14.127882 38.421113 34.473543 10.196841 0.979459 0.029909 0.000284 8.295000e-07 7.000000e-10
16 2016-Q1 2016 t 35 0.9000 0.000000 0.000000 0.000000 20.000000 40.000000 30.000000 10.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
17 2016-Q1 2016 t 37 0.6000 0.000000 0.000000 2.000000 34.000000 55.000000 9.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
18 2016-Q1 2016 t 38 1.0000 0.000000 0.000000 0.000000 0.000000 20.000000 60.000000 20.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
19 2016-Q1 2016 t 39 0.6000 0.000000 0.000000 0.000000 25.000000 40.000000 20.000000 15.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
20 2016-Q1 2016 t 41 1.0000 0.000000 0.000000 0.000000 0.000000 20.000000 70.000000 10.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
21 2016-Q1 2016 t 45 0.8500 0.000000 0.000000 0.000000 15.000000 50.000000 25.000000 10.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
22 2016-Q1 2016 t 47 0.9000 0.000000 5.000000 10.000000 25.000000 45.000000 10.000000 5.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
23 2016-Q1 2016 t 48 0.9000 0.000000 0.000000 0.000000 5.000000 50.000000 45.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
24 2016-Q1 2016 t 49 0.7000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
25 2016-Q1 2016 t 52 0.5000 0.000000 0.000000 9.000000 34.000000 38.000000 15.000000 3.000000 1.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
26 2016-Q1 2016 t 54 0.9000 0.000000 0.000000 5.000000 10.000000 50.000000 30.000000 5.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
27 2016-Q1 2016 t 57 1.1000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
28 2016-Q1 2016 t 61 1.0000 0.000000 0.000000 0.000000 5.000000 40.000000 40.000000 15.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
29 2016-Q1 2016 t 63 0.5000 0.000000 0.000000 10.000000 30.000000 50.000000 10.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
262 2016-Q1 2020 t+4 37 2.2000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 34.000000 40.000000 23.000000 1.000000 0.000000e+00 0.000000e+00
263 2016-Q1 2020 t+4 39 1.9000 0.000000 0.000000 0.000000 5.000000 20.000000 27.000000 33.000000 15.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
264 2016-Q1 2020 t+4 45 1.6000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
265 2016-Q1 2020 t+4 47 1.9000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
266 2016-Q1 2020 t+4 48 1.5000 0.000000 0.000000 0.000000 0.000000 0.000000 50.000000 50.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
267 2016-Q1 2020 t+4 52 1.0000 0.000000 1.000000 8.000000 14.000000 19.000000 20.000000 16.000000 11.000000 7.000000 3.000000 1.000000e+00 0.000000e+00
268 2016-Q1 2020 t+4 54 2.0000 0.000000 0.000000 0.000000 0.000000 0.000000 15.000000 40.000000 40.000000 5.000000 0.000000 0.000000e+00 0.000000e+00
269 2016-Q1 2020 t+4 57 2.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
270 2016-Q1 2020 t+4 63 1.8000 0.000000 0.000000 0.000000 5.000000 10.000000 30.000000 40.000000 10.000000 5.000000 0.000000 0.000000e+00 0.000000e+00
271 2016-Q1 2020 t+4 68 1.9000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
272 2016-Q1 2020 t+4 70 1.8000 0.000000 0.000000 0.000000 0.000000 0.000000 35.000000 65.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
273 2016-Q1 2020 t+4 73 1.8000 0.000000 0.000000 0.000000 5.000000 20.000000 35.000000 35.000000 5.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
274 2016-Q1 2020 t+4 80 1.5000 0.343643 0.343643 4.467354 12.027491 16.494845 24.054983 24.054983 11.340206 5.154639 1.718213 0.000000e+00 0.000000e+00
275 2016-Q1 2020 t+4 84 1.9000 3.000000 2.000000 3.000000 6.000000 10.000000 14.000000 20.000000 19.000000 15.000000 5.000000 2.000000e+00 1.000000e+00
276 2016-Q1 2020 t+4 85 1.9500 0.114421 0.424194 1.479607 4.038854 8.628830 14.430063 18.890151 19.358359 15.529909 9.752687 4.794120e+00 2.558806e+00
277 2016-Q1 2020 t+4 88 1.8500 0.000000 0.000000 2.000000 7.500000 11.000000 18.000000 23.000000 18.000000 11.000000 7.500000 2.000000e+00 0.000000e+00
278 2016-Q1 2020 t+4 89 2.0000 0.000000 0.000000 0.000000 2.500000 10.000000 15.000000 25.000000 20.000000 15.000000 7.500000 2.500000e+00 2.500000e+00
279 2016-Q1 2020 t+4 91 1.7000 0.000000 0.000000 2.500000 5.000000 17.500000 25.000000 30.000000 12.500000 5.000000 2.500000 0.000000e+00 0.000000e+00
280 2016-Q1 2020 t+4 93 2.0000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
281 2016-Q1 2020 t+4 94 1.7000 0.500000 1.000000 3.000000 4.500000 8.000000 16.000000 32.000000 16.000000 8.000000 6.000000 3.000000e+00 2.000000e+00
282 2016-Q1 2020 t+4 95 1.7685 0.000000 0.000000 0.000000 0.000000 7.000000 17.000000 42.600000 22.100000 11.300000 0.000000 0.000000e+00 0.000000e+00
283 2016-Q1 2020 t+4 96 1.5000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
284 2016-Q1 2020 t+4 98 2.0000 0.000000 0.000000 0.000000 0.000000 0.000000 15.000000 35.000000 35.000000 15.000000 0.000000 0.000000e+00 0.000000e+00
285 2016-Q1 2020 t+4 99 1.7000 0.000000 0.000000 0.000000 0.000000 0.000000 30.000000 70.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
286 2016-Q1 2020 t+4 101 1.8000 0.000000 0.000000 5.000000 7.000000 15.000000 25.000000 35.000000 10.000000 3.000000 0.000000 0.000000e+00 0.000000e+00
287 2016-Q1 2020 t+4 102 1.4000 0.000000 0.000000 0.000000 5.000000 15.000000 60.000000 20.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
288 2016-Q1 2020 t+4 103 1.8000 0.000000 0.000000 0.000000 4.878049 12.195122 19.512195 29.268293 19.512195 12.195122 2.439024 0.000000e+00 0.000000e+00
289 2016-Q1 2020 t+4 105 2.2000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 20.000000 80.000000 0.000000 0.000000 0.000000e+00 0.000000e+00
290 2016-Q1 2020 t+4 112 2.0000 0.000000 0.000000 0.000000 0.000000 5.000000 10.000000 20.000000 30.000000 25.000000 10.000000 0.000000e+00 0.000000e+00
291 2016-Q1 2020 t+4 114 1.5000 0.000000 1.000000 3.000000 8.000000 10.000000 16.000000 40.000000 10.000000 5.000000 5.000000 2.000000e+00 0.000000e+00

292 rows × 17 columns


In [ ]:


In [ ]:
def parse(raw_file):
    ''' Parses a row SPF to a dictionary
    '''
    # get the year and quarter to use in naming the sourse
    year = raw_file.split('Q')[-2][-4:]
    quarter = raw_file.split('Q')[-1][0]

    # open the CSV file
    opened_file = open(data_dir + raw_file)
    # Read the CSV file data
    csv_data = csv.reader(opened_file, delimiter=',')

    # Setup an empty list
    parsed_data = []

    # Skip over the first line and save the second as headers
    next(csv_data)
    fields = next(csv_data)
    # remove empty elements of the fields
    fields = [item for item in fields if item != '']

# Iterate over each row of the csv file, zip together field -> value
# BREAK when growth forecestast are reached and remove previous row (empty)

    for row in csv_data:
        if "GROWTH" in row[0]:
            break
        row = get_row(row)
        parsed_data.append(dict(zip(fields, row)))

    del parsed_data[len(parsed_data)-1]

# Close the CSV file
    opened_file.close()

# return data as dictionary
# return parsed_data, fields


#def get_data_frame(parsed_data, fields):
#    ''' takes a dictionary with individual histograms and returns
#        a data frame
#    '''

    df = pd.DataFrame(parsed_data, columns=fields, dtype=float)

# make a column to keep the source (name of the raw data file)

    df['source'] = str(year) + '-Q' + str(quarter)

# rename columns
    df = df.rename(columns={'TARGET_PERIOD': 'target', 'FCT_SOURCE': 'id',
                            'POINT': 'point', 'TN1_0': '[-2.0,-1.1]',
                            'FN1_0TN0_6': '[-1.0,-0.6]', 'FN0_5TN0_1':
                            '[-0.5,-0.1]', 'F0_0T0_4': '[0.0,0.4]', 'F0_5T0_9':
                            '[0.5,0.9]', 'F1_0T1_4': '[1.0,1.4]', 'F1_5T1_9':
                            '[1.5,1.9]', 'F2_0T2_4': '[2.0,2.4]', 'F2_5T2_9':
                            '[2.5,2.9]', 'F3_0T3_4': '[3.0,3.4]', 'F3_5T3_9':
                            '[3.5,3.9]', 'F4_0': '[4.0,5.0]'})

# put the columns in the right order
    df = df[['source', 'target', 'id', 'point', '[-2.0,-1.1]', '[-1.0,-0.6]',
             '[-0.5,-0.1]', '[0.0,0.4]', '[0.5,0.9]', '[1.0,1.4]', '[1.5,1.9]',
             '[2.0,2.4]', '[2.5,2.9]', '[3.0,3.4]', '[3.5,3.9]', '[4.0,5.0]']]

# remove rows where point forecast is missing
    maskNaN = df.point.isnull()
    df = df[~maskNaN]
    df.fillna(0, inplace=True)

# select columns of the histograms
    mask = df.columns.str.contains(',')
# check if the columns of the individual histograms sum up to 100
# remove rows which do not
    maskHist = df.loc[:, mask].apply(np.sum, axis=1)!= 0.0
    df = df.loc[maskHist, :]

    return df