In [85]:
# this notebook is to develop and test a .py script (function) that takes raw data csv files and makes new and clean
# csv files only for inflation (GDP)
In [98]:
import csv
import pandas as pd
import numpy as np
import os
In [107]:
data_dir = "../data/"
year = 2016
quarter = 1
months = ['Dec','Mar','Jun','Sep']
# raw_file = "2016Q1.csv"
In [108]:
raw_file = str(year) + 'Q' + str(quarter) + '.csv'
output_file = 'HICP' + raw_file
print(raw_file)
print(output_file)
In [109]:
fname = data_dir + raw_file
print(fname)
not os.path.isfile(fname)
Out[109]:
In [110]:
fname
Out[110]:
In [111]:
if not os.path.isfile(fname):
print('File ' + fname + 'is not available')
else:
raw_df = pd.read_csv(fname,header = 1)
raw_df.head()
# find the row where the growth expectations start
dum = raw_df[raw_df['TARGET_PERIOD'] == 'GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP'].index[0]
mask_columns = ~raw_df.columns.str.contains('Unnamed')
df = raw_df.iloc[0:dum-1,mask_columns]
df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
'TN1_0':'[-2.0,-1.1]','FN1_0TN0_6':'[-1.0,-0.6]',
'FN0_5TN0_1':'[-0.5,-0.1]','F0_0T0_4':'[0.0,0.4]',
'F0_5T0_9':'[0.5,0.9]','F1_0T1_4':'[1.0,1.4]',
'F1_5T1_9':'[1.5,1.9]','F2_0T2_4':'[2.0,2.4]',
'F2_5T2_9':'[2.5,2.9]','F3_0T3_4':'[3.0,3.4]',
'F3_5T3_9':'[3.5,3.9]','F4_0':'[4.0,5.0]'})
# remove rows where point is missing
maskNaN = df.point.isnull()
df = df[~maskNaN]
df.fillna(0,inplace = True)
for colname in df.columns[3:]:
df[colname] = df[colname].astype('float')
# create a new target column
mask_t0 = str(year)
mask_t1 = str(year+1)
mask_t2 = str(year+2)
if quarter<3:
mask_t4or5 = str(year+4)
else:
mask_t4or5 = str(year+5)
if quarter==1:
mask_Rt1 = str(year) + months[quarter-1]
mask_Rt2 = str(year+1) + months[quarter-1]
else:
mask_Rt1 = str(year+1) + months[quarter-1]
mask_Rt2 = str(year+2) + months[quarter-1]
#
df.loc[df.loc[:,'target'] == mask_t0,'targetNew'] = 't'
df.loc[df.loc[:,'target'] == mask_t1,'targetNew'] = 't+1'
df.loc[df.loc[:,'target'] == mask_t2,'targetNew'] = 't+2'
df.loc[df.loc[:,'target'] == mask_t4or5,'targetNew'] = 't+4'
df.loc[df.loc[:,'target'] == mask_Rt1,'targetNew'] = 'roll 1'
df.loc[df.loc[:,'target'] == mask_Rt2,'targetNew'] = 'roll 2'
df['source'] = str(year) + '-Q' + str(quarter)
df = df[['source','target','targetNew','id','point','[-2.0,-1.1]','[-1.0,-0.6]','[-0.5,-0.1]',
'[0.0,0.4]','[0.5,0.9]','[1.0,1.4]','[1.5,1.9]','[2.0,2.4]','[2.5,2.9]',
'[3.0,3.4]','[3.5,3.9]', '[4.0,5.0]']]
In [112]:
df.head()
Out[112]:
In [116]:
df.to_csv('tmp.csv',index=False)
In [118]:
tt = pd.read_csv('tmp.csv')
tt
Out[118]:
In [ ]:
In [ ]:
def parse(raw_file):
''' Parses a row SPF to a dictionary
'''
# get the year and quarter to use in naming the sourse
year = raw_file.split('Q')[-2][-4:]
quarter = raw_file.split('Q')[-1][0]
# open the CSV file
opened_file = open(data_dir + raw_file)
# Read the CSV file data
csv_data = csv.reader(opened_file, delimiter=',')
# Setup an empty list
parsed_data = []
# Skip over the first line and save the second as headers
next(csv_data)
fields = next(csv_data)
# remove empty elements of the fields
fields = [item for item in fields if item != '']
# Iterate over each row of the csv file, zip together field -> value
# BREAK when growth forecestast are reached and remove previous row (empty)
for row in csv_data:
if "GROWTH" in row[0]:
break
row = get_row(row)
parsed_data.append(dict(zip(fields, row)))
del parsed_data[len(parsed_data)-1]
# Close the CSV file
opened_file.close()
# return data as dictionary
# return parsed_data, fields
#def get_data_frame(parsed_data, fields):
# ''' takes a dictionary with individual histograms and returns
# a data frame
# '''
df = pd.DataFrame(parsed_data, columns=fields, dtype=float)
# make a column to keep the source (name of the raw data file)
df['source'] = str(year) + '-Q' + str(quarter)
# rename columns
df = df.rename(columns={'TARGET_PERIOD': 'target', 'FCT_SOURCE': 'id',
'POINT': 'point', 'TN1_0': '[-2.0,-1.1]',
'FN1_0TN0_6': '[-1.0,-0.6]', 'FN0_5TN0_1':
'[-0.5,-0.1]', 'F0_0T0_4': '[0.0,0.4]', 'F0_5T0_9':
'[0.5,0.9]', 'F1_0T1_4': '[1.0,1.4]', 'F1_5T1_9':
'[1.5,1.9]', 'F2_0T2_4': '[2.0,2.4]', 'F2_5T2_9':
'[2.5,2.9]', 'F3_0T3_4': '[3.0,3.4]', 'F3_5T3_9':
'[3.5,3.9]', 'F4_0': '[4.0,5.0]'})
# put the columns in the right order
df = df[['source', 'target', 'id', 'point', '[-2.0,-1.1]', '[-1.0,-0.6]',
'[-0.5,-0.1]', '[0.0,0.4]', '[0.5,0.9]', '[1.0,1.4]', '[1.5,1.9]',
'[2.0,2.4]', '[2.5,2.9]', '[3.0,3.4]', '[3.5,3.9]', '[4.0,5.0]']]
# remove rows where point forecast is missing
maskNaN = df.point.isnull()
df = df[~maskNaN]
df.fillna(0, inplace=True)
# select columns of the histograms
mask = df.columns.str.contains(',')
# check if the columns of the individual histograms sum up to 100
# remove rows which do not
maskHist = df.loc[:, mask].apply(np.sum, axis=1)!= 0.0
df = df.loc[maskHist, :]
return df