In [ ]:
# this notebook is to develop and test a .py script that takes raw csv files and makes new and clean
# csv files for HICP, GDP, UNemployment and Assumptions
In [4]:
import csv
import pandas as pd
import numpy as np
import os
In [5]:
data_dir = "../data/"
year = 2016
quarter = 1
#months = ['Dec','Mar','Jun','Sep']
# raw_file = "2016Q1.csv"
In [6]:
raw_file = str(year) + 'Q' + str(quarter) + '.csv'
#output_file = 'GDP' + raw_file
print(raw_file)
#print(output_file)
In [7]:
fname = data_dir + raw_file
print(fname)
not os.path.isfile(fname)
Out[7]:
In [8]:
fname
Out[8]:
In [9]:
if not os.path.isfile(fname):
print('File ' + fname + 'is not available')
else:
raw_df = pd.read_csv(fname,header = 1)
#dum = raw_df['TARGET_PERIOD'].tolist()
#dum.index('GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP')
id_GDP = [i for i,x in enumerate(raw_df['TARGET_PERIOD'].tolist()) if ('GDP' in str(x))][0]
id_UN = [i for i,x in enumerate(raw_df['TARGET_PERIOD'].tolist()) if ('UNEMP' in str(x))][0]
id_AS = [i for i,x in enumerate(raw_df['TARGET_PERIOD'].tolist()) if ('ASSUM' in str(x))][0]
mask_columns = ~raw_df.columns.str.contains('Unnamed')
df_HICP = raw_df.iloc[0:id_GDP-1,mask_columns]
df_GDP = raw_df.iloc[id_GDP+1:id_UN-1,mask_columns]
# UNEMP has more columns
df_UN = raw_df.iloc[id_UN+1:id_AS-1,:]
# reset indices for GDP and UN
df_GDP.reset_index(inplace=True,drop=True)
df_UN.reset_index(inplace=True,drop=True)
# reset column index for GDP (not really necessary because the HICP and GDP have the same columns)
new_cols = df_GDP.iloc[0,:].values
df_GDP.set_axis(axis=1,labels=new_cols)
df_GDP = df_GDP.drop(0,axis=0)
df_GDP.reset_index(inplace=True,drop=True)
# reset column index for UN
new_cols = df_UN.iloc[0,:].values
df_UN.set_axis(axis=1,labels=new_cols)
df_UN = df_UN.drop(0,axis=0)
df_UN.reset_index(inplace=True,drop=True)
# add new column 'source'
df_temp = pd.DataFrame(index=df_HICP.index)
df_temp['source'] = str(year) + '-Q' + str(quarter)
df_HICP.insert(0,'source',df_temp)
df_temp = pd.DataFrame(index=df_GDP.index)
df_temp['source'] = str(year) + '-Q' + str(quarter)
df_GDP.insert(0,'source',df_temp)
df_temp = pd.DataFrame(index=df_UN.index)
df_temp['source'] = str(year) + '-Q' + str(quarter)
df_UN.insert(0,'source',df_temp)
In [10]:
def rename_columns(df):
'''
funciton to rename the columns of a data frame
'''
if df.shape[1]==16: # HICP and GDP
df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
'TN1_0':'[-2.0,-1.1]',
'FN1_0TN0_6':'[-1.0,-0.6]',
'FN0_5TN0_1':'[-0.5,-0.1]',
'F0_0T0_4':'[0.0,0.4]',
'F0_5T0_9':'[0.5,0.9]',
'F1_0T1_4':'[1.0,1.4]',
'F1_5T1_9':'[1.5,1.9]',
'F2_0T2_4':'[2.0,2.4]',
'F2_5T2_9':'[2.5,2.9]',
'F3_0T3_4':'[3.0,3.4]',
'F3_5T3_9':'[3.5,3.9]',
'F4_0':'[4.0,5.0]'})
elif df.shape[1]==23: # UNEMP
df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
'T6_5':'[5.5,6.4]',
'F6_5T6_9':'[6.5,6.9]',
'F7_0T7_4':'[7.0,7.4]',
'F7_5T7_9':'[7.5,7.9]',
'F8_0T8_4':'[8.0,8.4]',
'F8_5T8_9':'[8.5,8.9]',
'F9_0T9_4':'[9.0,9.4]',
'F9_5T9_9':'[9.5,9.9]',
'F10_0T10_4':'[10.0,10.4]',
'F10_5T10_9':'[10.5,10.9]',
'F11_0T11_4':'[11.0,11.4]',
'F11_5T11_9':'[11.5,11.9]',
'F12_0T12_4':'[12.0,12.4]',
'F12_5T12_9':'[12.5,12.9]',
'F13_0T13_4':'[13.0,13.4]',
'F13_5T13_9':'[13.5,13.9]',
'F14_0T14_4':'[14.0,14.4]',
'F14_5T14_9':'[14.5,14.9]',
'F15_0':'[15.0,15.9]',})
return df
In [11]:
def get_newTarget(targets):
'''
this function makes a list of new targets given a list of targets
'''
targetNew = []
ts = ['t','t+1','t+2','t+4']
y = 0
mq = 1
for target in targets:
if len(target)==4:
targetNew.append(ts[y])
y+=1
elif len(target)>=6:
targetNew.append('roll ' + str(mq))
mq+=1
return targetNew
def get_newTargets(df):
'''
this function adds a column with new targets to the dataframe
'''
# get a list of targets
targets = df['target'].unique().tolist()
# get a list of new targets
newTargets = get_newTarget(targets)
# make a temp df with targets and then replace with newTargets
df_temp = df[['target']].copy()
for i in range(len(newTargets)):
target = targets[i]
newTarget = newTargets[i]
df_temp['target'].replace(target, newTarget, inplace=True)
# insert temp df in df adter 'target' column
pos_target = df.columns.get_loc('target')
df.insert(pos_target+1,'newTarget',df_temp)
return df
In [12]:
df_HICP = rename_columns(df_HICP)
df_GDP = rename_columns(df_GDP)
df_UN = rename_columns(df_UN)
#df_UN.head()
In [13]:
df_HICP = get_newTargets(df_HICP)
df_HICP.head()
df_GDP = get_newTargets(df_GDP)
df_GDP.head()
df_UN = get_newTargets(df_UN)
df_UN.head()
Out[13]:
In [14]:
print(df_GDP.target.unique().tolist())
print(df_GDP.newTarget.unique().tolist())
In [15]:
print(df_UN.target.unique().tolist())
print(df_UN.newTarget.unique().tolist())
In [16]:
print(df_HICP.target.unique().tolist())
print(df_HICP.newTarget.unique().tolist())
In [17]:
def clean_df(df):
'''
this function removes rows with missing values in "point"
and transforms cells to floats replacing NAs with zeros
'''
# remove rows where point is missing
# maskNaN = df['point'].isnull()
# df = df[~maskNaN].copy()
df = df[df["point"].notnull()].copy()
# replace missing with 0
df.fillna(0, inplace=True)
# cells to floats
# all columns after (including) 'point'
pos_point = df.columns.get_loc('point')
for colname in df.columns[pos_point:]:
df[colname] = df[colname].astype('float')
return df
In [18]:
df_HICP = clean_df(df_HICP)
df_HICP.head()
df_GDP = clean_df(df_GDP)
df_GDP.head()
df_UN = clean_df(df_UN)
df_UN.head()
Out[18]:
In [19]:
df_UN.dtypes
Out[19]:
In [20]:
# save to a csv file
# df.to_csv(output_dir + output_file, index=False)
In [21]:
output_dir = '../clean_data/'
raw_file
Out[21]:
In [22]:
output_dir + 'HICP' + raw_file
Out[22]:
In [24]:
df_HICP.to_csv(output_dir + 'HICP' + raw_file , index=False)
df_GDP.to_csv(output_dir + 'GDP' + raw_file , index=False)
df_UN.to_csv(output_dir + 'UN' + raw_file , index=False)
In [ ]: