develop-clean-SPF



In [ ]:
# this notebook is to develop and test a .py script that takes raw csv files and makes new and clean
# csv files for HICP, GDP, UNemployment and Assumptions

In [4]:
import csv
import pandas as pd
import numpy as np
import os

In [5]:
data_dir = "../data/"
year = 2016
quarter = 1
#months = ['Dec','Mar','Jun','Sep']
# raw_file = "2016Q1.csv"

In [6]:
raw_file = str(year) + 'Q' + str(quarter) + '.csv'
#output_file = 'GDP' + raw_file
print(raw_file)
#print(output_file)


2016Q1.csv

In [7]:
fname = data_dir + raw_file
print(fname)

not os.path.isfile(fname)


../data/2016Q1.csv
Out[7]:
False

In [8]:
fname


Out[8]:
'../data/2016Q1.csv'

In [9]:
if not os.path.isfile(fname):
    print('File ' + fname + 'is not available')
else:
    raw_df = pd.read_csv(fname,header = 1)
    #dum = raw_df['TARGET_PERIOD'].tolist()
    #dum.index('GROWTH EXPECTATIONS; YEAR-ON-YEAR CHANGE IN REAL GDP')
    id_GDP = [i for i,x in enumerate(raw_df['TARGET_PERIOD'].tolist()) if ('GDP' in str(x))][0]
    id_UN  = [i for i,x in enumerate(raw_df['TARGET_PERIOD'].tolist()) if ('UNEMP' in str(x))][0]
    id_AS  = [i for i,x in enumerate(raw_df['TARGET_PERIOD'].tolist()) if ('ASSUM' in str(x))][0]
    
    mask_columns = ~raw_df.columns.str.contains('Unnamed')
    df_HICP = raw_df.iloc[0:id_GDP-1,mask_columns]
    df_GDP = raw_df.iloc[id_GDP+1:id_UN-1,mask_columns]
    # UNEMP has more columns
    df_UN = raw_df.iloc[id_UN+1:id_AS-1,:]
    # reset indices for GDP and UN
    df_GDP.reset_index(inplace=True,drop=True)
    df_UN.reset_index(inplace=True,drop=True) 

    # reset column index for GDP (not really necessary because the HICP and GDP have the same columns)
    new_cols = df_GDP.iloc[0,:].values
    df_GDP.set_axis(axis=1,labels=new_cols)
    df_GDP = df_GDP.drop(0,axis=0)
    df_GDP.reset_index(inplace=True,drop=True) 
    # reset column index for UN 
    new_cols = df_UN.iloc[0,:].values
    df_UN.set_axis(axis=1,labels=new_cols)
    df_UN = df_UN.drop(0,axis=0)
    df_UN.reset_index(inplace=True,drop=True) 
    
    # add new column 'source'
    df_temp = pd.DataFrame(index=df_HICP.index)
    df_temp['source'] = str(year) + '-Q' + str(quarter)
    df_HICP.insert(0,'source',df_temp)

    df_temp = pd.DataFrame(index=df_GDP.index)
    df_temp['source'] = str(year) + '-Q' + str(quarter)
    df_GDP.insert(0,'source',df_temp)
    
    df_temp = pd.DataFrame(index=df_UN.index)
    df_temp['source'] = str(year) + '-Q' + str(quarter)
    df_UN.insert(0,'source',df_temp)

In [10]:
def rename_columns(df):
    '''
        funciton to rename the columns of a data frame
    '''
    if df.shape[1]==16: # HICP and GDP
        df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                'TN1_0':'[-2.0,-1.1]',
                                'FN1_0TN0_6':'[-1.0,-0.6]',
                                'FN0_5TN0_1':'[-0.5,-0.1]',
                                'F0_0T0_4':'[0.0,0.4]',
                                'F0_5T0_9':'[0.5,0.9]',
                                'F1_0T1_4':'[1.0,1.4]',
                                'F1_5T1_9':'[1.5,1.9]',
                                'F2_0T2_4':'[2.0,2.4]',
                                'F2_5T2_9':'[2.5,2.9]',
                                'F3_0T3_4':'[3.0,3.4]',
                                'F3_5T3_9':'[3.5,3.9]',
                                'F4_0':'[4.0,5.0]'})
    elif df.shape[1]==23: # UNEMP
        df = df.rename(columns={'TARGET_PERIOD':'target','FCT_SOURCE':'id','POINT':'point',
                                'T6_5':'[5.5,6.4]',
                                'F6_5T6_9':'[6.5,6.9]',
                                'F7_0T7_4':'[7.0,7.4]',
                                'F7_5T7_9':'[7.5,7.9]',
                                'F8_0T8_4':'[8.0,8.4]',
                                'F8_5T8_9':'[8.5,8.9]',
                                'F9_0T9_4':'[9.0,9.4]',
                                'F9_5T9_9':'[9.5,9.9]',
                                'F10_0T10_4':'[10.0,10.4]',
                                'F10_5T10_9':'[10.5,10.9]',
                                'F11_0T11_4':'[11.0,11.4]',
                                'F11_5T11_9':'[11.5,11.9]',
                                'F12_0T12_4':'[12.0,12.4]',
                                'F12_5T12_9':'[12.5,12.9]',
                                'F13_0T13_4':'[13.0,13.4]',
                                'F13_5T13_9':'[13.5,13.9]',
                                'F14_0T14_4':'[14.0,14.4]',
                                'F14_5T14_9':'[14.5,14.9]',
                                'F15_0':'[15.0,15.9]',})    
    return df

In [11]:
def get_newTarget(targets):
    '''
    this function makes a list of new targets given a list of targets
    '''
    targetNew = []
    ts = ['t','t+1','t+2','t+4']
    y = 0
    mq = 1
    for target in targets:
        if len(target)==4:
            targetNew.append(ts[y])
            y+=1
        elif len(target)>=6:
            targetNew.append('roll ' + str(mq))
            mq+=1
    
    return targetNew

def get_newTargets(df):
    '''
    this function adds a column with new targets to the dataframe
    '''
    # get a list of targets
    targets = df['target'].unique().tolist()
    # get a list of new targets
    newTargets = get_newTarget(targets)
    # make a temp df with targets and then replace with newTargets
    df_temp = df[['target']].copy()
    for i in range(len(newTargets)):
        target = targets[i]
        newTarget = newTargets[i]
        df_temp['target'].replace(target, newTarget, inplace=True)
    # insert temp df in df adter 'target' column
    pos_target = df.columns.get_loc('target')
    df.insert(pos_target+1,'newTarget',df_temp)
    
    return df

In [12]:
df_HICP = rename_columns(df_HICP)
df_GDP = rename_columns(df_GDP)
df_UN = rename_columns(df_UN)
#df_UN.head()

In [13]:
df_HICP = get_newTargets(df_HICP)
df_HICP.head()

df_GDP = get_newTargets(df_GDP)
df_GDP.head()

df_UN = get_newTargets(df_UN)
df_UN.head()


Out[13]:
source target newTarget id point [5.5,6.4] [6.5,6.9] [7.0,7.4] [7.5,7.9] [8.0,8.4] ... [10.5,10.9] [11.0,11.4] [11.5,11.9] [12.0,12.4] [12.5,12.9] [13.0,13.4] [13.5,13.9] [14.0,14.4] [14.5,14.9] [15.0,15.9]
0 2016-Q1 2016 t 1 10 NaN NaN NaN NaN NaN ... 10 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2016-Q1 2016 t 2 10.4 NaN NaN NaN NaN NaN ... 39 10 NaN NaN NaN NaN NaN NaN NaN NaN
2 2016-Q1 2016 t 4 10.3 0 0 0 0 0 ... 10 5 0 0 0 0 0 0 0 0
3 2016-Q1 2016 t 5 10.1 NaN NaN NaN NaN NaN ... 20 5 NaN NaN NaN NaN NaN NaN NaN NaN
4 2016-Q1 2016 t 6 10.3 NaN NaN NaN NaN NaN ... 20 NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 24 columns


In [14]:
print(df_GDP.target.unique().tolist())
print(df_GDP.newTarget.unique().tolist())


['2016', '2016Q3', '2017', '2017Q3', '2018', '2020']
['t', 'roll 1', 't+1', 'roll 2', 't+2', 't+4']

In [15]:
print(df_UN.target.unique().tolist())
print(df_UN.newTarget.unique().tolist())


['2016', '2016Nov', '2017', '2017Nov', '2018', '2020']
['t', 'roll 1', 't+1', 'roll 2', 't+2', 't+4']

In [16]:
print(df_HICP.target.unique().tolist())
print(df_HICP.newTarget.unique().tolist())


['2016', '2016Dec', '2017', '2017Dec', '2018', '2020']
['t', 'roll 1', 't+1', 'roll 2', 't+2', 't+4']

In [17]:
def clean_df(df):
    '''
    this function removes rows with missing values in "point"
    and transforms cells to floats replacing NAs with zeros
    '''
    # remove rows where point is missing
    # maskNaN = df['point'].isnull()
    # df = df[~maskNaN].copy()
    df = df[df["point"].notnull()].copy()
    # replace missing with 0
    df.fillna(0, inplace=True)
    # cells to floats
    # all columns after (including) 'point'
    pos_point = df.columns.get_loc('point')
    for colname in df.columns[pos_point:]:
        df[colname] = df[colname].astype('float')
        
    return df

In [18]:
df_HICP = clean_df(df_HICP)
df_HICP.head()

df_GDP = clean_df(df_GDP)
df_GDP.head()

df_UN = clean_df(df_UN)
df_UN.head()


Out[18]:
source target newTarget id point [5.5,6.4] [6.5,6.9] [7.0,7.4] [7.5,7.9] [8.0,8.4] ... [10.5,10.9] [11.0,11.4] [11.5,11.9] [12.0,12.4] [12.5,12.9] [13.0,13.4] [13.5,13.9] [14.0,14.4] [14.5,14.9] [15.0,15.9]
0 2016-Q1 2016 t 1 10.0 0 0 0 0 0 ... 10 0 0 0 0 0 0 0 0 0
1 2016-Q1 2016 t 2 10.4 0 0 0 0 0 ... 39 10 0 0 0 0 0 0 0 0
2 2016-Q1 2016 t 4 10.3 0 0 0 0 0 ... 10 5 0 0 0 0 0 0 0 0
3 2016-Q1 2016 t 5 10.1 0 0 0 0 0 ... 20 5 0 0 0 0 0 0 0 0
4 2016-Q1 2016 t 6 10.3 0 0 0 0 0 ... 20 0 0 0 0 0 0 0 0 0

5 rows × 24 columns


In [19]:
df_UN.dtypes


Out[19]:
source          object
target          object
newTarget       object
id              object
point          float64
[5.5,6.4]      float64
[6.5,6.9]      float64
[7.0,7.4]      float64
[7.5,7.9]      float64
[8.0,8.4]      float64
[8.5,8.9]      float64
[9.0,9.4]      float64
[9.5,9.9]      float64
[10.0,10.4]    float64
[10.5,10.9]    float64
[11.0,11.4]    float64
[11.5,11.9]    float64
[12.0,12.4]    float64
[12.5,12.9]    float64
[13.0,13.4]    float64
[13.5,13.9]    float64
[14.0,14.4]    float64
[14.5,14.9]    float64
[15.0,15.9]    float64
dtype: object

In [20]:
# save to a csv file
#        df.to_csv(output_dir + output_file, index=False)

In [21]:
output_dir = '../clean_data/'
raw_file


Out[21]:
'2016Q1.csv'

In [22]:
output_dir + 'HICP' + raw_file


Out[22]:
'../clean_data/HICP2016Q1.csv'

In [24]:
df_HICP.to_csv(output_dir + 'HICP' + raw_file , index=False) 
df_GDP.to_csv(output_dir + 'GDP' + raw_file , index=False) 
df_UN.to_csv(output_dir + 'UN' + raw_file , index=False)

In [ ]: