In [ ]:
import pandas as pd
import numpy as np
import glob # to find all files in folder
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
pd.options.mode.chained_assignment = None  # default='warn'

Advanced Exo from 'Intro to Pandas'


In [ ]:
# load all data and parse the 'date' column
def load_data():
    sl_files=glob.glob('Data/ebola/sl_data/*.csv')
    guinea_files=glob.glob('Data/ebola/guinea_data/*.csv')
    liberia_files=glob.glob('Data/ebola/liberia_data/*.csv')
    sl = pd.concat((pd.read_csv(file,  parse_dates=['date']) for file in sl_files), ignore_index=True)
    guinea = pd.concat((pd.read_csv(file , parse_dates=['Date']) for file in guinea_files), ignore_index=True)
    liberia = pd.concat((pd.read_csv(file , parse_dates=['Date']) for file in liberia_files), ignore_index=True)
    return (sl, guinea, liberia)

In [ ]:
(sl, guinea, liberia) = load_data()

In [ ]:
# look at the sl data
sl.columns

In [ ]:
sl['variable'].unique()

we decide to only take the 'confirmed' cases and not the suspected or probable ones since 'suspected' and 'probable' are very subjective terms and may not be the same over the 3 countries.


In [ ]:
sl_variables_to_use = ['new_confirmed', 'death_confirmed']

In [ ]:
# look at the guinea data
guinea.columns

In [ ]:
guinea['Description'].unique()

In [ ]:
guinea_variables_to_use = ['New cases of confirmed', 'New deaths registered today (confirmed)']

In [ ]:
# look at the liberia data
liberia.columns

In [ ]:
liberia['Variable'].unique()

In [ ]:
liberia_variables_to_use = ['New case/s (confirmed)', 'Total death/s in confirmed cases']

In [ ]:
def select_features(data, var_name, features):
    return data[data[var_name].isin(features)]

In [ ]:
# take the relevant variables
sl_relevant = select_features(sl, 'variable', sl_variables_to_use)
guinea_relevant = select_features(guinea, 'Description', guinea_variables_to_use)
liberia_relevant = select_features(liberia, 'Variable', liberia_variables_to_use)

A problem is that the columnames and the variables are not the same over the 3 countries. So we harmonize it somewhat.


In [ ]:
# rename the columns
var_name = 'vars'
sl_relevant.rename(columns={'variable': var_name}, inplace=True)
guinea_relevant.rename(columns={'Description': var_name, 'Date': 'date'}, inplace=True)
liberia_relevant.rename(columns={'Variable': var_name, 'Date': 'date'}, inplace=True)

#rename the variables 
new_infected = 'new_infected'
new_death= 'new_death'
sl_relevant[var_name][sl_relevant[var_name] == sl_variables_to_use[0]] = new_infected
sl_relevant[var_name][sl_relevant[var_name] == sl_variables_to_use[1]] = new_death

guinea_relevant[var_name][guinea_relevant[var_name] == guinea_variables_to_use[0]] = new_infected
guinea_relevant[var_name][guinea_relevant[var_name] == guinea_variables_to_use[1]] = new_death

liberia_relevant[var_name][liberia_relevant[var_name] == liberia_variables_to_use[0]] = new_infected
liberia_relevant[var_name][liberia_relevant[var_name] == liberia_variables_to_use[1]] = new_death

In [ ]:
# rename the data
sl_clean = sl_relevant.copy()
guinea_clean = guinea_relevant.copy()
liberia_clean = liberia_relevant.copy()

Handle missing data


In [ ]:
#remove al rows and columns that consist only of NaNs
def remove_rows_and_cols_with_only_nan(data):
    return data.dropna(axis=1, how='all').dropna(axis=0, thresh=3)
sl_clean = remove_rows_and_cols_with_only_nan(sl_clean)
guinea_clean = remove_rows_and_cols_with_only_nan(guinea_clean)
liberia_clean = remove_rows_and_cols_with_only_nan(liberia_clean)

Then we can replace all NaN values with 0. We don't know anything about that data to put something else, and removing is no option since there would not be much left if we removed all rows/cols that contain at least one NaN


In [ ]:
# replace all NaNs with 0 (inplace)
sl_clean.fillna(value=0, inplace=True)
guinea_clean.fillna(value=0, inplace=True)
liberia_clean.fillna(value=0, inplace=True)

not all values are numerical (most are objects)


In [ ]:
sl_clean.dtypes

make all types numerical (excluding the date and variable columns)


In [ ]:
def change_to_numeric(data):
    col_list = list(data.columns)
    col_list.remove('date')
    col_list.remove(var_name)
    data[col_list] = data[col_list].apply(pd.to_numeric)
    
change_to_numeric(sl_clean)
change_to_numeric(guinea_clean)
change_to_numeric(liberia_clean)

Now we can summ over all cities and store it in a 'Total' column. Note that all countries have a 'National' or 'total' column, but they are inconsistent with the sumed values in each city, so we ignore it.


In [ ]:
#  create a total colon
def add_and_fill_total_col(data, ignore_cols_list):
    col_list = list(data.columns)
    for c in ignore_cols_list:
        col_list.remove(c) 
    data['total'] = data[col_list].sum(axis=1)
add_and_fill_total_col(sl_clean, ['date', var_name, 'National'])
add_and_fill_total_col(guinea_clean, ['date', var_name, 'Totals'])
add_and_fill_total_col(liberia_clean, ['date', var_name, 'National'])

In [ ]:
# remove unused cols:
sl_clean = sl_clean[['date', var_name, 'total']]
guinea_clean = guinea_clean[['date', var_name, 'total']]
liberia_clean = liberia_clean[['date', var_name, 'total']]

In [ ]:
#rename data again
sl_final = sl_clean.copy()
liberia_final = liberia_clean.copy()
guinea_final = guinea_clean.copy()

Show the data


In [ ]:
liberia_final.head()

In [ ]:
guinea_final.head()

In [ ]:
sl_final.head()

Move the variables into the columns


In [ ]:
# create infected and death cols
def create_inf_death_cols(data):
    inf = data[data['vars'] == new_infected]
    inf[new_infected] = inf['total']
    death = data[data['vars'] == new_death]
    death[new_death] = death['total']
    res = data.join(inf[new_infected], how='outer')
    return res.join(death[new_death], how='outer')

sl_final = create_inf_death_cols(sl_final)
liberia_final = create_inf_death_cols(liberia_final)
guinea_final = create_inf_death_cols(guinea_final)

In [ ]:
sl_final.head()

In [ ]:
# remove vars & total col
sl_final = sl_final.drop(var_name, 1).drop('total', 1)
liberia_final = liberia_final.drop(var_name, 1).drop('total', 1)
guinea_final = guinea_final.drop(var_name, 1).drop('total', 1)

In [ ]:
sl_final.head()

Then we need to merge the data


In [ ]:
# group by date to merge the cols
liberia_final = liberia_final.groupby('date', as_index=False).sum()
sl_final = sl_final.groupby('date', as_index=False).sum()
guinea_final = guinea_final.groupby('date', as_index=False).sum()

add 'country' col to distinguish the dataframes when they are put together


In [ ]:
sl_final['country'] = 'sl'
guinea_final['country'] = 'guinea'
liberia_final['country'] = 'liberia'

In [ ]:
guinea_final.head()

In [ ]:
liberia_final.head()

In [ ]:
sl_final.head()

Concat the dataframes


In [ ]:
final_data = pd.concat([sl_final, guinea_final, liberia_final], ignore_index=True)

And sort the data:


In [ ]:
final_data.sort_values(by='date').set_index(['date', 'country'])