In [ ]:
import pandas as pd
import numpy as np
import glob # to find all files in folder
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
pd.options.mode.chained_assignment = None # default='warn'
In [ ]:
# load all data and parse the 'date' column
def load_data():
sl_files=glob.glob('Data/ebola/sl_data/*.csv')
guinea_files=glob.glob('Data/ebola/guinea_data/*.csv')
liberia_files=glob.glob('Data/ebola/liberia_data/*.csv')
sl = pd.concat((pd.read_csv(file, parse_dates=['date']) for file in sl_files), ignore_index=True)
guinea = pd.concat((pd.read_csv(file , parse_dates=['Date']) for file in guinea_files), ignore_index=True)
liberia = pd.concat((pd.read_csv(file , parse_dates=['Date']) for file in liberia_files), ignore_index=True)
return (sl, guinea, liberia)
In [ ]:
(sl, guinea, liberia) = load_data()
In [ ]:
# look at the sl data
sl.columns
In [ ]:
sl['variable'].unique()
we decide to only take the 'confirmed' cases and not the suspected or probable ones since 'suspected' and 'probable' are very subjective terms and may not be the same over the 3 countries.
In [ ]:
sl_variables_to_use = ['new_confirmed', 'death_confirmed']
In [ ]:
# look at the guinea data
guinea.columns
In [ ]:
guinea['Description'].unique()
In [ ]:
guinea_variables_to_use = ['New cases of confirmed', 'New deaths registered today (confirmed)']
In [ ]:
# look at the liberia data
liberia.columns
In [ ]:
liberia['Variable'].unique()
In [ ]:
liberia_variables_to_use = ['New case/s (confirmed)', 'Total death/s in confirmed cases']
In [ ]:
def select_features(data, var_name, features):
return data[data[var_name].isin(features)]
In [ ]:
# take the relevant variables
sl_relevant = select_features(sl, 'variable', sl_variables_to_use)
guinea_relevant = select_features(guinea, 'Description', guinea_variables_to_use)
liberia_relevant = select_features(liberia, 'Variable', liberia_variables_to_use)
A problem is that the columnames and the variables are not the same over the 3 countries. So we harmonize it somewhat.
In [ ]:
# rename the columns
var_name = 'vars'
sl_relevant.rename(columns={'variable': var_name}, inplace=True)
guinea_relevant.rename(columns={'Description': var_name, 'Date': 'date'}, inplace=True)
liberia_relevant.rename(columns={'Variable': var_name, 'Date': 'date'}, inplace=True)
#rename the variables
new_infected = 'new_infected'
new_death= 'new_death'
sl_relevant[var_name][sl_relevant[var_name] == sl_variables_to_use[0]] = new_infected
sl_relevant[var_name][sl_relevant[var_name] == sl_variables_to_use[1]] = new_death
guinea_relevant[var_name][guinea_relevant[var_name] == guinea_variables_to_use[0]] = new_infected
guinea_relevant[var_name][guinea_relevant[var_name] == guinea_variables_to_use[1]] = new_death
liberia_relevant[var_name][liberia_relevant[var_name] == liberia_variables_to_use[0]] = new_infected
liberia_relevant[var_name][liberia_relevant[var_name] == liberia_variables_to_use[1]] = new_death
In [ ]:
# rename the data
sl_clean = sl_relevant.copy()
guinea_clean = guinea_relevant.copy()
liberia_clean = liberia_relevant.copy()
In [ ]:
#remove al rows and columns that consist only of NaNs
def remove_rows_and_cols_with_only_nan(data):
return data.dropna(axis=1, how='all').dropna(axis=0, thresh=3)
sl_clean = remove_rows_and_cols_with_only_nan(sl_clean)
guinea_clean = remove_rows_and_cols_with_only_nan(guinea_clean)
liberia_clean = remove_rows_and_cols_with_only_nan(liberia_clean)
Then we can replace all NaN values with 0. We don't know anything about that data to put something else, and removing is no option since there would not be much left if we removed all rows/cols that contain at least one NaN
In [ ]:
# replace all NaNs with 0 (inplace)
sl_clean.fillna(value=0, inplace=True)
guinea_clean.fillna(value=0, inplace=True)
liberia_clean.fillna(value=0, inplace=True)
not all values are numerical (most are objects)
In [ ]:
sl_clean.dtypes
make all types numerical (excluding the date and variable columns)
In [ ]:
def change_to_numeric(data):
col_list = list(data.columns)
col_list.remove('date')
col_list.remove(var_name)
data[col_list] = data[col_list].apply(pd.to_numeric)
change_to_numeric(sl_clean)
change_to_numeric(guinea_clean)
change_to_numeric(liberia_clean)
Now we can summ over all cities and store it in a 'Total' column. Note that all countries have a 'National' or 'total' column, but they are inconsistent with the sumed values in each city, so we ignore it.
In [ ]:
# create a total colon
def add_and_fill_total_col(data, ignore_cols_list):
col_list = list(data.columns)
for c in ignore_cols_list:
col_list.remove(c)
data['total'] = data[col_list].sum(axis=1)
add_and_fill_total_col(sl_clean, ['date', var_name, 'National'])
add_and_fill_total_col(guinea_clean, ['date', var_name, 'Totals'])
add_and_fill_total_col(liberia_clean, ['date', var_name, 'National'])
In [ ]:
# remove unused cols:
sl_clean = sl_clean[['date', var_name, 'total']]
guinea_clean = guinea_clean[['date', var_name, 'total']]
liberia_clean = liberia_clean[['date', var_name, 'total']]
In [ ]:
#rename data again
sl_final = sl_clean.copy()
liberia_final = liberia_clean.copy()
guinea_final = guinea_clean.copy()
Show the data
In [ ]:
liberia_final.head()
In [ ]:
guinea_final.head()
In [ ]:
sl_final.head()
Move the variables into the columns
In [ ]:
# create infected and death cols
def create_inf_death_cols(data):
inf = data[data['vars'] == new_infected]
inf[new_infected] = inf['total']
death = data[data['vars'] == new_death]
death[new_death] = death['total']
res = data.join(inf[new_infected], how='outer')
return res.join(death[new_death], how='outer')
sl_final = create_inf_death_cols(sl_final)
liberia_final = create_inf_death_cols(liberia_final)
guinea_final = create_inf_death_cols(guinea_final)
In [ ]:
sl_final.head()
In [ ]:
# remove vars & total col
sl_final = sl_final.drop(var_name, 1).drop('total', 1)
liberia_final = liberia_final.drop(var_name, 1).drop('total', 1)
guinea_final = guinea_final.drop(var_name, 1).drop('total', 1)
In [ ]:
sl_final.head()
Then we need to merge the data
In [ ]:
# group by date to merge the cols
liberia_final = liberia_final.groupby('date', as_index=False).sum()
sl_final = sl_final.groupby('date', as_index=False).sum()
guinea_final = guinea_final.groupby('date', as_index=False).sum()
add 'country' col to distinguish the dataframes when they are put together
In [ ]:
sl_final['country'] = 'sl'
guinea_final['country'] = 'guinea'
liberia_final['country'] = 'liberia'
In [ ]:
guinea_final.head()
In [ ]:
liberia_final.head()
In [ ]:
sl_final.head()
In [ ]:
final_data = pd.concat([sl_final, guinea_final, liberia_final], ignore_index=True)
In [ ]:
final_data.sort_values(by='date').set_index(['date', 'country'])