In [1]:
from itertools import product

import altair as alt
import geopandas as gpd
import pandas as pd
import pandera as pa
from pandera import check_output
from pyprojroot import here

In [2]:
pd.set_option("display.max_rows", 100)

Targa Data


In [3]:
raw_covid_df = pd.read_csv(here("data/targa_covid19.csv"))
raw_covid_df.head()


Out[3]:
date Gouvernorat الولاية Cas Rétablis Décès Hospitalisé nbre tests Reanimation Pays
0 02/03/2020 Gafsa قفصة 1.0 1.0 NaN NaN 67.0 NaN Tunisie
1 08/03/2020 Mahdia مهدية 1.0 NaN NaN NaN 30.0 NaN Tunisie
2 09/03/2020 Bizerte بنزرت 1.0 NaN NaN NaN 20.0 NaN Tunisie
3 09/03/2020 Tunis تونس 1.0 NaN NaN NaN NaN NaN Tunisie
4 09/03/2020 Mahdia مهدية 1.0 1.0 NaN NaN NaN NaN Tunisie

In [4]:
targa_schema = pa.DataFrameSchema(
    {
        # Column names match the cleaned (snake_case, accent-free) headers.
        "date": pa.Column(pa.DateTime),
        "gouvernorat": pa.Column(pa.Object),
        # The cleaning pipeline's groupby().sum() with fill_value=0 yields
        # float columns, so the counts are declared as floats, not ints.
        "cas": pa.Column(pa.Float),
        "retablis": pa.Column(pa.Float),
        "deces": pa.Column(pa.Float),
        "hospitalise": pa.Column(pa.Float),
        "nbre_tests": pa.Column(pa.Float),
        "reanimation": pa.Column(pa.Float),
    }
)

Cleaning pipeline


In [5]:
def format_column_names(df):
    # Normalize headers: drop accents, snake_case the spaces, lowercase.
    return df.rename(
        columns=lambda col: str(col)
        .replace("é", "e")
        .replace("è", "e")
        .replace(" ", "_")
        .lower()
    )


def drop_unnecessary_columns(df):
    # "Pays" is constant and "الولاية" duplicates "Gouvernorat" in Arabic.
    return df.drop(columns=["Pays", "الولاية"])


def fix_empty_strings(df):
    # Treat whitespace-only cells as missing values.
    return df.replace(r"^\s*$", float("nan"), regex=True)


def set_dtypes(df):
    # Everything except the governorate name and the date is a count.
    to_numeric_columns = df.columns.difference(["gouvernorat", "date"])
    numeric = {col_name: pd.to_numeric(df[col_name]) for col_name in to_numeric_columns}
    return df.assign(**numeric).pipe(
        lambda df: df.assign(date=pd.to_datetime(df["date"], format="%d/%m/%Y"))
    )


def correct_typos(df):
    return df.assign(
        gouvernorat=df["gouvernorat"]
        .str.title()  # Fix an issue with 'Zaghouan' being entered as 'zaghouan'
        .replace({"Seliana": "Siliana"})
    )


def reindex_to_full_date_range(df):
    # Build the complete (date, gouvernorat) grid so that days with no report
    # appear as explicit zero rows rather than being absent.
    full_date_range = pd.date_range(df["date"].min(), df["date"].max())
    governorates = df["gouvernorat"].unique()
    full_date_range_index = list(product(full_date_range, governorates))
    return (
        df.groupby(["date", "gouvernorat"])
        .sum()  # also collapses duplicate reports for the same day and governorate
        .reindex(full_date_range_index, fill_value=0)
    )


# Disabled: reindex_to_full_date_range moves date and gouvernorat into the
# index, so the schema is validated manually on the reset-index frame below.
# @check_output(targa_schema)
def base_pipeline(df):
    return (
        df.pipe(drop_unnecessary_columns)
        .pipe(format_column_names)
        .pipe(fix_empty_strings)
        .pipe(correct_typos)
        .pipe(set_dtypes)
        .pipe(reindex_to_full_date_range)
    )

In [6]:
tidy_covid_df = raw_covid_df.pipe(base_pipeline).reset_index()
tidy_covid_df.head()


Out[6]:
date gouvernorat cas retablis deces hospitalise nbre_tests reanimation
0 2020-03-02 Gafsa 1.0 1.0 0.0 0.0 67.0 0.0
1 2020-03-02 Mahdia 0.0 0.0 0.0 0.0 0.0 0.0
2 2020-03-02 Bizerte 0.0 0.0 0.0 0.0 0.0 0.0
3 2020-03-02 Tunis 0.0 0.0 0.0 0.0 0.0 0.0
4 2020-03-02 Ariana 0.0 0.0 0.0 0.0 0.0 0.0
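
With the decorator on base_pipeline disabled, the schema can still be applied by hand to the reset-index frame. A minimal sketch, assuming the float-typed columns declared in targa_schema above:

# Raises a SchemaError if any column is missing or has the wrong dtype.
targa_schema.validate(tidy_covid_df)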

Checks and verifications

Date range


In [156]:
start = tidy_covid_df["date"].min()
end = tidy_covid_df["date"].max()

In [157]:
print(start)
print(end)
print(end - start)


2020-03-02 00:00:00
2020-05-14 00:00:00
73 days 00:00:00

We expect duration + 1 = 74 observations per governorate in our dataset


In [9]:
tidy_covid_df.groupby("gouvernorat").agg({"date": "nunique"}).describe().loc["mean"]


Out[9]:
date    74.0
Name: mean, dtype: float64

We expect the number of observations per day to be 24, the number of governorates in Tunisia


In [10]:
tidy_covid_df.groupby("date").agg({"gouvernorat": "nunique"}).describe().loc["mean"]


Out[10]:
gouvernorat    24.0
Name: mean, dtype: float64
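
Both expectations can also be turned into hard assertions. A minimal sketch, reusing start and end from the date-range check above:

n_days = (end - start).days + 1  # 74
n_governorates = tidy_covid_df["gouvernorat"].nunique()  # 24

# Every governorate has one row per day, and every day covers all governorates.
assert (tidy_covid_df.groupby("gouvernorat")["date"].nunique() == n_days).all()
assert (tidy_covid_df.groupby("date")["gouvernorat"].nunique() == n_governorates).all()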

Investigation of duplicate observations

The raw data contains duplicate observations per day. Assuming they are the result of cases not being summed for a governorate on a particular day, the pipeline sums the values of all observations for each day and governorate; the empty results below confirm that no duplicates remain after cleaning.
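
To illustrate the mechanism on a toy frame (illustrative values, not from the dataset): two reports for the same governorate on the same day collapse into a single summed row.

toy = pd.DataFrame(
    {
        "date": ["2020-03-09", "2020-03-09"],
        "gouvernorat": ["Tunis", "Tunis"],
        "cas": [1, 2],
    }
)
# groupby-sum leaves one row with cas == 3 for ("2020-03-09", "Tunis").
toy.groupby(["date", "gouvernorat"]).sum()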


In [159]:
duplicate_govs_per_day = (
    tidy_covid_df.groupby(["date", "gouvernorat"])
    .agg("count")
    .query(
        "cas > 1 | retablis > 1 | deces > 1 | hospitalise > 1 | nbre_tests > 1 | reanimation > 1"
    )
)
duplicate_govs_per_day


Out[159]:
cas retablis deces hospitalise nbre_tests reanimation
date gouvernorat

In [160]:
tidy_covid_df.set_index(["date", "gouvernorat"]).sort_index().loc[
    duplicate_govs_per_day.index
]


Out[160]:
cas retablis deces hospitalise nbre_tests reanimation
date gouvernorat

Visual exploration


In [218]:
governorates_by_cum_total_cases = (
    tidy_covid_df.groupby(["gouvernorat"], as_index=False)
    .agg({"cas": "sum"})
    .sort_values("cas", ascending=False)
    .loc[:, "gouvernorat"]
    .to_list()
)
tidy_covid_df.pipe(
    lambda df: alt.Chart(df)
    .mark_bar()
    .encode(
        x="date",
        y="cas",
        tooltip=["cas", "date"],
        facet=alt.Facet(
            "gouvernorat:N", columns=6, sort=alt.Sort(governorates_by_cum_total_cases),
        ),
    )
    .properties(height=100, width=200)
).interactive()


Out[218]:

I couldn't interpret the meaning of the negative values, so I will look into using a cumulative sum instead.


In [242]:
def highlight_negative(x):
    return "font-weight: bold; color: red;" if x < 0 else "color: black;"


tidy_covid_df.set_index(["gouvernorat", "date"]).pipe(
    lambda df: df.query(" | ".join([f"{col} < 0" for col in df.columns]))
).sort_index().style.applymap(highlight_negative).format("{:n}")


Out[242]:
cas retablis deces hospitalise nbre_tests reanimation
gouvernorat date
Ariana 2020-04-06 00:00:00 -1 0 0 0 0 0
2020-04-14 00:00:00 -1 0 1 0 0 0
Beja 2020-05-03 00:00:00 0 -18 0 0 0 0
2020-05-12 00:00:00 0 -1 0 0 0 0
Ben Arous 2020-04-01 00:00:00 2 0 0 0 -288 0
Bizerte 2020-04-10 00:00:00 0 0 -1 0 0 0
Gafsa 2020-05-05 00:00:00 -3 11 0 0 0 0
2020-05-06 00:00:00 0 -1 0 0 0 0
Kasserine 2020-05-05 00:00:00 -1 -1 0 0 0 0
Kebili 2020-04-11 00:00:00 -1 0 0 0 0 0
Le Kef 2020-03-31 00:00:00 -1 0 0 0 0 0
Mahdia 2020-04-22 00:00:00 -1 0 0 0 0 0
Manouba 2020-03-30 00:00:00 -1 0 0 0 0 0
2020-05-05 00:00:00 -1 9 0 0 0 0
Monastir 2020-04-05 00:00:00 -1 0 0 0 0 0
2020-04-19 00:00:00 -1 0 0 0 0 0
Nabeul 2020-04-11 00:00:00 -1 0 0 0 0 0
Sousse 2020-04-08 00:00:00 -6 0 0 0 0 0
Tozeur 2020-03-30 00:00:00 -5 0 0 0 0 0
2020-05-05 00:00:00 -1 -1 0 0 0 0
Tunis 2020-04-20 00:00:00 2 20 0 -197 878 32
2020-04-23 00:00:00 0 0 0 -4 438 -12
2020-04-24 00:00:00 3 13 0 -6 559 -1
2020-04-25 00:00:00 3 9 0 -6 429 1
2020-04-26 00:00:00 4 63 0 0 263 -2
2020-04-27 00:00:00 2 0 1 -2 396 -1
2020-04-29 00:00:00 12 15 0 -6 895 4
2020-04-30 00:00:00 0 11 0 -1 569 0
2020-05-01 00:00:00 5 7 1 -6 529 1
2020-05-03 00:00:00 1 -163 0 -12 447 -6
2020-05-04 00:00:00 2 2 0 -11 312 -2
2020-05-06 00:00:00 0 0 0 -4 416 3
2020-05-08 00:00:00 0 14 0 1 1500 -5
2020-05-09 00:00:00 0 8 0 -24 292 0
2020-05-10 00:00:00 0 5 0 -4 448 -19
2020-05-11 00:00:00 0 2 0 -2 614 0
2020-05-13 00:00:00 0 5 0 -2 555 0
Zaghouan 2020-04-28 00:00:00 0 15 0 -4 585 3
2020-05-03 00:00:00 0 -12 0 0 0 0

Cumulative daily observations


In [239]:
tidy_covid_cumu_df = (
    tidy_covid_df.groupby(["gouvernorat", "date"])
    .sum()
    .groupby(level=[0], as_index=False)  # accumulate within each governorate
    .cumsum()
)

In [244]:
tidy_covid_cumu_df.pipe(
    lambda df: df.query(" | ".join([f"{col} < 0" for col in df.columns]))
).sort_index().style.applymap(highlight_negative).format("{:n}")


Out[244]:
cas retablis deces hospitalise nbre_tests reanimation
gouvernorat date
Ben Arous 2020-04-01 00:00:00 45 0 0 0 -288 0
2020-04-02 00:00:00 49 0 0 0 -288 0
2020-04-03 00:00:00 54 0 0 0 -288 0
2020-04-04 00:00:00 56 0 1 0 -288 0
2020-04-05 00:00:00 57 0 1 0 -288 0
2020-04-06 00:00:00 61 0 1 0 -288 0
2020-04-07 00:00:00 62 0 1 0 -288 0
2020-04-08 00:00:00 63 0 1 0 -288 0
2020-04-09 00:00:00 65 0 1 0 -288 0
2020-04-10 00:00:00 66 0 1 0 -288 0
2020-04-11 00:00:00 71 0 2 0 -288 0
2020-04-12 00:00:00 74 0 2 0 -288 0
2020-04-13 00:00:00 79 0 2 0 -288 0
2020-04-14 00:00:00 81 0 2 0 -288 0
2020-04-15 00:00:00 81 0 3 0 -288 0
2020-04-16 00:00:00 87 0 3 0 -288 0
2020-04-17 00:00:00 87 0 3 0 -288 0
2020-04-18 00:00:00 87 0 3 0 -288 0
2020-04-19 00:00:00 87 0 3 0 -288 0
2020-04-20 00:00:00 91 0 3 0 -288 0
2020-04-21 00:00:00 91 0 3 0 -288 0
2020-04-22 00:00:00 91 0 3 0 -288 0
2020-04-23 00:00:00 91 0 3 0 -288 0
2020-04-24 00:00:00 92 0 3 0 -288 0
2020-04-25 00:00:00 92 0 3 0 -288 0
2020-04-26 00:00:00 93 0 4 0 -288 0
2020-04-27 00:00:00 93 0 4 0 -288 0
2020-04-28 00:00:00 94 0 4 0 -288 0
2020-04-29 00:00:00 94 0 4 0 -288 0
2020-04-30 00:00:00 94 0 4 0 -288 0
2020-05-01 00:00:00 96 0 4 0 -288 0
2020-05-02 00:00:00 96 0 4 0 -288 0
2020-05-03 00:00:00 97 35 4 0 -288 0
2020-05-04 00:00:00 97 35 4 0 -288 0
2020-05-05 00:00:00 97 62 4 0 -288 0
2020-05-06 00:00:00 97 62 4 0 -288 0
2020-05-07 00:00:00 97 71 4 0 -288 0
2020-05-08 00:00:00 97 73 4 0 -288 0
2020-05-09 00:00:00 97 74 4 0 -288 0
2020-05-10 00:00:00 97 78 4 0 -288 0
2020-05-11 00:00:00 97 78 4 0 -288 0
2020-05-12 00:00:00 97 80 4 0 -288 0
2020-05-13 00:00:00 97 81 4 0 -288 0
2020-05-14 00:00:00 97 83 4 0 -288 0
Tunis 2020-04-20 00:00:00 195 165 6 -197 15422 32
2020-04-21 00:00:00 198 165 6 -197 16118 32
2020-04-22 00:00:00 199 165 6 -197 16668 32
2020-04-23 00:00:00 199 165 6 -201 17106 20
2020-04-24 00:00:00 202 178 6 -207 17665 19
2020-04-25 00:00:00 205 187 6 -213 18094 20
2020-04-26 00:00:00 209 250 6 -213 18357 18
2020-04-27 00:00:00 211 250 7 -215 18753 17
2020-04-28 00:00:00 211 250 7 -215 18753 17
2020-04-29 00:00:00 223 265 7 -221 19648 21
2020-04-30 00:00:00 223 276 7 -222 20217 21
2020-05-01 00:00:00 228 283 8 -228 20746 22
2020-05-02 00:00:00 228 288 8 -228 21097 22
2020-05-03 00:00:00 229 125 8 -240 21544 16
2020-05-04 00:00:00 231 127 8 -251 21856 14
2020-05-05 00:00:00 232 127 8 -250 22246 15
2020-05-06 00:00:00 232 127 8 -254 22662 18
2020-05-07 00:00:00 232 147 8 -253 24097 21
2020-05-08 00:00:00 232 161 8 -252 25597 16
2020-05-09 00:00:00 232 169 8 -276 25889 16
2020-05-10 00:00:00 232 174 8 -280 26337 -3
2020-05-11 00:00:00 232 176 8 -282 26951 -3
2020-05-12 00:00:00 232 177 8 -282 27394 -3
2020-05-13 00:00:00 232 182 8 -284 27949 -3
2020-05-14 00:00:00 233 186 8 -284 28392 -3
Zaghouan 2020-04-28 00:00:00 3 15 0 -4 585 3
2020-04-29 00:00:00 3 15 0 -4 585 3
2020-04-30 00:00:00 3 15 0 -4 585 3
2020-05-01 00:00:00 3 15 0 -4 585 3
2020-05-02 00:00:00 3 15 0 -4 585 3
2020-05-03 00:00:00 3 3 0 -4 585 3
2020-05-04 00:00:00 3 3 0 -4 585 3
2020-05-05 00:00:00 3 3 0 -4 585 3
2020-05-06 00:00:00 3 3 0 -4 585 3
2020-05-07 00:00:00 3 3 0 -4 585 3
2020-05-08 00:00:00 3 3 0 -4 585 3
2020-05-09 00:00:00 3 3 0 -4 585 3
2020-05-10 00:00:00 3 3 0 -4 585 3
2020-05-11 00:00:00 3 3 0 -4 585 3
2020-05-12 00:00:00 3 3 0 -4 585 3
2020-05-13 00:00:00 3 3 0 -4 585 3
2020-05-14 00:00:00 3 3 0 -4 585 3

The hospitalise, nbre_tests and reanimation columns still show negative values even when accumulated, so for now I will drop them from the cumulative dataset.


In [246]:
tidy_covid_cumu_no_neg = tidy_covid_cumu_df.drop(
    columns=["hospitalise", "nbre_tests", "reanimation"]
)

In [252]:
cumulative_daily_cases_plot = (
    alt.Chart(tidy_covid_cumu_no_neg.reset_index())
    .mark_area()
    .encode(
        x="date:T",
        y="retablis:Q",
        tooltip=["cas:Q", "date:T"],
        facet=alt.Facet(
            "gouvernorat:N", columns=6, sort=alt.Sort(governorates_by_cum_total_cases),
        ),
    )
    .properties(width=200, height=120)
)
cumulative_daily_cases_plot


Out[252]:

Exporting results


In [257]:
tidy_covid_cumu_no_neg.to_csv(here("data/processed/targa_covid_cumulative_no_neg.csv"))

Official data


In [258]:
official_raw = gpd.read_file(
    "https://services6.arcgis.com/BiTAc9ApDDtL9okN/arcgis/rest/services/COVID19_Table_DATESetTOTAL/FeatureServer/0/query?where=1%3D1&objectIds=&time=&resultType=standard&outFields=*&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&sqlFormat=none&f=pjson&token="
)

In [259]:
official_raw.head()


Out[259]:
OBJECTID Dates Total_Confirmed Analyses Nb_test Isolement Suspects Retablis Décès importe ... Nb_tests_journalier Origine_encours cas_actifs cas_actifs2 GlobalID CreationDate Creator EditDate Editor geometry
0 1 1583280000000 1.0 None NaN NaN NaN NaN NaN 1.0 ... NaN NaN NaN None 6cfbb829-1b14-4ea7-9bb0-4cd3450281e0 1589370484037 ageos.admins 1589547557566 ageos.admins None
1 2 1583366400000 1.0 None NaN NaN NaN NaN NaN NaN ... NaN NaN NaN None 90f52142-7b24-4b3d-91ed-9f9480d8496b 1589370484037 ageos.admins 1589547557566 ageos.admins None
2 3 1583452800000 1.0 None NaN NaN NaN NaN NaN NaN ... NaN NaN NaN None c8a498b5-025f-4b56-9525-4a75376a884c 1589370484037 ageos.admins 1589547557566 ageos.admins None
3 4 1583539200000 1.0 None NaN NaN NaN NaN NaN NaN ... NaN NaN NaN None 0713a99d-7155-4b4f-a0c0-4dfdeaeab97f 1589370484037 ageos.admins 1589547557566 ageos.admins None
4 5 1583625600000 2.0 None NaN NaN NaN NaN NaN 1.0 ... NaN NaN NaN None feacc7d9-7ba5-4b6c-ad05-ddbbaef92626 1589370484037 ageos.admins 1589547557566 ageos.admins None

5 rows × 22 columns
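
Note that the Dates column holds Unix epoch timestamps in milliseconds, which is why the pipeline below parses it with unit="ms". For example, the first row:

pd.to_datetime(1_583_280_000_000, unit="ms")
# Timestamp('2020-03-04 00:00:00')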

Cleaning pipeline


In [339]:
def drop_invalid_rows(df):
    return df.dropna(subset=["Dates"])


def drop_empty_columns(df):
    return df.dropna(how="all", axis="columns")


def drop_unnecessary_columns(df):
    return df.drop(
        columns=[
            "Editor",
            "EditDate",
            "Creator",
            "CreationDate",
            "GlobalID",
            "OBJECTID",
        ]
    )


def set_dtypes(df):
    # "dates" arrives as Unix epoch milliseconds (see the raw preview above).
    return df.assign(dates=pd.to_datetime(df["dates"], unit="ms"))


def fill_missing_values(df):
    # Forward-fill the running totals, then treat anything still missing as zero.
    return df.ffill().fillna(0)


def clean_column_names(df):
    return df.rename(
        columns=lambda col: col.lower().replace("é", "e").replace("è", "e")
    )


def official_data_pipeline(df):
    return (
        df.pipe(drop_invalid_rows)
        .pipe(drop_empty_columns)
        .pipe(drop_unnecessary_columns)
        .pipe(clean_column_names)
        .pipe(set_dtypes)
        .pipe(fill_missing_values)
    )


# Convert the GeoDataFrame back to a plain pandas DataFrame.
tidy_official = official_raw.pipe(official_data_pipeline).pipe(
    lambda df: pd.DataFrame(df)
)
tidy_official.tail(5)


Out[339]:
dates total_confirmed nb_test isolement suspects retablis deces importe autochtone nb_cas_journalier nb_tests_journalier origine_encours cas_actifs
68 2020-05-10 23:00:00 1032.0 33880.0 19122.0 5.0 740.0 45.0 246.0 784.0 0.0 614.0 0.0 247.0
69 2020-05-11 23:00:00 1032.0 34323.0 19122.0 5.0 759.0 45.0 246.0 784.0 0.0 443.0 0.0 228.0
70 2020-05-12 23:00:00 1032.0 36523.0 19122.0 3.0 770.0 45.0 247.0 785.0 0.0 2200.0 0.0 217.0
71 2020-05-13 23:00:00 1035.0 37862.0 19122.0 3.0 802.0 45.0 247.0 785.0 3.0 1339.0 0.0 188.0
72 2020-05-14 23:00:00 1037.0 39778.0 19122.0 3.0 807.0 45.0 247.0 785.0 2.0 1916.0 0.0 185.0
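
The total_confirmed column appears to be a running total, so a quick sanity check (a sketch, assuming the column is indeed cumulative) is that it never decreases:

# Fails loudly if any daily total is lower than the previous one.
assert tidy_official["total_confirmed"].is_monotonic_increasing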

In [335]:
alt.themes.enable("default")
tidy_official.melt(
    id_vars="dates", value_vars=["cas_actifs", "retablis", "deces"]
).pipe(
    lambda df: df.assign(
        # Stacking order for the area chart: deces (1) at the bottom,
        # then cas_actifs (2), then retablis (3) on top.
        ix=df["variable"].replace({"cas_actifs": 2, "retablis": 3, "deces": 1})
    )
).pipe(
    lambda df: alt.Chart(df)
    .mark_area()
    .encode(
        x="dates:T",
        y=alt.Y("value"),
        color=alt.Color("variable", sort=["retablis", "cas_actifs", "retablis"]),
        tooltip=["value", "dates"],
        order="ix",
    )
)


Out[335]:

In [337]:
tidy_official.pipe(
    lambda df: df.assign(
        day=df["dates"].dt.day, month=df["dates"].dt.month, dow=df["dates"].dt.dayofweek
    )
).pipe(
    lambda df: alt.Chart(df)
    .mark_rect()
    .encode(x="day:O", y="month:O", color="nb_cas_journalier:Q",)
)


Out[337]:

Export data


In [342]:
tidy_official.to_csv(here("data/processed/official_covid.csv"), index=False)
