In [1]:
from itertools import product

import altair as alt
import geopandas as gpd
import pandas as pd
import pandera as pa
from pandera import check_output
from pyprojroot import here

In [2]:
pd.set_option("display.max_rows", 100)

Targa Data


In [3]:
raw_covid_df = pd.read_csv(here("data/targa_covid19.csv"))
raw_covid_df.head()


Out[3]:
date Gouvernorat الولاية Cas Rétablis Décès Hospitalisé nbre tests Reanimation Pays
0 02/03/2020 Gafsa قفصة 1.0 1.0 NaN NaN 67.0 NaN Tunisie
1 08/03/2020 Mahdia مهدية 1.0 NaN NaN NaN 30.0 NaN Tunisie
2 09/03/2020 Bizerte بنزرت 1.0 NaN NaN NaN 20.0 NaN Tunisie
3 09/03/2020 Tunis تونس 1.0 NaN NaN NaN NaN NaN Tunisie
4 09/03/2020 Mahdia مهدية 1.0 1.0 NaN NaN NaN NaN Tunisie

In [4]:
targa_schema = pa.DataFrameSchema(
    {
        # Column names match the cleaned (snake_case, accent-free) headers.
        "date": pa.Column(pa.DateTime),
        "gouvernorat": pa.Column(pa.Object),
        # The cleaning pipeline's groupby().sum() with fill_value=0 yields
        # float columns, so the counts are declared as floats, not ints.
        "cas": pa.Column(pa.Float),
        "retablis": pa.Column(pa.Float),
        "deces": pa.Column(pa.Float),
        "hospitalise": pa.Column(pa.Float),
        "nbre_tests": pa.Column(pa.Float),
        "reanimation": pa.Column(pa.Float),
    }
)

Cleaning pipeline


In [5]:
def format_column_names(df):
    # Normalize headers: drop accents, snake_case the spaces, lowercase.
    return df.rename(
        columns=lambda col: str(col)
        .replace("é", "e")
        .replace("è", "e")
        .replace(" ", "_")
        .lower()
    )


def drop_unnecessary_columns(df):
    # "Pays" is constant and "الولاية" duplicates "Gouvernorat" in Arabic.
    return df.drop(columns=["Pays", "الولاية"])


def fix_empty_strings(df):
    # Treat whitespace-only cells as missing values.
    return df.replace(r"^\s*$", float("nan"), regex=True)


def set_dtypes(df):
    # Everything except the governorate name and the date is a count.
    to_numeric_columns = df.columns.difference(["gouvernorat", "date"])
    numeric = {col_name: pd.to_numeric(df[col_name]) for col_name in to_numeric_columns}
    return df.assign(**numeric).pipe(
        lambda df: df.assign(date=pd.to_datetime(df["date"], format="%d/%m/%Y"))
    )


def correct_typos(df):
    return df.assign(
        gouvernorat=df["gouvernorat"]
        .str.title()  # Fix an issue with 'Zaghouan' being entered as 'zaghouan'
        .replace({"Seliana": "Siliana"})
    )


def reindex_to_full_date_range(df):
    # Build the complete (date, gouvernorat) grid so that days with no report
    # appear as explicit zero rows rather than being absent.
    full_date_range = pd.date_range(df["date"].min(), df["date"].max())
    governorates = df["gouvernorat"].unique()
    full_date_range_index = list(product(full_date_range, governorates))
    return (
        df.groupby(["date", "gouvernorat"])
        .sum()  # also collapses duplicate reports for the same day and governorate
        .reindex(full_date_range_index, fill_value=0)
    )


# Disabled: reindex_to_full_date_range moves date and gouvernorat into the
# index, so the schema is validated manually on the reset-index frame below.
# @check_output(targa_schema)
def base_pipeline(df):
    return (
        df.pipe(drop_unnecessary_columns)
        .pipe(format_column_names)
        .pipe(fix_empty_strings)
        .pipe(correct_typos)
        .pipe(set_dtypes)
        .pipe(reindex_to_full_date_range)
    )

In [6]:
tidy_covid_df = raw_covid_df.pipe(base_pipeline).reset_index()
tidy_covid_df.head()


Out[6]:
date gouvernorat cas retablis deces hospitalise nbre_tests reanimation
0 2020-03-02 Gafsa 1.0 1.0 0.0 0.0 67.0 0.0
1 2020-03-02 Mahdia 0.0 0.0 0.0 0.0 0.0 0.0
2 2020-03-02 Bizerte 0.0 0.0 0.0 0.0 0.0 0.0
3 2020-03-02 Tunis 0.0 0.0 0.0 0.0 0.0 0.0
4 2020-03-02 Ariana 0.0 0.0 0.0 0.0 0.0 0.0
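
With the decorator on base_pipeline disabled, the schema can still be applied by hand to the reset-index frame. A minimal sketch, assuming the float-typed columns declared in targa_schema above:

# Raises a SchemaError if any column is missing or has the wrong dtype.
targa_schema.validate(tidy_covid_df)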

Checks and verifications

Date range


In [156]:
start = tidy_covid_df["date"].min()
end = tidy_covid_df["date"].max()

In [157]:
print(start)
print(end)
print(end - start)


2020-03-02 00:00:00
2020-05-14 00:00:00
73 days 00:00:00

We expect duration + 1 = 74 observations per governorate in our dataset


In [9]:
tidy_covid_df.groupby("gouvernorat").agg({"date": "nunique"}).describe().loc["mean"]


Out[9]:
date    74.0
Name: mean, dtype: float64

We expect the number of observations per day to be 24, the number of governorates in Tunisia


In [10]:
tidy_covid_df.groupby("date").agg({"gouvernorat": "nunique"}).describe().loc["mean"]


Out[10]:
gouvernorat    24.0
Name: mean, dtype: float64
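
Both expectations can also be turned into hard assertions. A minimal sketch, reusing start and end from the date-range check above:

n_days = (end - start).days + 1  # 74
n_governorates = tidy_covid_df["gouvernorat"].nunique()  # 24

# Every governorate has one row per day, and every day covers all governorates.
assert (tidy_covid_df.groupby("gouvernorat")["date"].nunique() == n_days).all()
assert (tidy_covid_df.groupby("date")["gouvernorat"].nunique() == n_governorates).all()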

Investigation of duplicate observations

The raw data contains duplicate observations per day. Assuming they are the result of cases not being summed for a governorate on a particular day, the pipeline sums the values of all observations for each day and governorate; the empty results below confirm that no duplicates remain after cleaning.
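
To illustrate the mechanism on a toy frame (illustrative values, not from the dataset): two reports for the same governorate on the same day collapse into a single summed row.

toy = pd.DataFrame(
    {
        "date": ["2020-03-09", "2020-03-09"],
        "gouvernorat": ["Tunis", "Tunis"],
        "cas": [1, 2],
    }
)
# groupby-sum leaves one row with cas == 3 for ("2020-03-09", "Tunis").
toy.groupby(["date", "gouvernorat"]).sum()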


In [159]:
duplicate_govs_per_day = (
    tidy_covid_df.groupby(["date", "gouvernorat"])
    .agg("count")
    .query(
        "cas > 1 | retablis > 1 | deces > 1 | hospitalise > 1 | nbre_tests > 1 | reanimation > 1"
    )
)
duplicate_govs_per_day


Out[159]:
cas retablis deces hospitalise nbre_tests reanimation
date gouvernorat

In [160]:
tidy_covid_df.set_index(["date", "gouvernorat"]).sort_index().loc[
    duplicate_govs_per_day.index
]


Out[160]:
cas retablis deces hospitalise nbre_tests reanimation
date gouvernorat

Visual exploration


In [218]:
governorates_by_cum_total_cases = (
    tidy_covid_df.groupby(["gouvernorat"], as_index=False)
    .agg({"cas": "sum"})
    .sort_values("cas", ascending=False)
    .loc[:, "gouvernorat"]
    .to_list()
)
tidy_covid_df.pipe(
    lambda df: alt.Chart(df)
    .mark_bar()
    .encode(
        x="date",
        y="cas",
        tooltip=["cas", "date"],
        facet=alt.Facet(
            "gouvernorat:N", columns=6, sort=alt.Sort(governorates_by_cum_total_cases),
        ),
    )
    .properties(height=100, width=200)
).interactive()


Out[218]:

I couldn't interpret the meaning of the negative values, so I will look into using a cumulative sum instead.


In [242]:
def highlight_negative(x):
    return "font-weight: bold; color: red;" if x < 0 else "color: black;"


tidy_covid_df.set_index(["gouvernorat", "date"]).pipe(
    lambda df: df.query(" | ".join([f"{col} < 0" for col in df.columns]))
).sort_index().style.applymap(highlight_negative).format("{:n}")


Out[242]:
cas retablis deces hospitalise nbre_tests reanimation
gouvernorat date
Ariana 2020-04-06 00:00:00 -1 0 0 0 0 0
2020-04-14 00:00:00 -1 0 1 0 0 0
Beja 2020-05-03 00:00:00 0 -18 0 0 0 0
2020-05-12 00:00:00 0 -1 0 0 0 0
Ben Arous 2020-04-01 00:00:00 2 0 0 0 -288 0
Bizerte 2020-04-10 00:00:00 0 0 -1 0 0 0
Gafsa 2020-05-05 00:00:00 -3 11 0 0 0 0
2020-05-06 00:00:00 0 -1 0 0 0 0
Kasserine 2020-05-05 00:00:00 -1 -1 0 0 0 0
Kebili 2020-04-11 00:00:00 -1 0 0 0 0 0
Le Kef 2020-03-31 00:00:00 -1 0 0 0 0 0
Mahdia 2020-04-22 00:00:00 -1 0 0 0 0 0
Manouba 2020-03-30 00:00:00 -1 0 0 0 0 0
2020-05-05 00:00:00 -1 9 0 0 0 0
Monastir 2020-04-05 00:00:00 -1 0 0 0 0 0
2020-04-19 00:00:00 -1 0 0 0 0 0
Nabeul 2020-04-11 00:00:00 -1 0 0 0 0 0
Sousse 2020-04-08 00:00:00 -6 0 0 0 0 0
Tozeur 2020-03-30 00:00:00 -5 0 0 0 0 0
2020-05-05 00:00:00 -1 -1 0 0 0 0
Tunis 2020-04-20 00:00:00 2 20 0 -197 878 32
2020-04-23 00:00:00 0 0 0 -4 438 -12
2020-04-24 00:00:00 3 13 0 -6 559 -1
2020-04-25 00:00:00 3 9 0 -6 429 1
2020-04-26 00:00:00 4 63 0 0 263 -2
2020-04-27 00:00:00 2 0 1 -2 396 -1
2020-04-29 00:00:00 12 15 0 -6 895 4
2020-04-30 00:00:00 0 11 0 -1 569 0
2020-05-01 00:00:00 5 7 1 -6 529 1
2020-05-03 00:00:00 1 -163 0 -12 447 -6
2020-05-04 00:00:00 2 2 0 -11 312 -2
2020-05-06 00:00:00 0 0 0 -4 416 3
2020-05-08 00:00:00 0 14 0 1 1500 -5
2020-05-09 00:00:00 0 8 0 -24 292 0
2020-05-10 00:00:00 0 5 0 -4 448 -19
2020-05-11 00:00:00 0 2 0 -2 614 0
2020-05-13 00:00:00 0 5 0 -2 555 0
Zaghouan 2020-04-28 00:00:00 0 15 0 -4 585 3
2020-05-03 00:00:00 0 -12 0 0 0 0

Cumulative daily observations


In [239]:
tidy_covid_cumu_df = (
    tidy_covid_df.groupby(["gouvernorat", "date"])
    .sum()
    .groupby(level=[0], as_index=False)  # accumulate within each governorate
    .cumsum()
)

In [244]:
tidy_covid_cumu_df.pipe(
    lambda df: df.query(" | ".join([f"{col} < 0" for col in df.columns]))
).sort_index().style.applymap(highlight_negative).format("{:n}")


Out[244]:
cas retablis deces hospitalise nbre_tests reanimation
gouvernorat date
Ben Arous 2020-04-01 00:00:00 45 0 0 0 -288 0
2020-04-02 00:00:00 49 0 0 0 -288 0
2020-04-03 00:00:00 54 0 0 0 -288 0
2020-04-04 00:00:00 56 0 1 0 -288 0
2020-04-05 00:00:00 57 0 1 0 -288 0
2020-04-06 00:00:00 61 0 1 0 -288 0
2020-04-07 00:00:00 62 0 1 0 -288 0
2020-04-08 00:00:00 63 0 1 0 -288 0
2020-04-09 00:00:00 65 0 1 0 -288 0
2020-04-10 00:00:00 66 0 1 0 -288 0
2020-04-11 00:00:00 71 0 2 0 -288 0
2020-04-12 00:00:00 74 0 2 0 -288 0
2020-04-13 00:00:00 79 0 2 0 -288 0
2020-04-14 00:00:00 81 0 2 0 -288 0
2020-04-15 00:00:00 81 0 3 0 -288 0
2020-04-16 00:00:00 87 0 3 0 -288 0
2020-04-17 00:00:00 87 0 3 0 -288 0
2020-04-18 00:00:00 87 0 3 0 -288 0
2020-04-19 00:00:00 87 0 3 0 -288 0
2020-04-20 00:00:00 91 0 3 0 -288 0
2020-04-21 00:00:00 91 0 3 0 -288 0
2020-04-22 00:00:00 91 0 3 0 -288 0
2020-04-23 00:00:00 91 0 3 0 -288 0
2020-04-24 00:00:00 92 0 3 0 -288 0
2020-04-25 00:00:00 92 0 3 0 -288 0
2020-04-26 00:00:00 93 0 4 0 -288 0
2020-04-27 00:00:00 93 0 4 0 -288 0
2020-04-28 00:00:00 94 0 4 0 -288 0
2020-04-29 00:00:00 94 0 4 0 -288 0
2020-04-30 00:00:00 94 0 4 0 -288 0
2020-05-01 00:00:00 96 0 4 0 -288 0
2020-05-02 00:00:00 96 0 4 0 -288 0
2020-05-03 00:00:00 97 35 4 0 -288 0
2020-05-04 00:00:00 97 35 4 0 -288 0
2020-05-05 00:00:00 97 62 4 0 -288 0
2020-05-06 00:00:00 97 62 4 0 -288 0
2020-05-07 00:00:00 97 71 4 0 -288 0
2020-05-08 00:00:00 97 73 4 0 -288 0
2020-05-09 00:00:00 97 74 4 0 -288 0
2020-05-10 00:00:00 97 78 4 0 -288 0
2020-05-11 00:00:00 97 78 4 0 -288 0
2020-05-12 00:00:00 97 80 4 0 -288 0
2020-05-13 00:00:00 97 81 4 0 -288 0
2020-05-14 00:00:00 97 83 4 0 -288 0
Tunis 2020-04-20 00:00:00 195 165 6 -197 15422 32
2020-04-21 00:00:00 198 165 6 -197 16118 32
2020-04-22 00:00:00 199 165 6 -197 16668 32
2020-04-23 00:00:00 199 165 6 -201 17106 20
2020-04-24 00:00:00 202 178 6 -207 17665 19
2020-04-25 00:00:00 205 187 6 -213 18094 20
2020-04-26 00:00:00 209 250 6 -213 18357 18
2020-04-27 00:00:00 211 250 7 -215 18753 17
2020-04-28 00:00:00 211 250 7 -215 18753 17
2020-04-29 00:00:00 223 265 7 -221 19648 21
2020-04-30 00:00:00 223 276 7 -222 20217 21
2020-05-01 00:00:00 228 283 8 -228 20746 22
2020-05-02 00:00:00 228 288 8 -228 21097 22
2020-05-03 00:00:00 229 125 8 -240 21544 16
2020-05-04 00:00:00 231 127 8 -251 21856 14
2020-05-05 00:00:00 232 127 8 -250 22246 15
2020-05-06 00:00:00 232 127 8 -254 22662 18
2020-05-07 00:00:00 232 147 8 -253 24097 21
2020-05-08 00:00:00 232 161 8 -252 25597 16
2020-05-09 00:00:00 232 169 8 -276 25889 16
2020-05-10 00:00:00 232 174 8 -280 26337 -3
2020-05-11 00:00:00 232 176 8 -282 26951 -3
2020-05-12 00:00:00 232 177 8 -282 27394 -3
2020-05-13 00:00:00 232 182 8 -284 27949 -3
2020-05-14 00:00:00 233 186 8 -284 28392 -3
Zaghouan 2020-04-28 00:00:00 3 15 0 -4 585 3
2020-04-29 00:00:00 3 15 0 -4 585 3
2020-04-30 00:00:00 3 15 0 -4 585 3
2020-05-01 00:00:00 3 15 0 -4 585 3
2020-05-02 00:00:00 3 15 0 -4 585 3
2020-05-03 00:00:00 3 3 0 -4 585 3
2020-05-04 00:00:00 3 3 0 -4 585 3
2020-05-05 00:00:00 3 3 0 -4 585 3
2020-05-06 00:00:00 3 3 0 -4 585 3
2020-05-07 00:00:00 3 3 0 -4 585 3
2020-05-08 00:00:00 3 3 0 -4 585 3
2020-05-09 00:00:00 3 3 0 -4 585 3
2020-05-10 00:00:00 3 3 0 -4 585 3
2020-05-11 00:00:00 3 3 0 -4 585 3
2020-05-12 00:00:00 3 3 0 -4 585 3
2020-05-13 00:00:00 3 3 0 -4 585 3
2020-05-14 00:00:00 3 3 0 -4 585 3

The hospitalise, nbre_tests and reanimation columns still show negative values even when accumulated, so for now I will drop them from the cumulative dataset.


In [246]:
tidy_covid_cumu_no_neg = tidy_covid_cumu_df.drop(
    columns=["hospitalise", "nbre_tests", "reanimation"]
)

In [252]:
cumulative_daily_cases_plot = (
    alt.Chart(tidy_covid_cumu_no_neg.reset_index())
    .mark_area()
    .encode(
        x="date:T",
        y="retablis:Q",
        tooltip=["cas:Q", "date:T"],
        facet=alt.Facet(
            "gouvernorat:N", columns=6, sort=alt.Sort(governorates_by_cum_total_cases),
        ),
    )
    .properties(width=200, height=120)
)
cumulative_daily_cases_plot


Out[252]:

Exporting results


In [257]:
tidy_covid_cumu_no_neg.to_csv(here("data/processed/targa_covid_cumulative_no_neg.csv"))

Official data


In [258]:
official_raw = gpd.read_file(
    "https://services6.arcgis.com/BiTAc9ApDDtL9okN/arcgis/rest/services/COVID19_Table_DATESetTOTAL/FeatureServer/0/query?where=1%3D1&objectIds=&time=&resultType=standard&outFields=*&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&sqlFormat=none&f=pjson&token="
)

In [259]:
official_raw.head()


Out[259]:
OBJECTID Dates Total_Confirmed Analyses Nb_test Isolement Suspects Retablis Décès importe ... Nb_tests_journalier Origine_encours cas_actifs cas_actifs2 GlobalID CreationDate Creator EditDate Editor geometry
0 1 1583280000000 1.0 None NaN NaN NaN NaN NaN 1.0 ... NaN NaN NaN None 6cfbb829-1b14-4ea7-9bb0-4cd3450281e0 1589370484037 ageos.admins 1589547557566 ageos.admins None
1 2 1583366400000 1.0 None NaN NaN NaN NaN NaN NaN ... NaN NaN NaN None 90f52142-7b24-4b3d-91ed-9f9480d8496b 1589370484037 ageos.admins 1589547557566 ageos.admins None
2 3 1583452800000 1.0 None NaN NaN NaN NaN NaN NaN ... NaN NaN NaN None c8a498b5-025f-4b56-9525-4a75376a884c 1589370484037 ageos.admins 1589547557566 ageos.admins None
3 4 1583539200000 1.0 None NaN NaN NaN NaN NaN NaN ... NaN NaN NaN None 0713a99d-7155-4b4f-a0c0-4dfdeaeab97f 1589370484037 ageos.admins 1589547557566 ageos.admins None
4 5 1583625600000 2.0 None NaN NaN NaN NaN NaN 1.0 ... NaN NaN NaN None feacc7d9-7ba5-4b6c-ad05-ddbbaef92626 1589370484037 ageos.admins 1589547557566 ageos.admins None

5 rows × 22 columns
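
Note that the Dates column holds Unix epoch timestamps in milliseconds, which is why the pipeline below parses it with unit="ms". For example, the first row:

pd.to_datetime(1_583_280_000_000, unit="ms")
# Timestamp('2020-03-04 00:00:00')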

Cleaning pipeline


In [339]:
def drop_invalid_rows(df):
    return df.dropna(subset=["Dates"])


def drop_empty_columns(df):
    return df.dropna(how="all", axis="columns")


def drop_unnecessary_columns(df):
    return df.drop(
        columns=[
            "Editor",
            "EditDate",
            "Creator",
            "CreationDate",
            "GlobalID",
            "OBJECTID",
        ]
    )


def set_dtypes(df):
    # "dates" arrives as Unix epoch milliseconds (see the raw preview above).
    return df.assign(dates=pd.to_datetime(df["dates"], unit="ms"))


def fill_missing_values(df):
    # Forward-fill the running totals, then treat anything still missing as zero.
    return df.ffill().fillna(0)


def clean_column_names(df):
    return df.rename(
        columns=lambda col: col.lower().replace("é", "e").replace("è", "e")
    )


def official_data_pipeline(df):
    return (
        df.pipe(drop_invalid_rows)
        .pipe(drop_empty_columns)
        .pipe(drop_unnecessary_columns)
        .pipe(clean_column_names)
        .pipe(set_dtypes)
        .pipe(fill_missing_values)
    )


# Convert the GeoDataFrame back to a plain pandas DataFrame.
tidy_official = official_raw.pipe(official_data_pipeline).pipe(
    lambda df: pd.DataFrame(df)
)
tidy_official.tail(5)


Out[339]:
dates total_confirmed nb_test isolement suspects retablis deces importe autochtone nb_cas_journalier nb_tests_journalier origine_encours cas_actifs
68 2020-05-10 23:00:00 1032.0 33880.0 19122.0 5.0 740.0 45.0 246.0 784.0 0.0 614.0 0.0 247.0
69 2020-05-11 23:00:00 1032.0 34323.0 19122.0 5.0 759.0 45.0 246.0 784.0 0.0 443.0 0.0 228.0
70 2020-05-12 23:00:00 1032.0 36523.0 19122.0 3.0 770.0 45.0 247.0 785.0 0.0 2200.0 0.0 217.0
71 2020-05-13 23:00:00 1035.0 37862.0 19122.0 3.0 802.0 45.0 247.0 785.0 3.0 1339.0 0.0 188.0
72 2020-05-14 23:00:00 1037.0 39778.0 19122.0 3.0 807.0 45.0 247.0 785.0 2.0 1916.0 0.0 185.0
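
The total_confirmed column appears to be a running total, so a quick sanity check (a sketch, assuming the column is indeed cumulative) is that it never decreases:

# Fails loudly if any daily total is lower than the previous one.
assert tidy_official["total_confirmed"].is_monotonic_increasing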

In [335]:
alt.themes.enable("default")
tidy_official.melt(
    id_vars="dates", value_vars=["cas_actifs", "retablis", "deces"]
).pipe(
    lambda df: df.assign(
        # Stacking order for the area chart: deces (1) at the bottom,
        # then cas_actifs (2), then retablis (3) on top.
        ix=df["variable"].replace({"cas_actifs": 2, "retablis": 3, "deces": 1})
    )
).pipe(
    lambda df: alt.Chart(df)
    .mark_area()
    .encode(
        x="dates:T",
        y=alt.Y("value"),
        color=alt.Color("variable", sort=["retablis", "cas_actifs", "retablis"]),
        tooltip=["value", "dates"],
        order="ix",
    )
)


Out[335]:

In [337]:
tidy_official.pipe(
    lambda df: df.assign(
        day=df["dates"].dt.day, month=df["dates"].dt.month, dow=df["dates"].dt.dayofweek
    )
).pipe(
    lambda df: alt.Chart(df)
    .mark_rect()
    .encode(x="day:O", y="month:O", color="nb_cas_journalier:Q",)
)


Out[337]:

Export data


In [342]:
tidy_official.to_csv(here("data/processed/official_covid.csv"), index=False)
