In [1]:
from itertools import product
import altair as alt
import geopandas as gpd
import pandas as pd
import pandera as pa
import requests
from pandera import Check, Column, check_output
from pyprojroot import here
In [2]:
# Widen the notebook's row display so longer query results are fully visible.
pd.set_option("display.max_rows", 100)
In [3]:
# Load the raw Targa scrape of per-governorate COVID-19 counts.
# Path is project-relative via pyprojroot's here().
raw_covid_df = pd.read_csv(here("data/targa_covid19.csv"))
raw_covid_df.head()
Out[3]:
In [4]:
# Expected schema for the cleaned Targa frame; intended for the
# (currently commented-out) @check_output decorator on base_pipeline.
# NOTE(review): pa.DateTime / pa.Object / pa.Int are legacy pandera dtype
# aliases — confirm they are still supported by the installed version.
targa_schema = pa.DataFrameSchema(
    {
        "date": pa.Column(pa.DateTime),
        "gouvernorat": pa.Column(pa.Object),
        "cas": pa.Column(pa.Int),
        "retablis": pa.Column(pa.Int),
        "deces": pa.Column(pa.Int),
        "hospitalise": pa.Column(pa.Int),
        "nbre_tests": pa.Column(pa.Int),
        "reanimation": pa.Column(pa.Int),
    }
)
In [5]:
def format_column_names(df):
    """Normalize column names: strip French accents, snake_case spaces, lowercase."""

    def normalize(name):
        cleaned = str(name)
        for old, new in (("é", "e"), ("è", "e"), (" ", "_")):
            cleaned = cleaned.replace(old, new)
        return cleaned.lower()

    return df.rename(columns=normalize)
def drop_unnecessary_columns(df):
    """Remove the constant country column and the Arabic governorate-name duplicate."""
    redundant = ["Pays", "الولاية"]
    return df.drop(columns=redundant)
def fix_empty_strings(df):
    """Replace empty or whitespace-only string cells with NaN.

    The pattern is a raw string so that ``\\s`` is parsed by the regex
    engine, not as a (deprecated, invalid) Python string escape, which
    raises a SyntaxWarning on modern Python.
    """
    return df.replace(r"^\s*$", float("nan"), regex=True)
def set_dtypes(df):
    """Coerce every count column to numeric and parse `date` as day-first datetime."""
    value_columns = df.columns.difference(["gouvernorat", "date"])
    converted = df.assign(
        **{name: pd.to_numeric(df[name]) for name in value_columns}
    )
    # Dates in the raw file are formatted dd/mm/yyyy.
    return converted.assign(date=pd.to_datetime(converted["date"], format="%d/%m/%Y"))
def correct_typos(df):
    """Fix data-entry typos in the governorate column."""
    # Title-casing fixes 'zaghouan' -> 'Zaghouan'; then fix the Siliana misspelling.
    fixed = df["gouvernorat"].str.title().replace({"Seliana": "Siliana"})
    return df.assign(gouvernorat=fixed)
def reindex_to_full_date_range(df):
    """Aggregate per (date, governorate) and pad missing days with zero rows.

    Ensures every governorate has one row for every calendar day between
    the first and last observed dates.
    """
    dates = pd.date_range(df["date"].min(), df["date"].max())
    governorates = df["gouvernorat"].unique()
    wanted_index = [(day, gov) for day in dates for gov in governorates]
    aggregated = df.groupby(["date", "gouvernorat"]).sum()
    return aggregated.reindex(wanted_index, fill_value=0)
# @check_output(targa_schema)
def base_pipeline(df):
    """Run the full Targa cleaning pipeline over the raw frame, in order."""
    steps = (
        drop_unnecessary_columns,
        format_column_names,
        fix_empty_strings,
        correct_typos,
        set_dtypes,
        reindex_to_full_date_range,
    )
    for step in steps:
        df = step(df)
    return df
In [6]:
# Run the cleaning pipeline; reset_index turns the (date, gouvernorat)
# MultiIndex produced by reindex_to_full_date_range back into columns.
tidy_covid_df = raw_covid_df.pipe(base_pipeline).reset_index()
tidy_covid_df.head()
Out[6]:
In [156]:
# Observation window of the tidy dataset.
start = tidy_covid_df["date"].min()
end = tidy_covid_df["date"].max()
In [157]:
# Sanity-check the date range and its duration (a Timedelta).
print(start)
print(end)
print(end - start)
We expect `duration + 1` observations per governorate in our dataset (one per day, with both endpoint dates included).
In [9]:
# Mean count of distinct dates per governorate; should equal duration + 1.
tidy_covid_df.groupby("gouvernorat").agg({"date": "nunique"}).describe().loc["mean"]
Out[9]:
We expect the total number of observations per day to be 24 — the number of governorates in Tunisia.
In [10]:
# Mean count of distinct governorates per date; should equal 24.
tidy_covid_df.groupby("date").agg({"gouvernorat": "nunique"}).describe().loc["mean"]
Out[10]:
There are duplicate observations per day. Assuming that they are the result of not summing the cases for each governorate on that particular day, the pipeline should sum the values of observations for each day for each governorate.
In [159]:
# Find (date, gouvernorat) pairs with more than one observation in any
# measure column.
# NOTE(review): this cell ran out of order (In[159]); on a fresh run,
# tidy_covid_df has already been summed per (date, gouvernorat) by the
# pipeline, so this query should come back empty — confirm intent.
duplicate_govs_per_day = (
    tidy_covid_df.groupby(["date", "gouvernorat"])
    .agg("count")
    .query(
        "cas > 1 | retablis > 1 | deces > 1 | hospitalise > 1 | nbre_tests > 1 | reanimation > 1"
    )
)
duplicate_govs_per_day
Out[159]:
In [160]:
# Inspect the rows behind each duplicated (date, gouvernorat) pair.
tidy_covid_df.set_index(["date", "gouvernorat"]).sort_index().loc[
    duplicate_govs_per_day.index
]
Out[160]:
In [218]:
# Order governorates by cumulative case totals so the facet grid reads
# from most- to least-affected.
governorates_by_cum_total_cases = (
    tidy_covid_df.groupby(["gouvernorat"], as_index=False)
    .agg({"cas": "sum"})
    .sort_values("cas", ascending=False)
    .loc[:, "gouvernorat"]
    .to_list()
)
# Daily new cases per governorate, one small-multiple bar panel each.
tidy_covid_df.pipe(
    lambda df: alt.Chart(df)
    .mark_bar()
    .encode(
        x="date",
        y="cas",
        tooltip=["cas", "date"],
        facet=alt.Facet(
            "gouvernorat:N", columns=6, sort=alt.Sort(governorates_by_cum_total_cases),
        ),
    )
    .properties(height=100, width=200)
).interactive()
Out[218]:
I couldn't interpret the meaning of the negative values, so I will look into using a cumulative sum instead.
In [242]:
def highlight_negative(x):
    """Pandas Styler helper: bold red for negative values, plain black otherwise."""
    if x < 0:
        return "font-weight: bold; color: red;"
    return "color: black;"
# Show every row with at least one negative value, highlighted in red.
# NOTE(review): Styler.applymap is deprecated in recent pandas in favour
# of Styler.map — confirm against the installed version.
tidy_covid_df.set_index(["gouvernorat", "date"]).pipe(
    lambda df: df.query(" | ".join([f"{col} < 0" for col in df.columns]))
).sort_index().style.applymap(highlight_negative).format("{:n}")
Out[242]:
In [239]:
# Running (cumulative) totals over time within each governorate
# (level 0 of the MultiIndex = gouvernorat).
tidy_covid_cumu_df = (
    tidy_covid_df.groupby(["gouvernorat", "date"])
    .sum()
    .groupby(level=[0], as_index=False)
    .cumsum()
)
In [244]:
# Re-check for negative values after accumulating.
tidy_covid_cumu_df.pipe(
    lambda df: df.query(" | ".join([f"{col} < 0" for col in df.columns]))
).sort_index().style.applymap(highlight_negative).format("{:n}")
Out[244]:
The `hospitalise`, `nbre_tests` and `reanimation` columns still show negative values even when accumulated. For now I will ignore them.
In [246]:
# Drop the columns whose cumulative series still go negative.
tidy_covid_cumu_no_neg = tidy_covid_cumu_df.drop(
    columns=["hospitalise", "nbre_tests", "reanimation"]
)
In [252]:
# Cumulative series per governorate as area-chart small multiples.
# NOTE(review): the variable name and tooltip refer to cases ("cas") but
# the y-encoding plots "retablis" (recovered) — confirm which was intended.
cumulative_daily_cases_plot = (
    alt.Chart(tidy_covid_cumu_no_neg.reset_index())
    .mark_area()
    .encode(
        x="date:T",
        y="retablis:Q",
        tooltip=["cas:Q", "date:T"],
        facet=alt.Facet(
            "gouvernorat:N", columns=6, sort=alt.Sort(governorates_by_cum_total_cases),
        ),
    )
    .properties(width=200, height=120)
)
cumulative_daily_cases_plot
Out[252]:
In [257]:
# Persist the cumulative, negative-free dataset for downstream use.
tidy_covid_cumu_no_neg.to_csv(here("data/processed/targa_covid_cumulative_no_neg.csv"))
In [258]:
# Fetch the official COVID-19 table from the ArcGIS FeatureServer REST
# API (pjson format, all fields, no filter). Requires network access.
official_raw = gpd.read_file(
    "https://services6.arcgis.com/BiTAc9ApDDtL9okN/arcgis/rest/services/COVID19_Table_DATESetTOTAL/FeatureServer/0/query?where=1%3D1&objectIds=&time=&resultType=standard&outFields=*&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&sqlFormat=none&f=pjson&token="
)
In [259]:
# Quick look at the raw feature table.
official_raw.head()
Out[259]:
In [339]:
def drop_invalid_rows(df):
    """Keep only rows that carry a value in the 'Dates' column."""
    has_date = df["Dates"].notna()
    return df[has_date]
def drop_empty_columns(df):
    """Drop columns that contain no data at all."""
    return df.dropna(axis="columns", how="all")
def drop_unnecessary_columns(df):
    """Drop ArcGIS bookkeeping columns that carry no epidemiological data.

    NOTE(review): this shadows the Targa-pipeline function of the same
    name defined earlier in the notebook; that is only safe because the
    Targa pipeline has already run. Consider giving it a distinct name.
    """
    arcgis_metadata = [
        "Editor",
        "EditDate",
        "Creator",
        "CreationDate",
        "GlobalID",
        "OBJECTID",
    ]
    return df.drop(columns=arcgis_metadata)
def set_dtypes(df):
    """Convert the epoch-milliseconds 'dates' column to pandas datetimes."""
    parsed = pd.to_datetime(df["dates"], unit="ms")
    return df.assign(dates=parsed)
def fill_missing_values(df):
    """Forward-fill gaps in the series, then zero out any leading NaNs.

    Uses DataFrame.ffill() instead of fillna(method="ffill"): the
    ``method`` keyword of fillna is deprecated in pandas 2.x and removed
    in later releases. Behavior is identical.
    """
    return df.ffill().fillna(0)
def clean_column_names(df):
    """Lowercase column names and strip French accents."""

    def clean(name):
        return name.lower().replace("é", "e").replace("è", "e")

    return df.rename(columns=clean)
def official_data_pipeline(df):
    """Clean the raw ArcGIS feature table into a tidy daily frame, step by step."""
    steps = (
        drop_invalid_rows,
        drop_empty_columns,
        drop_unnecessary_columns,
        clean_column_names,
        set_dtypes,
        fill_missing_values,
    )
    for step in steps:
        df = step(df)
    return df
# Run the pipeline, then downcast the GeoDataFrame to a plain DataFrame.
tidy_official = official_raw.pipe(official_data_pipeline).pipe(
    lambda df: pd.DataFrame(df)
)
tidy_official.tail(5)
Out[339]:
In [335]:
alt.themes.enable("default")
# Stacked-area chart of active cases / recovered / deaths over time.
tidy_official.melt(
    id_vars="dates", value_vars=["cas_actifs", "retablis", "deces"]
).pipe(
    lambda df: df.assign(
        # Explicit stacking order: deces (1) at the bottom, then
        # cas_actifs (2), then retablis (3).
        ix=df["variable"].replace({"cas_actifs": 2, "retablis": 3, "deces": 1})
    )
).pipe(
    lambda df: alt.Chart(df)
    .mark_area()
    .encode(
        x="dates:T",
        y=alt.Y("value"),
        # Fix: legend sort previously listed "retablis" twice and
        # omitted "deces".
        color=alt.Color("variable", sort=["retablis", "cas_actifs", "deces"]),
        tooltip=["value", "dates"],
        order="ix",
    )
)
Out[335]:
In [337]:
# Calendar heatmap (day x month) of daily new cases from the official data.
# NOTE(review): `dow` is computed but never used in the encoding.
tidy_official.pipe(
    lambda df: df.assign(
        day=df["dates"].dt.day, month=df["dates"].dt.month, dow=df["dates"].dt.dayofweek
    )
).pipe(
    lambda df: alt.Chart(df)
    .mark_rect()
    .encode(x="day:O", y="month:O", color="nb_cas_journalier:Q",)
)
Out[337]:
In [342]:
# Persist the tidy official dataset for downstream use.
tidy_official.to_csv(here("data/processed/official_covid.csv"), index=False)
In [ ]: