In [1]:
import pandas as pd

Explore & clean disease data from MOH

Extract data from all excel sheets


In [36]:
weekly_disease = pd.read_excel("./../data/raw/disease_SG/weekly-infectious-bulletin_caseswk09y2019.xlsx", sheet_name=None, na_values="na", skiprows=1)

In [122]:
for key in weekly_disease.keys():
    print(key)


Year 2019
Year 2018 
Year 2017 
Year 2016
Year 2015
Year 2014
Year 2013
Year 2012

In [55]:
type(weekly_disease)


Out[55]:
collections.OrderedDict

explore data


In [124]:
df_2012 = weekly_disease.get('Year 2012')

In [125]:
df_2012.columns


Out[125]:
Index(['Epidemiology Wk', 'Start', 'End', 'Cholera', 'Paratyphoid ',
       'Typhoid ', 'Viral Hepatitis A', 'Viral Hepatitis E', 'Poliomyelitis',
       'Plague', 'Yellow Fever', 'Dengue Fever', 'Dengue Haemorrhagic Fever',
       'Malaria', 'Chikungunya', 'Hand, Foot and Mouth Disease', 'Diphtheria',
       'Measles', 'Mumps  ', 'Rubella', 'SARS', 'Nipah',
       'Acute Viral hepatitis B', 'Encephalitis', 'Legionellosis',
       'Campylobacteriosis', 'Acute Viral hepatitis C', 'Melioidosis',
       'Meningococcal Infection', 'Pertussis',
       'Pneumococcal Disease (invasive)', 'Haemophilus influenzae type b',
       'Salmonellosis(non-enteric fevers)', 'Avian Influenza',
       'Acute Upper Respiratory Tract infections', 'Acute Conjunctivitis',
       'Acute Diarrhoea', 'Chickenpox'],
      dtype='object')

In [126]:
df_2012.head(3)


Out[126]:
Epidemiology Wk Start End Cholera Paratyphoid Typhoid Viral Hepatitis A Viral Hepatitis E Poliomyelitis Plague ... Meningococcal Infection Pertussis Pneumococcal Disease (invasive) Haemophilus influenzae type b Salmonellosis(non-enteric fevers) Avian Influenza Acute Upper Respiratory Tract infections Acute Conjunctivitis Acute Diarrhoea Chickenpox
0 1 2012-01-01 2012-01-07 0 1 6 1 3 0 0 ... 0 2 3 0 26 0 2932.222222 119.777778 490.888889 18.000000
1 2 2012-01-08 2012-01-14 0 1 3 2 0 0 0 ... 0 0 3 0 24 0 3188.727273 113.818182 575.090909 19.090909
2 3 2012-01-15 2012-01-21 0 1 5 3 0 0 0 ... 0 5 3 0 21 0 3184.545455 106.000000 538.909091 16.363636

3 rows × 38 columns

Get diseases transmitted by mosquitoes


In [48]:
df_2019 = df_2019[['Epidemiology Wk', 'Yellow Fever', 'Dengue ', 'DHF', 'Malaria', 'Chikungunya', 'Zika']]

In [50]:
df_2019.head(10)


Out[50]:
Epidemiology Wk Yellow Fever Dengue DHF Malaria Chikungunya Zika
0 1 0.0 205.0 2.0 1.0 1.0 0.0
1 2 0.0 245.0 0.0 0.0 0.0 0.0
2 3 0.0 207.0 8.0 3.0 2.0 2.0
3 4 0.0 221.0 1.0 0.0 1.0 0.0
4 5 0.0 179.0 3.0 1.0 0.0 0.0
5 6 0.0 135.0 3.0 0.0 0.0 1.0
6 7 0.0 232.0 3.0 2.0 0.0 0.0
7 8 0.0 181.0 1.0 0.0 1.0 0.0
8 9 0.0 157.0 1.0 0.0 2.0 0.0
9 10 NaN NaN NaN NaN NaN NaN

In [54]:
df_2019 = df_2019.dropna(thresh=2)
df_2019.shape


Out[54]:
(9, 7)

Process to clean data


In [101]:
df_combined = pd.DataFrame()
for key, value in weekly_disease.items():
    year = int(key.split()[1])
    if 'DHF' in value.columns:
        df = value[['Epidemiology Wk', 'Dengue ', 'DHF', 'Malaria']]
    else:
        df = value[['Epidemiology Wk', 'Dengue Fever', 'Dengue Haemorrhagic Fever', 'Malaria']]
        df = df.rename(index=str, columns={"Dengue Fever": "Dengue ", "Dengue Haemorrhagic Fever": "DHF"})
    df.dropna(thresh=3, inplace=True)
    df['year'] = year
    df_combined = pd.concat([df_combined,df], ignore_index=True)

In [102]:
df_combined.head()


Out[102]:
Epidemiology Wk Dengue DHF Malaria year
0 1.0 205.0 2.0 1.0 2019
1 2.0 245.0 0.0 0.0 2019
2 3.0 207.0 8.0 3.0 2019
3 4.0 221.0 1.0 0.0 2019
4 5.0 179.0 3.0 1.0 2019

In [103]:
df_combined.shape


Out[103]:
(374, 5)

In [104]:
df_combined.rename(str.strip, axis='columns', inplace=True)

In [105]:
df_combined.columns


Out[105]:
Index(['Epidemiology Wk', 'Dengue', 'DHF', 'Malaria', 'year'], dtype='object')

In [106]:
df_combined = df_combined.rename(index=str, columns={"Epidemiology Wk": "week"})
df_combined.head()


Out[106]:
week Dengue DHF Malaria year
0 1.0 205.0 2.0 1.0 2019
1 2.0 245.0 0.0 0.0 2019
2 3.0 207.0 8.0 3.0 2019
3 4.0 221.0 1.0 0.0 2019
4 5.0 179.0 3.0 1.0 2019

In [107]:
df_combined.fillna(0, inplace=True)

In [109]:
df_combined = df_combined.applymap(lambda x: int(x))

In [115]:
df_combined['Dengue'] = df_combined['Dengue'] + df_combined['DHF']

In [117]:
df_combined.drop('DHF', axis=1, inplace=True)

In [119]:
df_combined = df_combined[['year', 'week', 'Dengue', 'Malaria']]
df_combined.head()


Out[119]:
year week Dengue Malaria
0 2019 1 207 1
1 2019 2 245 0
2 2019 3 215 3
3 2019 4 222 0
4 2019 5 182 1

In [118]:
df_combined.columns


Out[118]:
Index(['week', 'Dengue', 'Malaria', 'year'], dtype='object')

In [121]:
print(df_combined['year'].min())
print(df_combined['year'].max())


2012
2019

Weather

Convert weather data into weekly data.


In [152]:
import os
import numpy as np

In [147]:
INPUT_DIR = '../data/raw/weather_SG'

In [134]:
files = os.listdir(INPUT_DIR)

In [135]:
files[:3]


Out[135]:
['S115_201601.csv', 'S108_201308.csv', 'S104_201603.csv']

In [136]:
stations = [x.split('_')[0] for x in files]

In [138]:
stations = list(set(stations))

In [139]:
stations


Out[139]:
['S43',
 'S115',
 'S24',
 'S107',
 'S108',
 'S86',
 'S60',
 'S44',
 'S106',
 'S104',
 'S109',
 'S102',
 'S50',
 'S25',
 'S80',
 'S111',
 'S23']

In [140]:
len(stations)


Out[140]:
17

In [142]:
station_files = [x for x in files if x.startswith('S23')]
len(station_files)


Out[142]:
86

In [144]:
station_files.sort()

In [ ]:
for filename in station_files:

In [150]:
file_path = os.path.join(INPUT_DIR, 'S23_201201.csv')
dfWeather = pd.read_csv(file_path,encoding='latin1')

In [156]:
dfWeather.columns


Out[156]:
Index(['Station', 'Year', 'Month', 'Day', 'Daily Rainfall Total (mm)',
       'Highest 30 Min Rainfall (mm)', 'Highest 60 Min Rainfall (mm)',
       'Highest 120 Min Rainfall (mm)', 'Mean Temperature (°C)',
       'Maximum Temperature (°C)', 'Minimum Temperature (°C)',
       'Mean Wind Speed (km/h)', 'Max Wind Speed (km/h)'],
      dtype='object')

In [169]:
dfWeather = dfWeather.rename(index=str, columns=dict_rename)

In [202]:
col_numeric = ['year', 'month', 'day', 'Rainfall Total',
       'Max 30Min Rainfall', 'Max 60Min Rainfall',
       'Max 120Min Rainfall', 'Mean Temperature',
       'Max Temperature', 'Min Temperature',
       'Mean Wind Speed', 'Max Wind Speed']
for col in col_numeric:
    dfWeather[col] = pd.to_numeric(dfWeather[col], errors='coerce')
dfWeather.head()


Out[202]:
location year month day Rainfall Total Max 30Min Rainfall Max 60Min Rainfall Max 120Min Rainfall Mean Temperature Max Temperature Min Temperature Mean Wind Speed Max Wind Speed
0 Tengah 2012 1 1 0.0 NaN NaN NaN 27.8 33.4 24.5 7.8 22.3
1 Tengah 2012 1 2 2.6 NaN NaN NaN 27.5 32.6 24.9 13.7 29.5
2 Tengah 2012 1 3 0.0 NaN NaN NaN 27.4 31.7 23.6 16.2 35.3
3 Tengah 2012 1 4 0.0 NaN NaN NaN 27.4 32.2 23.3 14.9 35.3
4 Tengah 2012 1 5 0.0 NaN NaN NaN NaN 32.2 23.0 14.7 37.1

In [168]:
dict_rename={'Station':'location', 
             'Year':'year', 'Month':'month', 'Day':'day', 
             'Daily Rainfall Total (mm)':'Rainfall Total',
             'Highest 30 Min Rainfall (mm)':'Max 30Min Rainfall', 
             'Highest 60 Min Rainfall (mm)':'Max 60Min Rainfall',
             'Highest 120 Min Rainfall (mm)':'Max 120Min Rainfall', 
             'Mean Temperature (°C)':'Mean Temperature',
             'Maximum Temperature (°C)':'Max Temperature', 
             'Minimum Temperature (°C)':'Min Temperature',
             'Mean Wind Speed (km/h)':'Mean Wind Speed', 
             'Max Wind Speed (km/h)':'Max Wind Speed'}
col_rename = ['location', 'year', 'month', 'day', 'Rainfall Total',
       'Max 30Min Rainfall', 'Max 60Min Rainfall',
       'Max 120Min Rainfall', 'Mean Temperature',
       'Max Temperature', 'Min Temperature',
       'Mean Wind Speed', 'Max Wind Speed']

In [206]:
logic = {
#    'location' : 'first', 
#    'year' : 'first', 
#    'month' : 'first',
#    'day' : 'first', 
    'Rainfall Total' : 'sum',
    'Max 30Min Rainfall' : 'max', 
    'Max 60Min Rainfall' : 'max',
    'Max 120Min Rainfall' : 'max', 
    'Mean Temperature' : 'mean',
    'Max Temperature' : 'max', 
    'Min Temperature' : 'min',
    'Mean Wind Speed' : 'mean', 
    'Max Wind Speed' : 'max'
}

In [203]:
dfWeeklyWeather = pd.DataFrame(columns=col_rename)
for i in range(int(len(dfWeather)/7)):
    df = dfWeather[i*7:i*7+7]
    #print(df)
    next_idx = i*7+7
    
print(next_idx)


28

In [204]:
df


Out[204]:
location year month day Rainfall Total Max 30Min Rainfall Max 60Min Rainfall Max 120Min Rainfall Mean Temperature Max Temperature Min Temperature Mean Wind Speed Max Wind Speed
21 Tengah 2012 1 22 21.3 NaN NaN NaN 26.3 31.8 23.1 9.5 29.5
22 Tengah 2012 1 23 1.8 NaN NaN NaN 24.9 31.3 22.0 9.7 35.3
23 Tengah 2012 1 24 0.0 NaN NaN NaN NaN 30.8 22.7 14.4 37.1
24 Tengah 2012 1 25 0.0 NaN NaN NaN NaN 32.2 23.4 15.9 37.1
25 Tengah 2012 1 26 0.0 NaN NaN NaN 26.9 30.9 23.3 16.3 37.1
26 Tengah 2012 1 27 9.7 NaN NaN NaN 26.2 32.7 23.1 12.1 37.1
27 Tengah 2012 1 28 0.0 NaN NaN NaN NaN 29.0 23.7 11.1 29.5

In [208]:
week_data = df.agg(logic)
week_data


Out[208]:
Rainfall Total         32.800000
Max 30Min Rainfall           NaN
Max 60Min Rainfall           NaN
Max 120Min Rainfall          NaN
Mean Temperature       26.075000
Max Temperature        32.700000
Min Temperature        22.000000
Mean Wind Speed        12.714286
Max Wind Speed         37.100000
dtype: float64

In [210]:
pd.Series(['Tengah', 2012, 6], index=['location','year','week']).append(week_data)


Out[210]:
location                Tengah
year                      2012
week                         6
Rainfall Total            32.8
Max 30Min Rainfall         NaN
Max 60Min Rainfall         NaN
Max 120Min Rainfall        NaN
Mean Temperature        26.075
Max Temperature           32.7
Min Temperature             22
Mean Wind Speed        12.7143
Max Wind Speed            37.1
dtype: object

In [212]:
df.iloc[0]['year']


Out[212]:
2012

In [227]:
dfWeeklyWeather = pd.DataFrame(columns=col_rename)
dfRemain = pd.DataFrame(columns=col_rename)
year = 0
week = 0
for filename in station_files:
    # get monthly data, read file
    file_path = os.path.join(INPUT_DIR, filename)
    dfMonth = pd.read_csv(file_path,encoding='latin1')
    dfMonth = dfMonth.rename(index=str, columns=dict_rename)
    # prepare data
    for col in col_numeric:
        dfMonth[col] = pd.to_numeric(dfMonth[col], errors='coerce')
    if dfMonth.iloc[0]['year'] != year:
        print(year,dfMonth.iloc[0]['year'])
        year = dfMonth.iloc[0]['year']
        week = 1
    if (len(dfRemain) > 0) and (len(dfRemain) < 7):
        dfMonth = dfRemain.append(dfMonth, ignore_index=True)
    else:
        if(len(dfRemain) > 7):
            print(dfRemain)
    # aggregate weekly data 
    for i in range(int(len(dfMonth)/7)):
        df = dfMonth[i*7:i*7+7]
        next_idx = i*7+7
        week_data = df.agg(logic)
        week_s = pd.Series([df.iloc[0]['location'], df.iloc[0]['year'], df.iloc[0]['month'], df.iloc[0]['day'], week], 
                           index=['location','year','month','day','week']).append(week_data)
        dfWeeklyWeather = dfWeeklyWeather.append(week_s, ignore_index=True)
        week += 1
    dfRemain = dfMonth[next_idx:]


0 2012
2012 2013
2013 2014
2014 2015
2015 2016
2016 2017
2017 2018
2018 2019

In [232]:
dfWeeklyWeather.head()


Out[232]:
location year month day Rainfall Total Max 30Min Rainfall Max 60Min Rainfall Max 120Min Rainfall Mean Temperature Max Temperature Min Temperature Mean Wind Speed Max Wind Speed week
0 Tengah 2012 1 1 2.6 NaN NaN NaN 27.525000 33.4 23.0 14.628571 46.4 1.0
1 Tengah 2012 1 8 32.2 NaN NaN NaN 25.800000 33.7 22.1 12.014286 44.3 2.0
2 Tengah 2012 1 15 70.7 NaN NaN NaN 27.350000 34.4 22.5 7.628571 40.7 3.0
3 Tengah 2012 1 22 32.8 NaN NaN NaN 26.075000 32.7 22.0 12.714286 37.1 4.0
4 Tengah 2012 1 29 50.8 NaN NaN NaN 26.357143 32.8 22.8 11.100000 37.1 5.0

In [231]:
dfWeeklyWeather.describe()


Out[231]:
Rainfall Total Max 30Min Rainfall Max 60Min Rainfall Max 120Min Rainfall Mean Temperature Max Temperature Min Temperature Mean Wind Speed Max Wind Speed week
count 373.000000 0.0 0.0 0.0 270.000000 373.000000 373.000000 373.000000 373.000000 373.000000
mean 51.947453 NaN NaN NaN 27.566989 33.942359 22.780965 9.357277 43.629223 26.099196
std 43.016111 NaN NaN NaN 0.835618 0.896305 2.200596 2.535895 9.555620 15.253003
min 0.000000 NaN NaN NaN 25.442857 30.600000 0.000000 3.900000 20.500000 1.000000
25% 16.000000 NaN NaN NaN 27.032143 33.400000 22.400000 7.614286 37.100000 13.000000
50% 45.200000 NaN NaN NaN 27.580000 33.900000 23.000000 8.742857 42.500000 26.000000
75% 78.100000 NaN NaN NaN 28.110714 34.600000 23.500000 10.400000 48.200000 39.000000
max 215.200000 NaN NaN NaN 29.885714 36.300000 25.500000 20.157143 133.200000 53.000000

In [ ]: