In [1]:
import pandas as pd
In [36]:
weekly_disease = pd.read_excel("./../data/raw/disease_SG/weekly-infectious-bulletin_caseswk09y2019.xlsx", sheet_name=None, na_values="na", skiprows=1)
In [122]:
for key in weekly_disease.keys():
print(key)
In [55]:
type(weekly_disease)
Out[55]:
In [124]:
df_2012 = weekly_disease.get('Year 2012')
In [125]:
df_2012.columns
Out[125]:
In [126]:
df_2012.head(3)
Out[126]:
In [48]:
df_2019 = df_2019[['Epidemiology Wk', 'Yellow Fever', 'Dengue ', 'DHF', 'Malaria', 'Chikungunya', 'Zika']]
In [50]:
df_2019.head(10)
Out[50]:
In [54]:
df_2019 = df_2019.dropna(thresh=2)
df_2019.shape
Out[54]:
In [101]:
df_combined = pd.DataFrame()
for key, value in weekly_disease.items():
year = int(key.split()[1])
if 'DHF' in value.columns:
df = value[['Epidemiology Wk', 'Dengue ', 'DHF', 'Malaria']]
else:
df = value[['Epidemiology Wk', 'Dengue Fever', 'Dengue Haemorrhagic Fever', 'Malaria']]
df = df.rename(index=str, columns={"Dengue Fever": "Dengue ", "Dengue Haemorrhagic Fever": "DHF"})
df.dropna(thresh=3, inplace=True)
df['year'] = year
df_combined = pd.concat([df_combined,df], ignore_index=True)
In [102]:
df_combined.head()
Out[102]:
In [103]:
df_combined.shape
Out[103]:
In [104]:
df_combined.rename(str.strip, axis='columns', inplace=True)
In [105]:
df_combined.columns
Out[105]:
In [106]:
df_combined = df_combined.rename(index=str, columns={"Epidemiology Wk": "week"})
df_combined.head()
Out[106]:
In [107]:
df_combined.fillna(0, inplace=True)
In [109]:
df_combined = df_combined.applymap(lambda x: int(x))
In [115]:
df_combined['Dengue'] = df_combined['Dengue'] + df_combined['DHF']
In [117]:
df_combined.drop('DHF', axis=1, inplace=True)
In [119]:
df_combined = df_combined[['year', 'week', 'Dengue', 'Malaria']]
df_combined.head()
Out[119]:
In [118]:
df_combined.columns
Out[118]:
In [121]:
print(df_combined['year'].min())
print(df_combined['year'].max())
In [152]:
import os
import numpy as np
In [147]:
INPUT_DIR = '../data/raw/weather_SG'
In [134]:
files = os.listdir(INPUT_DIR)
In [135]:
files[:3]
Out[135]:
In [136]:
stations = [x.split('_')[0] for x in files]
In [138]:
stations = list(set(stations))
In [139]:
stations
Out[139]:
In [140]:
len(stations)
Out[140]:
In [142]:
station_files = [x for x in files if x.startswith('S23')]
len(station_files)
Out[142]:
In [144]:
station_files.sort()
In [ ]:
for filename in station_files:
In [150]:
file_path = os.path.join(INPUT_DIR, 'S23_201201.csv')
dfWeather = pd.read_csv(file_path,encoding='latin1')
In [156]:
dfWeather.columns
Out[156]:
In [169]:
dfWeather = dfWeather.rename(index=str, columns=dict_rename)
In [202]:
col_numeric = ['year', 'month', 'day', 'Rainfall Total',
'Max 30Min Rainfall', 'Max 60Min Rainfall',
'Max 120Min Rainfall', 'Mean Temperature',
'Max Temperature', 'Min Temperature',
'Mean Wind Speed', 'Max Wind Speed']
for col in col_numeric:
dfWeather[col] = pd.to_numeric(dfWeather[col], errors='coerce')
dfWeather.head()
Out[202]:
In [168]:
dict_rename={'Station':'location',
'Year':'year', 'Month':'month', 'Day':'day',
'Daily Rainfall Total (mm)':'Rainfall Total',
'Highest 30 Min Rainfall (mm)':'Max 30Min Rainfall',
'Highest 60 Min Rainfall (mm)':'Max 60Min Rainfall',
'Highest 120 Min Rainfall (mm)':'Max 120Min Rainfall',
'Mean Temperature (°C)':'Mean Temperature',
'Maximum Temperature (°C)':'Max Temperature',
'Minimum Temperature (°C)':'Min Temperature',
'Mean Wind Speed (km/h)':'Mean Wind Speed',
'Max Wind Speed (km/h)':'Max Wind Speed'}
col_rename = ['location', 'year', 'month', 'day', 'Rainfall Total',
'Max 30Min Rainfall', 'Max 60Min Rainfall',
'Max 120Min Rainfall', 'Mean Temperature',
'Max Temperature', 'Min Temperature',
'Mean Wind Speed', 'Max Wind Speed']
In [206]:
logic = {
# 'location' : 'first',
# 'year' : 'first',
# 'month' : 'first',
# 'day' : 'first',
'Rainfall Total' : 'sum',
'Max 30Min Rainfall' : 'max',
'Max 60Min Rainfall' : 'max',
'Max 120Min Rainfall' : 'max',
'Mean Temperature' : 'mean',
'Max Temperature' : 'max',
'Min Temperature' : 'min',
'Mean Wind Speed' : 'mean',
'Max Wind Speed' : 'max'
}
In [203]:
dfWeeklyWeather = pd.DataFrame(columns=col_rename)
for i in range(int(len(dfWeather)/7)):
df = dfWeather[i*7:i*7+7]
#print(df)
next_idx = i*7+7
print(next_idx)
In [204]:
df
Out[204]:
In [208]:
week_data = df.agg(logic)
week_data
Out[208]:
In [210]:
pd.Series(['Tengah', 2012, 6], index=['location','year','week']).append(week_data)
Out[210]:
In [212]:
df.iloc[0]['year']
Out[212]:
In [227]:
dfWeeklyWeather = pd.DataFrame(columns=col_rename)
dfRemain = pd.DataFrame(columns=col_rename)
year = 0
week = 0
for filename in station_files:
# get monthly data, read file
file_path = os.path.join(INPUT_DIR, filename)
dfMonth = pd.read_csv(file_path,encoding='latin1')
dfMonth = dfMonth.rename(index=str, columns=dict_rename)
# prepare data
for col in col_numeric:
dfMonth[col] = pd.to_numeric(dfMonth[col], errors='coerce')
if dfMonth.iloc[0]['year'] != year:
print(year,dfMonth.iloc[0]['year'])
year = dfMonth.iloc[0]['year']
week = 1
if (len(dfRemain) > 0) and (len(dfRemain) < 7):
dfMonth = dfRemain.append(dfMonth, ignore_index=True)
else:
if(len(dfRemain) > 7):
print(dfRemain)
# aggregate weekly data
for i in range(int(len(dfMonth)/7)):
df = dfMonth[i*7:i*7+7]
next_idx = i*7+7
week_data = df.agg(logic)
week_s = pd.Series([df.iloc[0]['location'], df.iloc[0]['year'], df.iloc[0]['month'], df.iloc[0]['day'], week],
index=['location','year','month','day','week']).append(week_data)
dfWeeklyWeather = dfWeeklyWeather.append(week_s, ignore_index=True)
week += 1
dfRemain = dfMonth[next_idx:]
In [232]:
dfWeeklyWeather.head()
Out[232]:
In [231]:
dfWeeklyWeather.describe()
Out[231]:
In [ ]: