In [1]:
import pandas as pd
import getpass, os
os.environ['PSQL_USER']='dengueadmin'
os.environ['PSQL_HOST']='localhost'
os.environ['PSQL_DB']='dengue'
os.environ['PSQL_PASSWORD']=getpass.getpass("Enter the database password: ")
In [2]:
os.chdir('..')
from infodenguepredict.data.infodengue import get_temperature_data, get_alerta_table, get_tweet_data
%pylab inline
In [3]:
A = get_alerta_table(3304557)  # 3304557 = Rio de Janeiro geocode; (3303500) is an alternative geocode
T = get_temperature_data(3304557)  # (3303500)
Tw = get_tweet_data(3304557)  # (3303500)
Let's look at the tables.
In [4]:
A.head()
Out[4]:
In [5]:
T = T[~T.index.duplicated()]  # drop rows with duplicated timestamps before resampling
T.to_csv('temperature_rio.csv', header=True, sep=',')
T.head()
Out[5]:
In [6]:
Tw = Tw[~Tw.index.duplicated()]  # drop rows with duplicated timestamps
Tw.head()
Out[6]:
Let's try to join the tables by date. To align them, we must downsample each one to a weekly frequency.
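The 'W-SUN' rule groups observations into weeks ending on Sunday, which presumably matches the epidemiological-week index of the alerta table. The mean is the natural weekly aggregate for a continuous variable like temperature, while the sum is natural for counts like tweets. A minimal sketch with made-up daily values:
daily = pd.Series(range(14), index=pd.date_range('2017-01-02', periods=14, freq='D'))  # synthetic data
daily.resample('W-SUN').mean()  # weekly average, as used for temperature below
daily.resample('W-SUN').sum()   # weekly total, as used for tweet counts below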
In [7]:
T.resample('W-SUN').mean().tail()
Out[7]:
In [8]:
Full = A.join(T.resample('W-SUN').mean()).join(Tw.resample('W-SUN').sum())
Full.head()
Out[8]:
Note that for the oldest dates the missing temperature and tweet values were filled with NaN. We can drop those dates, leaving a table with no missing data, but we then lose more than two years of observations.
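Before dropping anything, it can be worth quantifying the loss; a quick sketch using the Full table built above:
Full.isnull().sum()  # number of missing values per column
Full[Full.isnull().any(axis=1)].index.max()  # most recent date that still has a gap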
In [9]:
Short = Full.dropna()
Short.head()
Out[9]:
In [10]:
Short[['casos_est', 'temp_min', 'umid_min', 'numero']].plot(subplots=True, figsize=(15,10),grid=True);
In [12]:
from infodenguepredict.models import sarimax,GAS,GASX
import statsmodels.api as sm
In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(15, 4))
fig = sm.graphics.tsa.plot_acf(Full['casos'].iloc[1:], lags=52, ax=axes[0])
fig = sm.graphics.tsa.plot_pacf(Full['casos'].iloc[1:], lags=52, ax=axes[1])
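With weekly data, 52 lags cover a full year, so any annual seasonality in the case counts should show up as significant autocorrelation around lag 52.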
In [13]:
# Short.casos = Short.casos.apply(np.log)
model_1 = sarimax.build_model(Full, 'casos', [])
In [14]:
fit_1 = model_1.fit()
In [15]:
fit_1.summary()
Out[15]:
In [16]:
def plot_pred(fit):
    plt.figure(figsize=(10, 7))
    # In-sample one-step-ahead predictions
    predict = fit.get_prediction(start='2017-01-01', dynamic=False)
    predict_ci = predict.conf_int()
    Full.casos.plot(style='o', label='obs')
    predict.predicted_mean.plot(style='r--', label='In sample')
    plt.fill_between(predict_ci.index, predict_ci.iloc[:, 0], predict_ci.iloc[:, 1], color='r', alpha=0.1)
    # Out-of-sample forecast
    forecast = fit.get_prediction(start='2017-03-05', end='2017-06-21', dynamic=False)
    forecast_ci = forecast.conf_int()
    forecast.predicted_mean.plot(style='b--', label='Out of Sample')
    plt.fill_between(forecast_ci.index, forecast_ci.iloc[:, 0], forecast_ci.iloc[:, 1], color='b', alpha=0.1)
    plt.legend(loc=0)

plot_pred(fit_1)
In [17]:
model_2 = GAS.build_model(Full, ar=2, sc=6, target='casos')
fit_2 = model_2.fit()
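GAS stands for Generalized Autoregressive Score models. Assuming the models module wraps pyflux (as the ar/sc signature suggests), ar sets the number of autoregressive lags and sc the number of score-function lags included in the model.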
In [18]:
fit_2.summary()
In [19]:
model_2.plot_fit()
plt.savefig('GAS_in_sample.png')
Full.casos.plot(style='ko')
model_2.plot_predict(h=10, past_values=52)
In [20]:
model_2.plot_z(figsize=(15,5))
In [38]:
plt.figure()
ax = plt.gca()
# Hold out everything from 2015 on, so the forecast is genuinely out of sample
train = Full.loc[Full.index < '2015-01-01']
model_3 = GAS.build_model(train, ar=2, sc=6, target='casos')
fit_3 = model_3.fit()
Full.casos.plot(style='ko', ax=ax, figsize=(15,10))
model_3.plot_predict(h=10, past_values=20, ax=ax, intervals=True, figsize=(15,10))
In [34]:
model_4 = GASX.build_model(Full.dropna(), ar=4, sc=6, formula='casos~1+temp_min')
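The formula follows patsy-style syntax: casos is regressed on an intercept (the 1) and on temp_min as an exogenous covariate on top of the GAS dynamics, which is why the rows with missing temperature must be dropped first.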
In [35]:
fit_4 = model_4.fit()
In [36]:
fit_4.summary()
In [37]:
model_4.plot_fit()
In [26]:
model_4.plot_predict(h=10, past_values=15)
In [46]:
rio = get_alerta_table(state='RJ')
In [47]:
rio.head()
Out[47]:
Let's keep only the columns we want to use
In [48]:
for col in ['casos_est_min', 'casos_est_max', 'Localidade_id', 'versao_modelo', 'municipio_nome']:
del rio[col]
In [49]:
rio.head()
Out[49]:
In [50]:
riopiv = rio.pivot(columns='municipio_geocodigo')  # keep the date index; spread each column per municipality
In [51]:
riopiv.head()
Out[51]:
In [52]:
riopiv['SE'].head()
Out[52]:
Now we have a multi-level column index. It may be preferable to flatten it.
In [53]:
riopiv.columns = ['{}_{}'.format(*col).strip() for col in riopiv.columns.values]
riopiv.head()
Out[53]:
In [54]:
riopiv.shape
Out[54]: