LSTM Time Series Forecasting for Air Quality


In [1]:
from tensorflow import keras

In [2]:
import pandas as pd
import numpy as np

df_raw = pd.read_csv("datasets/AirQualityUCI.csv", sep=';')

In [3]:
df_raw.head()


Out[3]:
Date Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH AH Unnamed: 15 Unnamed: 16
0 10/03/2004 18.00.00 2,6 1360.0 150.0 11,9 1046.0 166.0 1056.0 113.0 1692.0 1268.0 13,6 48,9 0,7578 NaN NaN
1 10/03/2004 19.00.00 2 1292.0 112.0 9,4 955.0 103.0 1174.0 92.0 1559.0 972.0 13,3 47,7 0,7255 NaN NaN
2 10/03/2004 20.00.00 2,2 1402.0 88.0 9,0 939.0 131.0 1140.0 114.0 1555.0 1074.0 11,9 54,0 0,7502 NaN NaN
3 10/03/2004 21.00.00 2,2 1376.0 80.0 9,2 948.0 172.0 1092.0 122.0 1584.0 1203.0 11,0 60,0 0,7867 NaN NaN
4 10/03/2004 22.00.00 1,6 1272.0 51.0 6,5 836.0 131.0 1205.0 116.0 1490.0 1110.0 11,2 59,6 0,7888 NaN NaN

In [4]:
df = df_raw.dropna(subset=['Date'])

In [5]:
df_raw.describe()


Out[5]:
PT08.S1(CO) NMHC(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) Unnamed: 15 Unnamed: 16
count 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 0.0 0.0
mean 1048.990061 -159.090093 894.595276 168.616971 794.990168 58.148873 1391.479641 975.072032 NaN NaN
std 329.832710 139.789093 342.333252 257.433866 321.993552 126.940455 467.210125 456.938184 NaN NaN
min -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 NaN NaN
25% 921.000000 -200.000000 711.000000 50.000000 637.000000 53.000000 1185.000000 700.000000 NaN NaN
50% 1053.000000 -200.000000 895.000000 141.000000 794.000000 96.000000 1446.000000 942.000000 NaN NaN
75% 1221.000000 -200.000000 1105.000000 284.000000 960.000000 133.000000 1662.000000 1255.000000 NaN NaN
max 2040.000000 1189.000000 2214.000000 1479.000000 2683.000000 340.000000 2775.000000 2523.000000 NaN NaN

In [6]:
df.describe()


Out[6]:
PT08.S1(CO) NMHC(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) Unnamed: 15 Unnamed: 16
count 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 0.0 0.0
mean 1048.990061 -159.090093 894.595276 168.616971 794.990168 58.148873 1391.479641 975.072032 NaN NaN
std 329.832710 139.789093 342.333252 257.433866 321.993552 126.940455 467.210125 456.938184 NaN NaN
min -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 NaN NaN
25% 921.000000 -200.000000 711.000000 50.000000 637.000000 53.000000 1185.000000 700.000000 NaN NaN
50% 1053.000000 -200.000000 895.000000 141.000000 794.000000 96.000000 1446.000000 942.000000 NaN NaN
75% 1221.000000 -200.000000 1105.000000 284.000000 960.000000 133.000000 1662.000000 1255.000000 NaN NaN
max 2040.000000 1189.000000 2214.000000 1479.000000 2683.000000 340.000000 2775.000000 2523.000000 NaN NaN

In [13]:
df.columns


Out[13]:
Index(['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)',
       'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)',
       'PT08.S5(O3)', 'T', 'RH', 'AH', 'Unnamed: 15', 'Unnamed: 16',
       'Timestamp'],
      dtype='object')

In [7]:
df["Date"]


Out[7]:
0       10/03/2004
1       10/03/2004
2       10/03/2004
3       10/03/2004
4       10/03/2004
           ...    
9352    04/04/2005
9353    04/04/2005
9354    04/04/2005
9355    04/04/2005
9356    04/04/2005
Name: Date, Length: 9357, dtype: object

In [8]:
df["Time"]


Out[8]:
0       18.00.00
1       19.00.00
2       20.00.00
3       21.00.00
4       22.00.00
          ...   
9352    10.00.00
9353    11.00.00
9354    12.00.00
9355    13.00.00
9356    14.00.00
Name: Time, Length: 9357, dtype: object

In [9]:
df["Timestamp"] = df["Date"].map(str) + " " + df["Time"]
df["Timestamp"] = pd.to_datetime(df["Timestamp"], format="%d/%m/%Y %H.%M.%S")


c:\users\mcama\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
c:\users\mcama\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [10]:
df["Timestamp"]


Out[10]:
0      2004-03-10 18:00:00
1      2004-03-10 19:00:00
2      2004-03-10 20:00:00
3      2004-03-10 21:00:00
4      2004-03-10 22:00:00
               ...        
9352   2005-04-04 10:00:00
9353   2005-04-04 11:00:00
9354   2005-04-04 12:00:00
9355   2005-04-04 13:00:00
9356   2005-04-04 14:00:00
Name: Timestamp, Length: 9357, dtype: datetime64[ns]

Visualize Data


In [11]:
%matplotlib notebook

import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [9.5, 6]

In [18]:
figure = plt.figure()

plt.plot(df["Timestamp"], df["RH"])


Out[18]:
[<matplotlib.lines.Line2D at 0x1cd0d3901d0>]

In [ ]:


In [ ]: