Source de données utilisée (fichiers CSV en open data): https://data.iledefrance.fr/explore/dataset/qualite-de-lair-mesuree-dans-la-station-chatelet/
In [ ]:
%matplotlib inline
#%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

matplotlib.rcParams['figure.figsize'] = (9, 9)
In [ ]:
def conv_func(s):
    """Convert one raw CSV cell into a float, mapping missing values to NaN.

    Parameters
    ----------
    s : str
        Raw cell text as handed over by ``pd.read_csv(converters=...)``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the cell is blank or holds
        the ``'ND'`` marker.

    Raises
    ------
    ValueError
        If the cleaned text is neither blank, ``'ND'`` nor a valid number.
    """
    # Drop the '<' prefix and surrounding whitespace before any comparison,
    # so that padded markers such as ' ND ' are recognised as missing too.
    s = s.replace('<', '').strip()
    if s in ('', 'ND'):
        return np.nan
    # Columns that go through a converter receive the raw cell text, so a
    # comma decimal separator has to be normalised here.
    return float(s.replace(',', '.'))
In [ ]:
# Download the measurements as a semicolon-separated CSV export.
url = "https://data.iledefrance.fr/explore/dataset/qualite-de-lair-mesuree-dans-la-station-chatelet/download/?format=csv&timezone=Europe/Berlin&use_labels_for_header=true"

# Every measurement column is parsed by conv_func rather than by a plain
# dtype / na_values declaration.
measurement_columns = ['NO', 'NO2', 'PM10', 'CO2', 'TEMP', 'HUMI']
converter_dict = {column: conv_func for column in measurement_columns}

# The first CSV column becomes the index; "DATE/HEURE" is parsed as dates.
read_options = dict(
    index_col=0,
    sep=';',
    decimal=',',
    parse_dates=["DATE/HEURE"],
    converters=converter_dict,
)
df = pd.read_csv(url, **read_options)
In [ ]:
# Order the rows by their index (the first CSV column, see index_col=0 in the
# loading cell).
df = df.sort_index()
In [ ]:
# Preview the first rows of the DataFrame.
df.head()
In [ ]:
# List the column labels.
df.columns
In [ ]:
# Show the dtype of each column.
df.dtypes
In [ ]:
# Show the index the rows are keyed by.
df.index
In [ ]:
# Raw PM10 series over the whole index range.
pm10_raw = df["PM10"]
pm10_raw.plot(figsize=(18, 6));
In [ ]:
# PM10 averaged over consecutive 7-day bins.
pm10_7d_bins = df["PM10"].resample("7D").mean()
pm10_7d_bins.plot(figsize=(18, 6));
In [ ]:
# PM10 smoothed with a 7-day rolling mean.
pm10_7d_rolling = df["PM10"].rolling("7D").mean()
pm10_7d_rolling.plot(figsize=(18, 6));
In [ ]:
# PM10 averaged per '1M' resampling bin.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm against the installed pandas version.
pm10_monthly = df["PM10"].resample("1M").mean()
pm10_monthly.plot(figsize=(18, 6));
In [ ]:
# https://jakevdp.github.io/PythonDataScienceHandbook/03.11-working-with-time-series.html#Digging-into-the-data
def _plot_daily_profile(ts, color, ax=None):
    """Plot the time-of-day distribution of *ts* and return the axes used.

    The series is grouped by the time component of its index. The mean,
    median, quartiles, 5th/95th percentiles, min and max of each group are
    drawn as lines, and the 5th-95th percentile and interquartile ranges
    are shaded.

    Parameters
    ----------
    ts : pandas.Series
        Series whose index exposes a ``.time`` attribute.
    color : str
        Colour used for every line and shaded band.
    ax : matplotlib axes, optional
        Axes to draw on. When omitted, pandas picks the axes just as a bare
        ``Series.plot()`` call does.

    Returns
    -------
    matplotlib axes
        The axes all artists were drawn on.
    """
    by_time = ts.groupby(ts.index.time)
    ts_mean = by_time.mean()
    ts_median = by_time.median()
    ts_quartile_1 = by_time.quantile(0.25)
    ts_quartile_3 = by_time.quantile(0.75)
    ts_percentile_5 = by_time.quantile(0.05)
    ts_percentile_95 = by_time.quantile(0.95)
    ts_min = by_time.min()
    ts_max = by_time.max()

    ax = ts_mean.plot(ax=ax, figsize=(18, 12), color=color, label="mean", alpha=0.75)
    ts_median.plot(ax=ax, color=color, label="median", style="--", alpha=0.75)
    ts_quartile_1.plot(ax=ax, color=color, alpha=0.5, style="-.", label="1st quartile")
    ts_quartile_3.plot(ax=ax, color=color, alpha=0.5, style="-.", label="3rd quartile")
    ts_percentile_5.plot(ax=ax, color=color, alpha=0.25, style=":", label="5th percentile")
    ts_percentile_95.plot(ax=ax, color=color, alpha=0.25, style=":", label="95th percentile")
    ts_min.plot(ax=ax, color=color, alpha=0.2, style=":", label="min")
    ts_max.plot(ax=ax, color=color, alpha=0.2, style=":", label="max")

    # Shade on the explicit axes instead of relying on pyplot's notion of
    # the "current" axes.
    ax.fill_between(ts_percentile_5.index, ts_percentile_5.values, ts_percentile_95.values, facecolor=color, alpha=0.1)
    ax.fill_between(ts_quartile_1.index, ts_quartile_1.values, ts_quartile_3.values, facecolor=color, alpha=0.1)
    return ax


# PM10 on the left axis (blue), temperature on a twin right axis (red).
ax = _plot_daily_profile(df.PM10, color="blue")
ax2 = _plot_daily_profile(df.TEMP, color="red", ax=ax.twinx())

ax.legend(loc='upper left')
ax2.legend(loc='upper right')
ax.set_xlabel('Time')
ax.set_ylabel('PM10')
ax2.set_ylabel('Temperature');
In [ ]:
# Mean PM10 (left axis, blue) against mean TEMP (right axis, red), grouped by
# the time component of the index.
time_of_day = df.index.time
pm10_by_time = df["PM10"].groupby(time_of_day).mean()
temp_by_time = df["TEMP"].groupby(time_of_day).mean()

ax = pm10_by_time.plot(figsize=(18, 6), color="blue")
ax.set_xlabel("Time")
ax2 = ax.twinx()
temp_by_time.plot(ax=ax2, color="red")
ax.legend(loc="upper left")
ax2.legend(loc="upper right");
In [ ]:
# Mean PM10 (left axis, blue) against mean TEMP (right axis, red), grouped by
# the weekday of the index.
weekday = df.index.weekday
pm10_by_weekday = df["PM10"].groupby(weekday).mean()
temp_by_weekday = df["TEMP"].groupby(weekday).mean()

ax = pm10_by_weekday.plot(figsize=(18, 6), color="blue")
ax.set_xlabel("Weekday")
ax2 = ax.twinx()
temp_by_weekday.plot(ax=ax2, color="red")
ax.legend(loc="upper left")
ax2.legend(loc="upper right");
In [ ]:
# Mean PM10 (left axis, blue) against mean TEMP (right axis, red), grouped by
# the month of the index.
month = df.index.month
pm10_by_month = df["PM10"].groupby(month).mean()
temp_by_month = df["TEMP"].groupby(month).mean()

ax = pm10_by_month.plot(figsize=(18, 6), color="blue")
ax.set_xlabel("Month")
ax2 = ax.twinx()
temp_by_month.plot(ax=ax2, color="red")
ax.legend(loc="upper left")
ax2.legend(loc="upper right");