In [1]:
from __future__ import print_function
import os
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt

In [26]:
#Read the chemical concentration readings from CSV into a pandas DataFrame.
#The path is relative to the directory the notebook is run from.
df = pd.read_csv('datasets/chemical-concentration-readings.csv')

In [27]:
#Print the (rows, columns) shape of the dataset
print('Shape of the dataset:', df.shape)


Shape of the dataset: (197, 2)

In [28]:
#Display the first 10 rows of the DataFrame as the cell output
df.head(10)


Out[28]:
Timestamp Chemical conc.
0 1975-01-01 00:00:00 17.0
1 1975-01-01 02:00:00 16.6
2 1975-01-01 04:00:00 16.3
3 1975-01-01 06:00:00 16.1
4 1975-01-01 08:00:00 17.1
5 1975-01-01 10:00:00 16.9
6 1975-01-01 12:00:00 16.8
7 1975-01-01 14:00:00 17.4
8 1975-01-01 16:00:00 17.1
9 1975-01-01 18:00:00 17.0

In [29]:
#The observations seem to be taken at an interval of 2 hours

In [30]:
#Parse the 'Timestamp' strings and use them as the DataFrame's datetime row index.
#pd.to_datetime converts the whole column in one vectorized call instead of
#invoking the parser once per row through Series.map. The original string
#column is kept, so the index holds the same instants as datetime values.
datetime_rowid = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')
df.index = datetime_rowid
df.head(10)


Out[30]:
Timestamp Chemical conc.
Timestamp
1975-01-01 00:00:00 1975-01-01 00:00:00 17.0
1975-01-01 02:00:00 1975-01-01 02:00:00 16.6
1975-01-01 04:00:00 1975-01-01 04:00:00 16.3
1975-01-01 06:00:00 1975-01-01 06:00:00 16.1
1975-01-01 08:00:00 1975-01-01 08:00:00 17.1
1975-01-01 10:00:00 1975-01-01 10:00:00 16.9
1975-01-01 12:00:00 1975-01-01 12:00:00 16.8
1975-01-01 14:00:00 1975-01-01 14:00:00 17.4
1975-01-01 16:00:00 1975-01-01 16:00:00 17.1
1975-01-01 18:00:00 1975-01-01 18:00:00 17.0

In [38]:
#Bucket the readings into calendar days ('D'), then average each bucket
daily = df.loc[:, 'Chemical conc.'].resample(rule='D')
daily_mean = daily.mean()

In [39]:
#Overlay the raw series and its daily mean on a single set of axes
fig, ax = plt.subplots(figsize=(5.5, 5.5))

#Raw readings in blue, daily averages in red
df['Chemical conc.'].plot(ax=ax, color='b')
daily_mean.plot(ax=ax, color='r')

#Labels are set after plotting so they override any label pandas applied
ax.set(title='Bi-hourly reading (blue) & Daily Mean (red)',
       xlabel='Days in Jan 1975',
       ylabel='Chemical concentration')

#Write the figure to disk as a 300 dpi PNG
fig.savefig('plots/ch2/B07887_02_02.png', format='png', dpi=300)



In [11]:
"""
Let us show an example of grouping by a period
"""


Out[11]:
'\nLet us show an example of grouping by a period\n'

In [12]:
#Load the mean daily temperature data and index its rows by date.
#pd.to_datetime parses the whole 'Date' column in one vectorized call rather
#than once per row; the 'Date' column itself is kept alongside the new index.
df = pd.read_csv('datasets/mean-daily-temperature-fisher-river.csv')
df.index = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [13]:
#Print the (rows, columns) shape of the DataFrame
print('Shape of dataframe:', df.shape)


Shape of dataframe: (1461, 2)

In [14]:
#Display the first 10 rows of the DataFrame as the cell output
df.head(10)


Out[14]:
Date Mean temparature
Date
1988-01-01 1988-01-01 -23.00
1988-01-02 1988-01-02 -20.50
1988-01-03 1988-01-03 -22.00
1988-01-04 1988-01-04 -30.50
1988-01-05 1988-01-05 -31.00
1988-01-06 1988-01-06 -27.50
1988-01-07 1988-01-07 -26.25
1988-01-08 1988-01-08 -26.50
1988-01-09 1988-01-09 -23.00
1988-01-10 1988-01-10 -23.50

In [15]:
#Draw the daily mean temperature series on a single set of axes
fig, ax = plt.subplots(figsize=(5.5, 5.5))

df['Mean temparature'].plot(ax=ax, color='b')

ax.set(title='Mean daily temparature')

#Write the figure to disk as a 300 dpi PNG
fig.savefig('plots/ch2/B07887_02_03.png', format='png', dpi=300)



In [16]:
#We need to group the data by month and compute aggregate statistics

#Start by adding a Month_Year column of 'MM-YYYY' strings derived from the
#datetime row index. DatetimeIndex.strftime formats every timestamp in one
#vectorized call, so no per-row lambda is needed.
df['Month_Year'] = df.index.strftime('%m-%Y')

df.head(10)


Out[16]:
Date Mean temparature Month_Year
Date
1988-01-01 1988-01-01 -23.00 01-1988
1988-01-02 1988-01-02 -20.50 01-1988
1988-01-03 1988-01-03 -22.00 01-1988
1988-01-04 1988-01-04 -30.50 01-1988
1988-01-05 1988-01-05 -31.00 01-1988
1988-01-06 1988-01-06 -27.50 01-1988
1988-01-07 1988-01-07 -26.25 01-1988
1988-01-08 1988-01-08 -26.50 01-1988
1988-01-09 1988-01-09 -23.00 01-1988
1988-01-10 1988-01-10 -23.50 01-1988

In [17]:
#Calculate month-wise mean, median and standard deviation of the temperature.
#The aggregations are given by name: passing np.mean/np.median/np.std to
#aggregate is deprecated in recent pandas, and the strings pin the groupby
#implementations (including the groupby 'std') that the callables mapped to.
#reset_index turns the Month_Year group key back into an ordinary column.
monthly_stats = (
    df.groupby(by='Month_Year')['Mean temparature']
      .aggregate(['mean', 'median', 'std'])
      .reset_index()
)
monthly_stats.head(10)


Out[17]:
Month_Year mean median std
0 01-1988 -22.137097 -23.00 5.260640
1 01-1989 -17.129032 -18.00 8.250725
2 01-1990 -15.112903 -12.00 6.606764
3 01-1991 -23.038710 -24.50 7.095570
4 02-1988 -19.025862 -19.50 8.598522
5 02-1989 -19.267857 -19.25 8.092042
6 02-1990 -17.482143 -16.50 8.018477
7 02-1991 -10.967857 -12.15 8.220753
8 03-1988 -8.258065 -9.25 5.341459
9 03-1989 -12.508065 -9.50 8.289925

In [18]:
#Create Year and Month columns and sort by them so the rows are in
#chronological order (the 'MM-YYYY' strings sort by month first).
#Month_Year is parsed once, vectorized, and both columns are derived from it.
month_year_parsed = pd.to_datetime(monthly_stats['Month_Year'], format='%m-%Y')
monthly_stats['Year'] = month_year_parsed.dt.strftime('%Y')
monthly_stats['Month'] = month_year_parsed.dt.strftime('%m')
#Both columns are zero-padded strings, so a lexicographic sort is chronological
monthly_stats = monthly_stats.sort_values(by=['Year', 'Month'])
monthly_stats.head(10)


Out[18]:
Month_Year mean median std Year Month
0 01-1988 -22.137097 -23.000 5.260640 1988 01
4 02-1988 -19.025862 -19.500 8.598522 1988 02
8 03-1988 -8.258065 -9.250 5.341459 1988 03
12 04-1988 2.641667 1.875 5.057720 1988 04
16 05-1988 11.290323 11.000 6.254364 1988 05
20 06-1988 19.291667 19.000 3.909032 1988 06
24 07-1988 19.048387 18.500 3.073692 1988 07
28 08-1988 17.379032 18.000 3.183205 1988 08
32 09-1988 10.675000 10.750 3.880294 1988 09
36 10-1988 2.467742 3.000 6.697245 1988 10

In [19]:
#Use Month_Year as the row index while keeping it as a regular column too
monthly_stats = monthly_stats.set_index('Month_Year', drop=False)

In [20]:
#Plot the monthly mean and standard deviation of the temperature
fig, ax = plt.subplots(figsize=(5.5, 5.5))

#Mean in blue, standard deviation in red, both against the Month_Year index
monthly_stats['mean'].plot(ax=ax, color='b')
monthly_stats['std'].plot(ax=ax, color='r')

ax.set(title='Monthly statistics: Mean (blue) & Std. Dev. (red)')

#Write the figure to disk as a 300 dpi PNG
fig.savefig('plots/ch2/B07887_02_04.png', format='png', dpi=300)



In [21]:
#Weekly moving average: mean over a rolling window of 7 observations of the
#original mean daily temperature series
weekly_moving_average = df['Mean temparature'].rolling(window=7).mean()

In [22]:
#Monthly moving average: mean over a rolling window of 30 observations of the
#original mean daily temperature series
monthly_moving_average = df['Mean temparature'].rolling(window=30).mean()

In [23]:
#Calculate the weekly and monthly averages again, keeping every second value
#(a stride of length 2)
weekly_moving_average_2stride = df['Mean temparature'].rolling(window=7).mean().iloc[::2]
monthly_moving_average_2stride = df['Mean temparature'].rolling(window=30).mean().iloc[::2]

In [24]:
#Plot the original time series with its weekly and monthly moving averages
#on three stacked panels that share the x-axis
fig, axarr = plt.subplots(3, sharex=True)
#Width and height in inches. The height must be the float 5.5: written as
#`5,5` it is split into two arguments (height 5 plus a stray third argument).
fig.set_size_inches(5.5, 5.5)

#Top panel: raw daily series
df['Mean temparature'].plot(ax=axarr[0], color='b')
axarr[0].set_title('Daily mean temparature')

#Middle panel: 7-observation rolling mean
weekly_moving_average.plot(ax=axarr[1], color='r')
axarr[1].set_title('Weekly moving average')

#Bottom panel: 30-observation rolling mean
monthly_moving_average.plot(ax=axarr[2], color='g')
axarr[2].set_title('Monthly moving average')

plt.savefig('plots/ch2/B07887_02_05.png', format='png', dpi=300)