Part 4: Time Series Analysis in Pandas


In [ ]:
"""
----------------------------------------------------------------------
Filename : 04_time_series_in_pandas.py
Date     : 12th Dec, 2013
Author   : Jaidev Deshpande
Purpose  : Introduction to time series analysis and plotting in Pandas
Libraries: Pandas, NumPy, Matplotlib
----------------------------------------------------------------------
"""

In [ ]:
# standard library imports
from datetime import date

# third-party library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [ ]:
def create_timestamp_sample(seed=None):
    """
    Create a Pandas `Series` of random data with a daily `DatetimeIndex`
    running from 1st January 2000 to 31st December 2013 (inclusive).

    Parameters
    ----------
    seed : int, optional
        When given, the values are drawn from a private
        `numpy.random.RandomState(seed)` so repeated calls return the same
        data. When omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.Series
        Standard-normal samples, one per calendar day.
    """
    # ISO 8601 strings are unambiguous; a day-first string such as
    # '31/12/2013' only parses through pandas' fallback date inference.
    times = pd.date_range(start='2000-01-01', end='2013-12-31')
    rng = np.random if seed is None else np.random.RandomState(seed)
    data = rng.randn(len(times))
    return pd.Series(data, index=times)

In [ ]:
# Build the sample series that the following cells index, slice and resample
ts = create_timestamp_sample()

In [ ]:
# Select the value stamped 1st January 2000, using a date string as the key
print(ts['1/1/2000'])

In [ ]:
# Create a date object and use it to look up a value.
# A plain `datetime.date` is not accepted as a DatetimeIndex key by
# current pandas releases, so convert it to a `pd.Timestamp` first.
dt = date(2000,1,1)
print(ts[pd.Timestamp(dt)])

In [ ]:
# Slicing a time series by dates (both end points are included).
# ISO 8601 strings avoid mixing month-first ('1/1/2000') and
# day-first ('31/1/2000') notations in the same slice.
print(ts['2000-01-01':'2000-01-31'])

In [ ]:
# Changing the frequency of a time series: convert the daily data to the
# 'M' frequency (method='bfill' back-fills any timestamp that has no value),
# then draw the original and the converted series one above the other.
converted_s = ts.asfreq('M', method='bfill')

fig, axes = plt.subplots(nrows=2, ncols=1)
panels = (
    (ts, 'Daily, for 14 years', {'figsize': (20, 10)}),
    (converted_s, 'Monthly, for 14 years', {}),
)
for ax, (series, title, plot_kwargs) in zip(axes, panels):
    series.plot(ax=ax, **plot_kwargs)
    ax.set_title(title)
plt.show()

In [ ]:
def series_interpolation(n_sample, n_points=1000):
    """
    Build a sine-wave `Series` in which only evenly spaced samples are kept
    and every other entry is NaN.

    The sine wave is evaluated at `n_points` values between -4*pi and 4*pi.
    Positions 0, n_points/n_sample, 2*n_points/n_sample, ... (truncated to
    integers) keep their value; all remaining positions are NaN.

    Parameters
    ----------
    n_sample : int
        Approximate number of points to keep.
    n_points : int, optional
        Total length of the returned series (default 1000).

    Returns
    -------
    pandas.Series
        Float series of length `n_points` with NaN at unsampled positions.
    """
    x = np.linspace(-4*np.pi, 4*np.pi, n_points)
    y = np.sin(x)
    samples = np.arange(0, n_points, float(n_points)/n_sample).astype(int)
    # Start from NaN rather than zero: using 0 as the "missing" marker would
    # also erase genuine samples whose value is exactly 0.0.
    y_sampled = np.full((n_points,), np.nan)
    y_sampled[samples] = y[samples]
    return pd.Series(y_sampled)

In [ ]:
# Series interpolation demo: one subplot per sample size, showing the
# retained samples as red dots and the interpolated curve as a blue line.
sample_sizes = [10,25,50,75,100]
sampled_data = [series_interpolation(size) for size in sample_sizes]
fig, axes = plt.subplots(nrows=len(sample_sizes), ncols=1, figsize=(20,10))
for ax, size, data in zip(axes, sample_sizes, sampled_data):
    interpolated = data.interpolate()
    known_points = data.dropna()
    known_points.plot(style='ro', ax=ax)
    interpolated.plot(style='b-', ax=ax)
    ax.set_title("Sample Size = " + str(size))
plt.show()

In [ ]:
# Upsampling and downsampling
print(ts.index.freq)
# `resample` returns a lazy Resampler object; a fill or aggregation method
# must be called on it to obtain a Series (the old `fill_method=` keyword
# no longer exists).
upsampled = ts.resample('12H').bfill()      # two points per day, back-filled
downsampled = ts.resample('M').mean()       # one point per month: the monthly mean
print(upsampled.shape)
print(downsampled.shape)
fig, axes = plt.subplots(3,1, figsize=(20,10))
ts.plot(ax=axes[0],title='Original Data')
upsampled.plot(ax=axes[1],title='Upsampled Data')
downsampled.plot(ax=axes[2], title='Downsampled Data')
plt.show()

Exercise: Datetime Indexing

  1. Read the stock_px.csv file in the data folder into a DataFrame
  2. It contains stock market ticker data for four stocks
  3. The DataFrame should have the timestamp column as index
  4. Calculate the mean value of the 'AAPL' stock during the month of December for all leap years in the dataset.

Exercise: Interpolation


In [ ]:
def create_interpolation_exercise():
    """
    Create a `Series` of length 1000 that is mostly NaN, for the
    interpolation exercise.

    NumPy's global random state is seeded with 0, then 100 random positions
    in [0, 1000) are filled with standard-normal values. Positions may
    repeat, so fewer than 100 entries can end up non-NaN. The result is the
    same on every call.

    Returns
    -------
    pandas.Series
        Float series with random values at the chosen positions and NaN
        everywhere else.
    """
    # Names are qualified with `np.` because the notebook imports NumPy as
    # `import numpy as np`; the bare names (`zeros`, `nan`, `random.randn`)
    # are not defined in that namespace.
    np.random.seed(0)
    # Start from NaN so that a drawn value is never mistaken for "missing".
    x = np.full((1000,), np.nan)
    inds = np.random.randint(low=0, high=1000, size=(100,))
    values = np.random.randn(100)
    x[inds] = values
    return pd.Series(x)
  1. Use the function create_interpolation_exercise to create a Series randomly filled with NaNs.
  2. Interpolate this series to fill in the NaN values.
  3. Calculate the summary statistics of this interpolated series.