Working with time-series data is an important part of data analysis.
Starting with v0.8, the pandas library has included a rich API for time-series manipulations.
The pandas time-series API includes:
In [1]:
from datetime import datetime, date, time
import sys
sys.version
Out[1]:
In [2]:
import pandas as pd
from pandas import Series, DataFrame, Panel
pd.__version__
Out[2]:
In [3]:
import numpy as np
In [4]:
np.__version__
Out[4]:
In [ ]:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc('figure', figsize=(10, 8))
mpl.__version__
Sample trade ticks from 2011-11-01 to 2011-11-03 for a single security
In [ ]:
import os.path
# Check for the sample data at the same path the read_csv cell below uses
# (the file lives in the data/ subdirectory, not next to the notebook).
os.path.exists('data/data.csv')
In [ ]:
In [ ]:
# Peek at the raw CSV to see the column layout before parsing it.
# (Indentation of the `with` body was lost in the notebook export; restored here.)
with open('data/data.csv', 'r') as fh:
    print(fh.readline()) # headers
    print(fh.readline()) # first row
parse_dates: use a list or dict for flexible (possibly multi-column) date parsing
In [ ]:
# Parse the CSV, merging the separate 'Date' and 'Time' columns into a
# single datetime column named 'Timestamp', which is then used as the index.
data = pd.read_csv('data/data.csv',
parse_dates={'Timestamp': ['Date', 'Time']},
index_col='Timestamp')
# Show the first few rows to confirm the parse.
data.head()
Narrow Data down to just Timestamp (key), Price and Volume. head shows the first few rows.
In [ ]:
# Keep only the Price and Volume columns (all rows).
# NOTE(review): .ix is deprecated and removed in pandas >= 1.0 -- modern
# code would use data.loc[:, ['Price', 'Volume']].
ticks = data.ix[:, ['Price', 'Volume']]
ticks.head()
In [ ]:
type(data)
In [ ]:
ticks.count()
In [ ]:
bars = ticks.Price.resample('1min').ohlc()
bars
In [ ]:
bars.describe()
In [ ]:
minute_range = bars.high - bars.low
minute_range.describe()
In [ ]:
minute_return = bars.close / bars.open - 1
minute_return.describe()
Compute a VWAP using resample
In [ ]:
# Volume-weighted average price per minute:
# ticks has exactly two columns (Price, Volume), so prod(axis=1) gives the
# traded value (Price * Volume) of each tick; summing value and volume per
# minute and dividing yields the VWAP.
volume = ticks.Volume.resample('1min').sum()
value = ticks.prod(axis=1).resample('1min').sum()
vwap = value / volume
In [ ]:
vwap.ix['2011-11-01 09:27':'2011-11-01 09:32']
In [ ]:
bars.open.at_time('9:30')
In [ ]:
bars.close.at_time('16:00')
In [ ]:
filtered = vwap.between_time('10:00', '16:00')
filtered.head(20)
In [ ]:
vol = volume.between_time('10:00', '16:00')
vol.head(20)
In [ ]:
filtered.ix['2011-11-03':'2011-11-04'].head(20)
In [ ]:
filled = filtered.fillna(method='pad', limit=1)
filled.ix['2011-11-03':'2011-11-04'].head(20)
In [ ]:
vol = vol.fillna(0.)
vol.head(20)
In [ ]:
filled.ix['2011-11-03':'2011-11-04'].plot()
plt.ylim(103.5, 104.5)
In [ ]:
vwap.ix['2011-11-03':'2011-11-04'].plot()
plt.ylim(103.5, 104.5)
vol.ix['2011-11-03':'2011-11-04'].plot(secondary_y=True, style='r')
In [ ]:
ticks.head()
shift realigns values
In [ ]:
ticks.shift(1).head()
In [ ]:
ticks.shift(-1).head()
tshift manipulates index values
In [ ]:
ticks.tshift(1, 'min').head()
In [ ]:
minute_return.head()
In [ ]:
mr = minute_return.between_time('9:30', '16:00')
mr.head()
In [ ]:
lagged = mr.shift(1)
lagged.head()
We shouldn't use shift here because it shifts by position, so the first observation of each day gets paired with the last observation of the previous day (see the 9:30 and 16:00 rows below):
In [ ]:
lagged.at_time('9:30')
In [ ]:
mr.at_time('16:00')
In [ ]:
lagged = minute_return.tshift(1, 'min').between_time('9:30', '16:00')
lagged.at_time('9:30')
Let's play
In [ ]:
pd.ols(y=mr, x=lagged)
In [ ]:
mr = vwap / bars.open - 1
mr = mr.between_time('9:30', '16:00')
lagged = mr.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged)
In [ ]:
inter = mr * vol
inter = inter.between_time('9:30', '16:00')
lagged_inter = inter.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged_inter)
Convert to percentage volume
In [ ]:
vol = vol.groupby(vol.index.day).transform(lambda x: x/x.sum())
vol.head()
Verify
In [ ]:
vol.resample('D', how='sum')
In [ ]:
inter = mr * vol
inter = inter.between_time('9:30', '16:00')
lagged_inter = inter.tshift(1, 'min').between_time('9:30', '16:00')
pd.ols(y=mr, x=lagged_inter)
Vivaldi FTW
In [ ]:
hour = vol.index.hour
hourly_volume = vol.groupby(hour).mean()
In [ ]:
hourly_volume.plot(kind='bar')
Expanding window of hourly means for volume
In [ ]:
# NOTE(review): resample('H') with no aggregation relies on old pandas
# behavior; modern pandas would require e.g. vol.resample('H').mean().
hourly = vol.resample('H')

def calc_mean(hr):
    """Expanding mean of the hourly volume observed at hour-of-day ``hr``.

    (Indentation of this def body was lost in the notebook export;
    restored here.)
    """
    hr = time(hour=hr)
    data = hourly.at_time(hr)
    # pd.expanding_mean was removed in pandas 0.18+; on modern versions
    # use data.expanding().mean().
    return pd.expanding_mean(data)

# One expanding-mean series per trading hour (10:00-15:00), stitched
# back together and re-sorted into time order.
df = pd.concat([calc_mean(hr) for hr in range(10, 16)])
df = df.sort_index()
df
Compute deviations from the hourly means
In [ ]:
clean_vol = vol.between_time('10:00', '15:59')
dev = clean_vol - df.reindex(clean_vol.index, method='pad') # be careful over day boundaries
dev
In [ ]:
inter = mr * dev
inter = inter.between_time('10:00', '15:59')
pd.ols(y=mr, x=inter.tshift(1, 'min'))
pd.date_range
In [ ]:
rng = pd.date_range('2005', '2012', freq='M')
rng
In [ ]:
pd.date_range('2005', periods=7*12, freq='M')
In [ ]:
pd.date_range(end='2012', periods=7*12, freq='M')
| Name | Description |
| D | Calendar day |
| B | Business day |
| M | Calendar end of month |
| MS | Calendar start of month |
| BM | Business end of month |
| BMS | Business start of month |
| W-{MON, TUE,...} | Week ending on Monday, Tuesday, ... |
| Q-{JAN, FEB,...} | Quarter end with year ending January, February... |
| QS-{JAN, FEB,...} | Quarter start with year ending January, February... |
| BQ-{JAN, FEB,...} | Business quarter end with year ending January, February... |
| BQS-{JAN, FEB,...} | Business quarter start with year ending January, February... |
| A-{JAN, FEB, ...} | Year end (December) |
| AS-{JAN, FEB, ...} | Year start (December) |
| BA-{JAN, FEB, ...} | Business year end (December) |
| BAS-{JAN, FEB, ...} | Business year start (December) |
| H | Hour |
| T | Minute |
| s | Second |
| L, ms | Millisecond |
| U | Microsecond |
Anchored offsets
In [ ]:
pd.date_range('2005', periods=4, freq='Q')
In [ ]:
pd.date_range('2005', periods=4, freq='Q-NOV')
Week anchor indicates end of week
In [ ]:
wkrng = pd.date_range('2012-10-25', periods=3, freq='W')
wkrng
In [ ]:
wkrng[0].dayofweek
Year anchor indicates year ending month
In [ ]:
pd.date_range('2005', periods=3, freq='A-JUN')
DatetimeIndex is a subclass of Index
In [ ]:
isinstance(rng, pd.Index)
In [ ]:
rng[2:4]
Use it for Series/DataFrame labelling
In [ ]:
# `randn` is never imported in this file (it was only in scope under
# IPython's old --pylab mode), so qualify it through np.random.
s = Series(np.random.randn(len(rng)), rng)
s.head()
In [ ]:
# As above: qualify `randn` explicitly -- it is not imported in this file.
df = DataFrame(np.random.randn(len(rng), 3), rng, ['X', 'Y', 'Z'])
df.head()
In [ ]:
s[datetime(2005, 1, 31) : datetime(2006, 12, 31)] #slice end inclusive
In [ ]:
df['2005-1-31':'2006-12-31']
Partial indexing
In [ ]:
s['2005':'2006']
positional indexing still works
In [ ]:
df[:2] # slice end exclusive
Elements boxed as Timestamp (subclass of datetime.datetime)
In [ ]:
elm = rng[0]
elm
In [ ]:
isinstance(elm, datetime)
Why do we need this subclass?
In [ ]:
elm.nanosecond
Implemented internally using numpy.datetime64 (dtype='M8[ns]')
In [ ]:
val = rng.values
In [ ]:
type(val)
In [ ]:
val.dtype
Upgrade Numpy to 1.7b to fix repr issue
In [ ]:
val[0]
Or use DatetimeIndex.asobject for workaround
In [ ]:
rng.asobject.values[0]
In [ ]:
rng.asobject
In [ ]:
rng.to_pydatetime()
In [ ]:
rng.to_pydatetime()[0]
Integer representation
In [ ]:
type(rng.asi8)
In [ ]:
rng.asi8.dtype
In [ ]:
rng.asi8[0]
In [ ]:
s.index.freqstr
In [ ]:
s.resample('30D').head(10)
In [ ]:
s.resample('30D', fill_method='ffill').head(10)
Upsampling
In [ ]:
s.ix[:3].resample('W')
In [ ]:
s.ix[:3].resample('W', fill_method='ffill')
asfreq
In [ ]:
s.asfreq('Q').head()
In [ ]:
s.resample('Q', 'last').head()
closed: 'left' or 'right' bin edge is closed (default is 'right')
In [ ]:
s.resample('Q').head()
In [ ]:
s.ix[3:6].mean()
In [ ]:
s.resample('Q', closed='left').head()
In [ ]:
s.ix[2:5].mean()
label: label the bin with 'left' or 'right' edge (default is 'right')
In [ ]:
s.resample('Q').head()
In [ ]:
s.resample('Q', label='left').head()
loffset: shift the result index
In [ ]:
s.resample('Q', label='left', loffset='-1D').head()
In [ ]:
rng.tz
In [ ]:
d = rng[0]
d
In [ ]:
d.tz
In [ ]:
localized = rng.tz_localize('US/Eastern')
Localization assumes naive time is local (and not UTC)
In [ ]:
localized[0]
In [ ]:
localized.asi8[0]
In [ ]:
rng.asi8[0]
In [ ]:
d_utc = d.tz_localize('UTC')
d_utc
In [ ]:
d_utc.tz_localize('US/Eastern')
In [ ]:
localized.tz_convert('UTC')
In [ ]:
d_ny = d_utc.tz_convert('US/Eastern')
d_ny
In [ ]:
rng.tz_convert('US/Eastern')
A lot of time series data is better represented as intervals of time rather than points in time.
This is represented in pandas as Period and PeriodIndex
In [ ]:
p = pd.Period('2005', 'A')
p
In [ ]:
pd.Period('2006Q1', 'Q-MAR')
In [ ]:
pd.Period('2007-1-1', 'B')
Period has no xxx-start frequencies (e.g. 'AS' raises an error, since a period spans the whole interval rather than anchoring to its start or end):
In [ ]:
pd.Period('2005', 'AS')
In [ ]:
pd.period_range('2005', '2012', freq='A')
In [ ]:
prng = pd.period_range('2005', periods=7, freq='A')
prng
In [ ]:
p
In [ ]:
p.to_timestamp()
In [ ]:
p.to_timestamp('M', 's')
In [ ]:
p.to_timestamp('M', 'e')
In [ ]:
prng.to_timestamp(how='e')
In [ ]:
prng.to_timestamp('M', 'e')
In [ ]:
rng
In [ ]:
rng.to_period()
In [ ]:
rng.to_period('D')
In [ ]:
p
In [ ]:
p.end_time
In [ ]:
datetime(2005, 12, 31, 10, 0, 0) < p.end_time # WAT?!
Look for a 0.9.1 bugfix release next week
In [ ]: