In [83]:

    
%matplotlib inline



In [140]:

    
from datetime import datetime

import numpy as np
from pandas import Series, DataFrame
import pandas as pd



In [85]:

    
import pylab as plt

Introduction to Pandas



In [86]:

    
# plot a random walk
plt.plot(np.random.randn(1000).cumsum())









    Out[86]:





[<matplotlib.lines.Line2D at 0x7f9d5e0ae250>]

play with numpy arrays



In [87]:

    
np.arange(5)









    Out[87]:





array([0, 1, 2, 3, 4])



In [88]:

    
np.arange(5)[2]









    Out[88]:





2



In [89]:

    
np.arange(5)[2:]









    Out[89]:





array([2, 3, 4])



In [90]:

    
## use pandas to add an index tothis array



In [91]:

    
index = ['a', 'b', 'c', 'd', 'e']



In [92]:

    
s = Series(np.arange(5), index=index)
s









    Out[92]:





a    0
b    1
c    2
d    3
e    4
dtype: int64



In [93]:

    
s.index









    Out[93]:





Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')



In [94]:

    
s.values









    Out[94]:





array([0, 1, 2, 3, 4])



In [95]:

    
# Series objects behave like numpy arrays
s * 2









    Out[95]:





a    0
b    2
c    4
d    6
e    8
dtype: int64



In [96]:

    
s * s









    Out[96]:





a     0
b     1
c     4
d     9
e    16
dtype: int64



In [97]:

    
s[:3]









    Out[97]:





a    0
b    1
c    2
dtype: int64



In [98]:

    
s['b']









    Out[98]:





1



In [99]:

    
s['b':]









    Out[99]:





b    1
c    2
d    3
e    4
dtype: int64



In [100]:

    
s[[1, 0, 2]]









    Out[100]:





b    1
a    0
c    2
dtype: int64



In [101]:

    
s[['b', 'd', 'e']]









    Out[101]:





b    1
d    3
e    4
dtype: int64

Date ranges



In [102]:

    
dates = pd.date_range('2012-07-16', '2012-07-21')



In [103]:

    
dates









    Out[103]:





DatetimeIndex(['2012-07-16', '2012-07-17', '2012-07-18', '2012-07-19',
               '2012-07-20', '2012-07-21'],
              dtype='datetime64[ns]', freq='D')



In [104]:

    
# time series of temperatures (in F) in Austin
atemps = Series([101.5, 98, 95, 99, 100, 92], index=dates)



In [105]:

    
atemps.index









    Out[105]:





DatetimeIndex(['2012-07-16', '2012-07-17', '2012-07-18', '2012-07-19',
               '2012-07-20', '2012-07-21'],
              dtype='datetime64[ns]', freq='D')



In [106]:

    
idx = atemps.index[2]
idx









    Out[106]:





Timestamp('2012-07-18 00:00:00', offset='D')



In [107]:

    
atemps[idx]









    Out[107]:





95.0



In [108]:

    
# another time series of temperatures (in F) in San Diego
sdtemps =  Series([77, 73, 77, 78, 75, 79], index=dates)

multiple time series in a DataFrame



In [109]:

    
temps = DataFrame({'austin': atemps, 'san diego': sdtemps})



In [110]:

    
temps



In [111]:

    
temps.index









    Out[111]:





DatetimeIndex(['2012-07-16', '2012-07-17', '2012-07-18', '2012-07-19',
               '2012-07-20', '2012-07-21'],
              dtype='datetime64[ns]', freq='D')



In [112]:

    
temps.columns









    Out[112]:





Index([u'austin', u'san diego'], dtype='object')



In [113]:

    
temps['san diego']









    Out[113]:





2012-07-16    77
2012-07-17    73
2012-07-18    77
2012-07-19    78
2012-07-20    75
2012-07-21    79
Freq: D, Name: san diego, dtype: int64



In [114]:

    
temps['difference'] = temps['austin'] - temps['san diego']



In [115]:

    
temps



In [116]:

    
del temps['difference']



In [117]:

    
temps



In [118]:

    
temps.austin









    Out[118]:





2012-07-16    101.5
2012-07-17     98.0
2012-07-18     95.0
2012-07-19     99.0
2012-07-20    100.0
2012-07-21     92.0
Freq: D, Name: austin, dtype: float64



In [119]:

    
# access the third row of the dataframe
temps.ix[2]









    Out[119]:





austin       95
san diego    77
Name: 2012-07-18 00:00:00, dtype: float64



In [120]:

    
# or use the index to get the same row
ridx = temps.index[2]
temps.ix[ridx]









    Out[120]:





austin       95
san diego    77
Name: 2012-07-18 00:00:00, dtype: float64



In [121]:

    
# access the 'austin' column of the third row
temps.ix[ridx, 'austin']









    Out[121]:





95.0



In [122]:

    
# access the 'austin' column of all rows (beginning w/ the third)
temps.ix[ridx:, 'austin']









    Out[122]:





2012-07-18     95
2012-07-19     99
2012-07-20    100
2012-07-21     92
Freq: D, Name: austin, dtype: float64



In [123]:

    
# access the 'austin' and 'san diego' columns of all rows (beginning w/ the third)
temps.ix[ridx:, ['austin', 'san diego']]

mean of Series



In [124]:

    
# mean of DataFrame columns
temps.mean()









    Out[124]:





austin       97.583333
san diego    76.500000
dtype: float64



In [125]:

    
# mean of DataFrame rows
temps.mean(1)









    Out[125]:





2012-07-16    89.25
2012-07-17    85.50
2012-07-18    86.00
2012-07-19    88.50
2012-07-20    87.50
2012-07-21    85.50
Freq: D, dtype: float64



In [126]:

    
# deviations from the column mean
temps - temps.mean()



In [127]:

    
# select all rows where the temperature in Austin is above 100 F
temps[temps.austin > 100]

pd.read_csv with parse_dates parameter

Can't show this because the tutorial materials are no longer available. (Video: starts at 40m)

Timestamp



In [129]:

    
# starts at 53m

# Timestamp is a subclass of datetime.datetime which supports nanoseconds
pd.Timestamp.mro()









    Out[129]:





[pandas.tslib.Timestamp,
 pandas.tslib._Timestamp,
 datetime.datetime,
 datetime.date,
 object]



In [136]:

    
pd.Timestamp('2016')









    Out[136]:





Timestamp('2016-01-01 00:00:00')



In [151]:

    
pd.Timestamp('7/2/2012 10:06')









    Out[151]:





Timestamp('2012-07-02 10:06:00')



In [154]:

    
# month-first vs. day-first date parsing
print pd.to_datetime('7/2/2012')
print pd.to_datetime('7/2/2012', dayfirst=True)









    



2012-07-02 00:00:00
2012-02-07 00:00:00



In [137]:

    
stamp = pd.Timestamp('17.7.2012 1006')
stamp









    Out[137]:





Timestamp('2012-07-17 10:06:00')



In [138]:

    
stamp.strftime("%Y%m%d")









    Out[138]:





'20120717'



In [139]:

    
stamp.value # nanosecond timestamp









    Out[139]:





1342519560000000000



In [141]:

    
dates = ['2012-07-16', '2012-07-18', '2012-07-20']



In [144]:

    
date_series = Series(range(3), index=dates)
date_series









    Out[144]:





2012-07-16    0
2012-07-18    1
2012-07-20    2
dtype: int64



In [145]:

    
date_series.index[0] # the index is still a string!









    Out[145]:





'2012-07-16'



In [146]:

    
pd.DatetimeIndex(dates)









    Out[146]:





DatetimeIndex(['2012-07-16', '2012-07-18', '2012-07-20'], dtype='datetime64[ns]', freq=None)



In [149]:

    
# create a series with a proper datetime index
date_series = Series(range(3), index=pd.DatetimeIndex(dates))
print date_series
print date_series.index[0]









    



2012-07-16    0
2012-07-18    1
2012-07-20    2
dtype: int64
2012-07-16 00:00:00

Time Series indexing



In [157]:

    
pd.date_range('2012-07-12', periods=5)









    Out[157]:





DatetimeIndex(['2012-07-12', '2012-07-13', '2012-07-14', '2012-07-15',
               '2012-07-16'],
              dtype='datetime64[ns]', freq='D')



In [159]:

    
rng = pd.date_range('2012-07-12', periods=1000)
rng









    Out[159]:





DatetimeIndex(['2012-07-12', '2012-07-13', '2012-07-14', '2012-07-15',
               '2012-07-16', '2012-07-17', '2012-07-18', '2012-07-19',
               '2012-07-20', '2012-07-21',
               ...
               '2015-03-29', '2015-03-30', '2015-03-31', '2015-04-01',
               '2015-04-02', '2015-04-03', '2015-04-04', '2015-04-05',
               '2015-04-06', '2015-04-07'],
              dtype='datetime64[ns]', length=1000, freq='D')



In [160]:

    
ts = Series(np.arange(1000), index=rng)
ts









    Out[160]:





2012-07-12      0
2012-07-13      1
2012-07-14      2
2012-07-15      3
2012-07-16      4
2012-07-17      5
2012-07-18      6
2012-07-19      7
2012-07-20      8
2012-07-21      9
2012-07-22     10
2012-07-23     11
2012-07-24     12
2012-07-25     13
2012-07-26     14
2012-07-27     15
2012-07-28     16
2012-07-29     17
2012-07-30     18
2012-07-31     19
2012-08-01     20
2012-08-02     21
2012-08-03     22
2012-08-04     23
2012-08-05     24
2012-08-06     25
2012-08-07     26
2012-08-08     27
2012-08-09     28
2012-08-10     29
             ... 
2015-03-09    970
2015-03-10    971
2015-03-11    972
2015-03-12    973
2015-03-13    974
2015-03-14    975
2015-03-15    976
2015-03-16    977
2015-03-17    978
2015-03-18    979
2015-03-19    980
2015-03-20    981
2015-03-21    982
2015-03-22    983
2015-03-23    984
2015-03-24    985
2015-03-25    986
2015-03-26    987
2015-03-27    988
2015-03-28    989
2015-03-29    990
2015-03-30    991
2015-03-31    992
2015-04-01    993
2015-04-02    994
2015-04-03    995
2015-04-04    996
2015-04-05    997
2015-04-06    998
2015-04-07    999
Freq: D, dtype: int64



In [161]:

    
ts[datetime(2013, 1, 29)]









    Out[161]:





201



In [163]:

    
ts['2013-01-29']









    Out[163]:





201



In [162]:

    
ts[ts.index[201]]









    Out[162]:





201



In [165]:

    
# attention: slicing using datestrings includes the end element!
# this will only work on ordered time series!
ts[:'2012-07-20']









    Out[165]:





2012-07-12    0
2012-07-13    1
2012-07-14    2
2012-07-15    3
2012-07-16    4
2012-07-17    5
2012-07-18    6
2012-07-19    7
2012-07-20    8
Freq: D, dtype: int64



In [ ]:

	austin	san diego
2012-07-16	3.916667	0.5
2012-07-17	0.416667	-3.5
2012-07-18	-2.583333	0.5
2012-07-19	1.416667	1.5
2012-07-20	2.416667	-1.5
2012-07-21	-5.583333	2.5

	austin	san diego
2012-07-16	101.5	77
2012-07-17	98.0	73
2012-07-18	95.0	77
2012-07-19	99.0	78
2012-07-20	100.0	75
2012-07-21	92.0	79

	austin	san diego	difference
2012-07-16	101.5	77	24.5
2012-07-17	98.0	73	25.0
2012-07-18	95.0	77	18.0
2012-07-19	99.0	78	21.0
2012-07-20	100.0	75	25.0
2012-07-21	92.0	79	13.0