10 Minutes to pandas source

This is a short introduction to pandas, geared mainly toward new users. You can find more complex recipes in the Cookbook.


In [1]:
%pylab


Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

Object Creation


In [3]:
# np.nan marks a missing entry; pandas assigns the default integer index 0..5.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [4]:
s


Out[4]:
0     1
1     3
2     5
3   NaN
4     6
5     8
dtype: float64

In [5]:
# Six consecutive daily timestamps, starting at 2013-01-01.
dates = pd.date_range(start='20130101', periods=6)

In [6]:
dates


Out[6]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2013-01-01, ..., 2013-01-06]
Length: 6, Freq: D, Timezone: None

In [6]:


In [7]:
# 6x4 frame of standard-normal draws: rows are labelled by `dates`,
# columns by the letters 'A'..'D'.
# NOTE(review): no random seed is set in the visible cells, so a re-run
# produces different numbers from the outputs shown below.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)

In [8]:
df


Out[8]:
A B C D
2013-01-01 -1.243312 1.441631 0.371263 -0.135514
2013-01-02 -1.366575 0.301990 -0.402715 -0.337147
2013-01-03 -1.912022 0.312768 -0.689279 0.974364
2013-01-04 -3.212095 1.459726 -1.997869 -0.411211
2013-01-05 -1.814442 0.732649 -0.573100 0.012055
2013-01-06 -0.457683 -0.108085 1.350276 -1.633954

In [8]:


In [9]:
# Build a frame from a dict of column values. The scalar entries
# ('A', 'B', 'E') are repeated for every row of the array-like
# columns ('C', 'D'), giving four rows in total.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': 'foo',
})

In [10]:
df2


Out[10]:
A B C D E
0 1 2013-01-02 1 3 foo
1 1 2013-01-02 1 3 foo
2 1 2013-01-02 1 3 foo
3 1 2013-01-02 1 3 foo

In [11]:
df2.dtypes


Out[11]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
dtype: object

In [11]:

Viewing Data


In [12]:
df.head()


Out[12]:
A B C D
2013-01-01 -1.243312 1.441631 0.371263 -0.135514
2013-01-02 -1.366575 0.301990 -0.402715 -0.337147
2013-01-03 -1.912022 0.312768 -0.689279 0.974364
2013-01-04 -3.212095 1.459726 -1.997869 -0.411211
2013-01-05 -1.814442 0.732649 -0.573100 0.012055

In [13]:
df.tail(3)


Out[13]:
A B C D
2013-01-04 -3.212095 1.459726 -1.997869 -0.411211
2013-01-05 -1.814442 0.732649 -0.573100 0.012055
2013-01-06 -0.457683 -0.108085 1.350276 -1.633954

In [14]:
df.index


Out[14]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2013-01-01, ..., 2013-01-06]
Length: 6, Freq: D, Timezone: None

In [15]:
df.columns


Out[15]:
Index(['A', 'B', 'C', 'D'], dtype='object')

In [16]:
df.values


Out[16]:
array([[-1.24331225,  1.44163057,  0.37126291, -0.13551394],
       [-1.36657532,  0.30198969, -0.40271479, -0.33714672],
       [-1.91202174,  0.31276772, -0.6892787 ,  0.97436424],
       [-3.21209523,  1.45972626, -1.99786948, -0.41121143],
       [-1.81444202,  0.73264875, -0.57310016,  0.01205508],
       [-0.45768254, -0.10808484,  1.35027643, -1.63395423]])

In [17]:
df.describe()


Out[17]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -1.667688 0.690113 -0.323571 -0.255235
std 0.916658 0.646381 1.121615 0.840190
min -3.212095 -0.108085 -1.997869 -1.633954
25% -1.887627 0.304684 -0.660234 -0.392695
50% -1.590509 0.522708 -0.487907 -0.236330
75% -1.274128 1.264385 0.177768 -0.024837
max -0.457683 1.459726 1.350276 0.974364

In [18]:
df.T


Out[18]:
2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00 2013-01-04 00:00:00 2013-01-05 00:00:00 2013-01-06 00:00:00
A -1.243312 -1.366575 -1.912022 -3.212095 -1.814442 -0.457683
B 1.441631 0.301990 0.312768 1.459726 0.732649 -0.108085
C 0.371263 -0.402715 -0.689279 -1.997869 -0.573100 1.350276
D -0.135514 -0.337147 0.974364 -0.411211 0.012055 -1.633954

In [18]:


In [19]:
df.sort_index(axis=1, ascending=False)


Out[19]:
D C B A
2013-01-01 -0.135514 0.371263 1.441631 -1.243312
2013-01-02 -0.337147 -0.402715 0.301990 -1.366575
2013-01-03 0.974364 -0.689279 0.312768 -1.912022
2013-01-04 -0.411211 -1.997869 1.459726 -3.212095
2013-01-05 0.012055 -0.573100 0.732649 -1.814442
2013-01-06 -1.633954 1.350276 -0.108085 -0.457683

In [20]:
# Sort rows by the values in column 'B' (ascending). DataFrame.sort() was
# removed from pandas; sort_values() is its replacement (pandas >= 0.17).
df.sort_values(by='B')


Out[20]:
A B C D
2013-01-06 -0.457683 -0.108085 1.350276 -1.633954
2013-01-02 -1.366575 0.301990 -0.402715 -0.337147
2013-01-03 -1.912022 0.312768 -0.689279 0.974364
2013-01-05 -1.814442 0.732649 -0.573100 0.012055
2013-01-01 -1.243312 1.441631 0.371263 -0.135514
2013-01-04 -3.212095 1.459726 -1.997869 -0.411211

Selection


In [21]:
df['A']


Out[21]:
2013-01-01   -1.243312
2013-01-02   -1.366575
2013-01-03   -1.912022
2013-01-04   -3.212095
2013-01-05   -1.814442
2013-01-06   -0.457683
Freq: D, Name: A, dtype: float64

Passing a slice to [] selects rows: integer slices are positional, while label slices on the index include both endpoints.


In [22]:
df[0:3]


Out[22]:
A B C D
2013-01-01 -1.243312 1.441631 0.371263 -0.135514
2013-01-02 -1.366575 0.301990 -0.402715 -0.337147
2013-01-03 -1.912022 0.312768 -0.689279 0.974364

In [23]:
df['20130102':'20130104']


Out[23]:
A B C D
2013-01-02 -1.366575 0.301990 -0.402715 -0.337147
2013-01-03 -1.912022 0.312768 -0.689279 0.974364
2013-01-04 -3.212095 1.459726 -1.997869 -0.411211

Selection by Label


In [24]:
df.loc[dates[0]]


Out[24]:
A   -1.243312
B    1.441631
C    0.371263
D   -0.135514
Name: 2013-01-01 00:00:00, dtype: float64

In [25]:
df.loc[:,['A','B']]


Out[25]:
A B
2013-01-01 -1.243312 1.441631
2013-01-02 -1.366575 0.301990
2013-01-03 -1.912022 0.312768
2013-01-04 -3.212095 1.459726
2013-01-05 -1.814442 0.732649
2013-01-06 -0.457683 -0.108085

In [26]:
df.loc['20130102':'20130104',['A','B']]


Out[26]:
A B
2013-01-02 -1.366575 0.301990
2013-01-03 -1.912022 0.312768
2013-01-04 -3.212095 1.459726

In [27]:
df.loc['20130102',['A','B']]


Out[27]:
A   -1.366575
B    0.301990
Name: 2013-01-02 00:00:00, dtype: float64

In [28]:
df.loc[dates[0],'A']


Out[28]:
-1.2433122526602816

In [29]:
df.at[dates[0],'A']


Out[29]:
-1.2433122526602816

Selection by Position


In [30]:
df.iloc[3]


Out[30]:
A   -3.212095
B    1.459726
C   -1.997869
D   -0.411211
Name: 2013-01-04 00:00:00, dtype: float64

In [31]:
df.iloc[3:5,0:2]


Out[31]:
A B
2013-01-04 -3.212095 1.459726
2013-01-05 -1.814442 0.732649

In [32]:
df.iloc[[1,2,4],[0,2]]


Out[32]:
A C
2013-01-02 -1.366575 -0.402715
2013-01-03 -1.912022 -0.689279
2013-01-05 -1.814442 -0.573100

In [33]:
df.iloc[1:3,:]


Out[33]:
A B C D
2013-01-02 -1.366575 0.301990 -0.402715 -0.337147
2013-01-03 -1.912022 0.312768 -0.689279 0.974364

In [34]:
df.iloc[:,1:3]


Out[34]:
B C
2013-01-01 1.441631 0.371263
2013-01-02 0.301990 -0.402715
2013-01-03 0.312768 -0.689279
2013-01-04 1.459726 -1.997869
2013-01-05 0.732649 -0.573100
2013-01-06 -0.108085 1.350276

In [35]:
df.iloc[1,1]


Out[35]:
0.30198969390842811

For fast access to a scalar (equivalent to the prior method):


In [36]:
df.iat[1,1]


Out[36]:
0.30198969390842811

Boolean Indexing


In [ ]:


In [29]:


In [29]:


In [29]: