Getting started with pandas


In [1]:
from pandas import Series, DataFrame
import pandas as pd

In [2]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)

In [3]:
%pwd


Out[3]:
u'/Users/pmui/datascience/lecture03.more.wrangling'

Introduction to pandas data structures

Series


In [4]:
obj = Series([4, 7, -5, 3])
obj


Out[4]:
0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.name

In [6]:
obj.values


Out[6]:
array([ 4,  7, -5,  3])

In [7]:
obj.index


Out[7]:
RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2


Out[8]:
d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
obj2.index


Out[9]:
Index([u'd', u'b', u'a', u'c'], dtype='object')

In [10]:
obj2.values


Out[10]:
array([ 4,  7, -5,  3])

In [11]:
obj2['a']


Out[11]:
-5

In [12]:
obj2['d'] = 6

In [13]:
obj2[['c', 'a', 'd']]


Out[13]:
c    3
a   -5
d    6
dtype: int64

In [14]:
obj2[obj2 > 0]


Out[14]:
d    6
b    7
c    3
dtype: int64

In [15]:
obj2 * 2


Out[15]:
d    12
b    14
a   -10
c     6
dtype: int64

In [16]:
np.exp(obj2)


Out[16]:
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [17]:
'b' in obj2


Out[17]:
True

In [18]:
'e' in obj2


Out[18]:
False

In [19]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3


Out[19]:
Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [20]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4


Out[20]:
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [21]:
pd.isnull(obj4)


Out[21]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [22]:
pd.notnull(obj4)


Out[22]:
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [13]:
obj4.isnull()


Out[13]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [14]:
obj3


Out[14]:
Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [15]:
obj4


Out[15]:
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [16]:
obj3 + obj4


Out[16]:
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [17]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4


Out[17]:
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [23]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj


Out[23]:
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

DataFrame


In [24]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

In [25]:
frame


Out[25]:
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002

In [26]:
DataFrame(data, columns=['year', 'state', 'pop'])


Out[26]:
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9

In [27]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
frame2


Out[27]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN

In [28]:
frame2.columns


Out[28]:
Index([u'year', u'state', u'pop', u'debt'], dtype='object')

A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:


In [29]:
frame2['state']


Out[29]:
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [30]:
frame2.year


Out[30]:
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

Rows can also be retrieved by position or name using a couple of methods, such as the ix indexing field:


In [ ]:
frame2.ix['three']
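
In later pandas versions the ix field is deprecated in favor of loc (label-based) and iloc (position-based); a quick sketch of the equivalents:


In [ ]:
frame2.loc['three']    # the row by label
frame2.iloc[2]         # the same row by position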

Columns can be modified by assignment. For example, the empty 'debt' column could be assigned a scalar value or an array of values:


In [32]:
frame2['debt'] = 16.5
frame2


Out[32]:
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5

In [33]:
frame2['debt'] = np.arange(5.)
frame2


Out[33]:
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0

When assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. If you assign a Series, it will be instead conformed exactly to the DataFrame’s index, inserting missing values in any holes:


In [34]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2


Out[34]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7

Assigning a column that doesn’t exist will create a new column.


In [35]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2


Out[35]:
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False

The del keyword will delete columns as with a dict:


In [36]:
del frame2['eastern']
frame2.columns


Out[36]:
Index([u'year', u'state', u'pop', u'debt'], dtype='object')

A nested dict of dicts passed to DataFrame will be interpreted with the outer dict keys as the columns and the inner keys as the row indices:


In [40]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [41]:
frame3 = DataFrame(pop)
frame3


Out[41]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

In [42]:
frame3.T


Out[42]:
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6

The keys in the inner dicts are unioned and sorted to form the index in the result. This isn’t true if an explicit index is specified:


In [43]:
DataFrame(pop, index=[2001, 2002, 2003])


Out[43]:
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN

Dicts of Series are treated much in the same way:


In [44]:
frame3['Ohio'][:-1]


Out[44]:
2000    1.5
2001    1.7
Name: Ohio, dtype: float64

In [45]:
frame3['Nevada'][:2]


Out[45]:
2000    NaN
2001    2.4
Name: Nevada, dtype: float64

In [46]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
DataFrame(pdata)


Out[46]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7

If a DataFrame’s index and columns have their name attributes set, these will also be displayed:


In [47]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3


Out[47]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

Like Series, the values attribute returns the data contained in the DataFrame as a 2D ndarray:


In [48]:
frame3.values


Out[48]:
array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

If the DataFrame’s columns are different dtypes, the dtype of the values array will be chosen to accommodate all of the columns:


In [49]:
frame2.values


Out[49]:
array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

Index objects

pandas’s Index objects are responsible for holding the axis labels and other metadata (like the axis name or names). Any array or other sequence of labels used when constructing a Series or DataFrame is internally converted to an Index:


In [50]:
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index


Out[50]:
Index([u'a', u'b', u'c'], dtype='object')

In [51]:
index[1:]


Out[51]:
Index([u'b', u'c'], dtype='object')

Index objects are immutable and thus can’t be modified by the user:


In [ ]:
# attempting to modify an Index raises a TypeError,
# since Index objects are immutable

index[1] = 'd'

Immutability is important so that Index objects can be safely shared among data structures:


In [ ]:
index = pd.Index(np.arange(3))
obj2 = Series([1.5, -2.5, 0], index=index)
obj2.index is index

In addition to being array-like, an Index also functions as a fixed-size set:


In [52]:
frame3


Out[52]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

In [53]:
'Ohio' in frame3.columns


Out[53]:
True

In [54]:
2003 in frame3.index


Out[54]:
False
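
Beyond membership tests, an Index also supports set operations such as union, intersection, and difference. A quick sketch:


In [ ]:
idx = pd.Index(['a', 'b', 'c'])
idx.union(['b', 'd'])    # set union; intersection and difference work similarly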

Essential functionality

Reindexing

A critical method on pandas objects is reindex, which means to create a new object with the data conformed to a new index. Consider a simple example from above:


In [55]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj


Out[55]:
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

Calling reindex on this Series rearranges the data according to the new index, introducing missing values if any index values were not already present:


In [56]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2


Out[56]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

For ordered data like time series, it may be desirable to do some interpolation or filling in of values when reindexing. The fill_value option substitutes a constant for any labels that were missing, while the method option allows interpolation with a method such as ffill, which forward fills the values:


In [57]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)


Out[57]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [58]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')


Out[58]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [59]:
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame


Out[59]:
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8

In [60]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2


Out[60]:
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0

The columns can be reindexed using the columns keyword:


In [61]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)


Out[61]:
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8

Both can be reindexed in one shot, though interpolation will only apply row-wise (axis 0):


In [62]:
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',
              columns=states)


Out[62]:
Texas Utah California
a 1 NaN 2
b 1 NaN 2
c 4 NaN 5
d 7 NaN 8

As you’ll see soon, reindexing can be done more succinctly by label-indexing with ix:


In [ ]:
frame.ix[['a', 'b', 'c', 'd'], states]
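
In later pandas, where ix is deprecated and .loc raises a KeyError on labels absent from the index, a plain reindex remains the durable spelling; a sketch of the equivalent:


In [ ]:
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)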

Dropping entries from an axis

Dropping one or more entries from an axis is easy if you have an index array or list without those entries. As that can require a bit of munging and set logic, the drop method will return a new object with the indicated value or values deleted from an axis:


In [63]:
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj


Out[63]:
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [64]:
obj.drop(['d', 'c'])


Out[64]:
a    0.0
b    1.0
e    4.0
dtype: float64

With DataFrame, index values can be deleted from either axis:


In [65]:
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

In [66]:
data.drop(['Colorado', 'Ohio'])


Out[66]:
one two three four
Utah 8 9 10 11
New York 12 13 14 15

In [69]:
data.drop('Utah', axis=0)


Out[69]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 12 13 14 15

In [67]:
data.drop('two', axis=1)


Out[67]:
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15

In [68]:
data.drop(['two', 'four'], axis=1)


Out[68]:
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14

Indexing, selection, and filtering

Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers. Here are some examples of this:


In [72]:
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj


Out[72]:
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [73]:
obj['b']


Out[73]:
1.0

In [71]:
obj[1]


Out[71]:
1.0

In [74]:
obj[2:4]


Out[74]:
c    2.0
d    3.0
dtype: float64

In [75]:
obj[['b', 'a', 'd']]


Out[75]:
b    1.0
a    0.0
d    3.0
dtype: float64

In [76]:
obj[[1, 3]]


Out[76]:
b    1.0
d    3.0
dtype: float64

In [77]:
obj[obj < 2]


Out[77]:
a    0.0
b    1.0
dtype: float64

Slicing with labels behaves differently than normal Python slicing in that the endpoint is inclusive:


In [78]:
obj['b':'c']


Out[78]:
b    1.0
c    2.0
dtype: float64

Setting using these methods works just as you would expect:


In [79]:
obj['b':'c'] = 5
obj


Out[79]:
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

As you’ve seen above, indexing into a DataFrame retrieves one or more columns with either a single value or a sequence:


In [80]:
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data


Out[80]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [81]:
data['two']


Out[81]:
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [82]:
data[['three', 'one']]


Out[82]:
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12

Indexing like this has a few special cases. First, selecting rows by slicing or with a boolean array:


In [83]:
data[:2]


Out[83]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7

In [84]:
data[data['three'] > 5]


Out[84]:
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [85]:
data < 5


Out[85]:
one two three four
Ohio True True True True
Colorado True False False False
Utah False False False False
New York False False False False

In [86]:
data[data < 5] = 0

In [87]:
data


Out[87]:
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

The ix indexing field enables you to select a subset of the rows and columns from a DataFrame with NumPy-like notation plus axis labels. This is also a less verbose way to do reindexing:


In [88]:
data.ix['Colorado', ['two', 'three']]


Out[88]:
two      5
three    6
Name: Colorado, dtype: int64

In [89]:
data.ix[['Colorado', 'Utah'], [3, 0, 1]]


Out[89]:
four one two
Colorado 7 0 5
Utah 11 8 9

In [90]:
data.ix[2]


Out[90]:
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [91]:
data.ix[:'Utah', 'two']


Out[91]:
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [92]:
data.ix[data.three > 5, :3]


Out[92]:
one two three
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14

Arithmetic and data alignment

One of the most important pandas features is the behavior of arithmetic between objects with different indexes. When adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs. Let’s look at a simple example:


In [93]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [94]:
s1


Out[94]:
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [95]:
s2


Out[95]:
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [96]:
s1 + s2


Out[96]:
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

The internal data alignment introduces NA values in the indices that don’t overlap. Missing values propagate in arithmetic computations.
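
For example, the NA values introduced by the alignment can be located with isnull:


In [ ]:
(s1 + s2).isnull()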

In the case of DataFrame, alignment is performed on both the rows and the columns:


In [97]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1


Out[97]:
b c d
Ohio 0.0 1.0 2.0
Texas 3.0 4.0 5.0
Colorado 6.0 7.0 8.0

In [98]:
df2


Out[98]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0

Adding these together returns a DataFrame whose index and columns are the unions of the ones in each DataFrame:


In [99]:
df1 + df2


Out[99]:
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN

Arithmetic methods with fill values

In arithmetic operations between differently-indexed objects, you might want to fill with a special value, like 0, when an axis label is found in one object but not the other:


In [100]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df1


Out[100]:
a b c d
0 0.0 1.0 2.0 3.0
1 4.0 5.0 6.0 7.0
2 8.0 9.0 10.0 11.0

In [101]:
df2


Out[101]:
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 6.0 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0

In [102]:
df1 + df2


Out[102]:
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 11.0 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN

In [103]:
df1.add(df2, fill_value=0)


Out[103]:
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 11.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0

Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill value:


In [104]:
df1.reindex(columns=df2.columns, fill_value=0)


Out[104]:
a b c d e
0 0.0 1.0 2.0 3.0 0
1 4.0 5.0 6.0 7.0 0
2 8.0 9.0 10.0 11.0 0

Operations between DataFrame and Series

As with NumPy arrays, arithmetic between DataFrame and Series is well-defined. First, as a motivating example, consider the difference between a 2D array and one of its rows:


In [105]:
arr = np.arange(12.).reshape((3, 4))
arr


Out[105]:
array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [106]:
arr[0]


Out[106]:
array([ 0.,  1.,  2.,  3.])

In [107]:
arr - arr[0]


Out[107]:
array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

This is referred to as broadcasting. Operations between a DataFrame and a Series are similar:


In [108]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
frame


Out[108]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0

In [109]:
series


Out[109]:
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns, broadcasting down the rows:


In [110]:
frame - series


Out[110]:
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0

If an index value is not found in either the DataFrame’s columns or the Series’s index, the objects will be reindexed to form the union:


In [115]:
series2 = Series(range(3), index=['b', 'e', 'f'])
series2


Out[115]:
b    0
e    1
f    2
dtype: int64

In [116]:
frame + series2


Out[116]:
b d e f
Utah 0.0 NaN 3.0 NaN
Ohio 3.0 NaN 6.0 NaN
Texas 6.0 NaN 9.0 NaN
Oregon 9.0 NaN 12.0 NaN

If you want to instead broadcast over the columns, matching on the rows, you have to use one of the arithmetic methods. For example:


In [112]:
series3 = frame['d']
frame


Out[112]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0

In [113]:
series3


Out[113]:
Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [114]:
frame.sub(series3, axis=0)


Out[114]:
b d e
Utah -1.0 0.0 1.0
Ohio -1.0 0.0 1.0
Texas -1.0 0.0 1.0
Oregon -1.0 0.0 1.0

Function application and mapping

NumPy ufuncs (element-wise array methods) work fine with pandas objects:


In [117]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [118]:
frame


Out[118]:
b d e
Utah -0.204708 0.478943 -0.519439
Ohio -0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 -1.296221

In [119]:
np.abs(frame)


Out[119]:
b d e
Utah 0.204708 0.478943 0.519439
Ohio 0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 1.296221

In [120]:
f = lambda x: x.max() - x.min()

In [121]:
frame.apply(f)


Out[121]:
b    1.802165
d    1.684034
e    2.689627
dtype: float64

In [122]:
frame.apply(f, axis=1)


Out[122]:
Utah      0.998382
Ohio      2.521511
Texas     0.676115
Oregon    2.542656
dtype: float64

Many of the most common array statistics (like sum and mean) are DataFrame methods, so using apply is not necessary.
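
For instance, the column ranges computed above with apply can be written directly with the built-in reductions; a quick sketch:


In [ ]:
frame.max() - frame.min()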

The function passed to apply need not return a scalar value; it can also return a Series with multiple values:


In [123]:
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)


Out[123]:
b d e
min -0.555730 0.281746 -1.296221
max 1.246435 1.965781 1.393406

Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating point value in frame. You can do this with applymap:


In [124]:
format = lambda x: '%.2f' % x
frame.applymap(format)


Out[124]:
b d e
Utah -0.20 0.48 -0.52
Ohio -0.56 1.97 1.39
Texas 0.09 0.28 0.77
Oregon 1.25 1.01 -1.30

The reason for the name applymap is that Series has a map method for applying an element-wise function:


In [125]:
frame['e'].map(format)


Out[125]:
Utah      -0.52
Ohio       1.39
Texas      0.77
Oregon    -1.30
Name: e, dtype: object

Sorting and ranking

Sorting a data set by some criterion is another important built-in operation. To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object:


In [126]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()


Out[126]:
a    1
b    2
c    3
d    0
dtype: int64

With a DataFrame, you can sort by index on either axis:


In [127]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()


Out[127]:
d a b c
one 4 5 6 7
three 0 1 2 3

In [128]:
frame.sort_index(axis=1)


Out[128]:
a b c d
three 1 2 3 0
one 5 6 7 4

The data is sorted in ascending order by default, but can be sorted in descending order, too:


In [129]:
frame.sort_index(axis=1, ascending=False)


Out[129]:
d c b a
three 0 3 2 1
one 4 7 6 5

To sort a Series by its values, use its order method (deprecated in later pandas in favor of sort_values, as the warning below notes):


In [130]:
obj = Series([4, 7, -3, 2])
obj.order()


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: order is deprecated, use sort_values(...)
  from ipykernel import kernelapp as app
Out[130]:
2   -3
3    2
0    4
1    7
dtype: int64

In [131]:
obj.sort_values()


Out[131]:
2   -3
3    2
0    4
1    7
dtype: int64

Any missing values are sorted to the end of the Series by default:


In [133]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()


Out[133]:
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

On DataFrame, you may want to sort by the values in one or more columns. To do so, pass one or more column names to the by option:


In [134]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame


Out[134]:
a b
0 0 4
1 1 7
2 0 -3
3 1 2

In [135]:
frame.sort_index(by='b')


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
  if __name__ == '__main__':
Out[135]:
a b
2 0 -3
3 1 2
0 0 4
1 1 7

In [137]:
frame.sort_values(by='b')


Out[137]:
a b
2 0 -3
3 1 2
0 0 4
1 1 7

In [138]:
frame.sort_index(by=['a', 'b'])


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
  if __name__ == '__main__':
Out[138]:
a b
2 0 -3
0 0 4
3 1 2
1 1 7

In [139]:
frame.sort_values(by=['a', 'b'])


Out[139]:
a b
2 0 -3
0 0 4
3 1 2
1 1 7

Ranking is closely related to sorting, assigning ranks from one through the number of valid data points in an array. It is similar to the indirect sort indices produced by numpy.argsort, except that ties are broken according to a rule. The rank methods for Series and DataFrame are the place to look; by default rank breaks ties by assigning each group the mean rank:


In [141]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj


Out[141]:
0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [143]:
obj.rank?

In [142]:
obj.rank()


Out[142]:
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

Ranks can also be assigned according to the order they’re observed in the data:


In [144]:
obj.rank(method='first')


Out[144]:
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

Naturally, you can rank in descending order, too:


In [147]:
obj.rank(ascending=False, method='max')


Out[147]:
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

The method option is used for breaking ties:

  • 'average' Default: assign the average rank to each entry in the equal group.
  • 'min' Use the minimum rank for the whole group.
  • 'max' Use the maximum rank for the whole group.
  • 'first' Assign ranks in the order the values appear in the data (the sketch below compares the rules).
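
A quick sketch comparing the four rules side by side on the obj Series from above:


In [ ]:
DataFrame({how: obj.rank(method=how)
           for how in ['average', 'min', 'max', 'first']})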

In [ ]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
frame

In [ ]:
frame.rank(axis=1)

Axis indexes with duplicate values

While many pandas functions (like reindex) require that the labels be unique, it’s not mandatory. Let’s consider a small Series with duplicate indices:


In [148]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj


Out[148]:
a    0
a    1
b    2
b    3
c    4
dtype: int64

In [149]:
obj.index.is_unique


Out[149]:
False

Data selection is one of the main things that behaves differently with duplicates. Indexing a value with multiple entries returns a Series while single entries return a scalar value:


In [150]:
obj['a']


Out[150]:
a    0
a    1
dtype: int64

In [151]:
obj['c']


Out[151]:
4

The same logic extends to indexing rows in a DataFrame:


In [152]:
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df


Out[152]:
0 1 2
a 0.274992 0.228913 1.352917
a 0.886429 -2.001637 -0.371843
b 1.669025 -0.438570 -0.539741
b 0.476985 3.248944 -1.021228

In [153]:
df.ix['b']


Out[153]:
0 1 2
b 1.669025 -0.438570 -0.539741
b 0.476985 3.248944 -1.021228

Summarizing and computing descriptive statistics

pandas objects are equipped with a set of common mathematical and statistical methods. Most of these fall into the category of reductions or summary statistics, methods that extract a single value (like the sum or mean) from a Series or a Series of values from the rows or columns of a DataFrame. Compared with the equivalent methods of vanilla NumPy arrays, they are all built from the ground up to exclude missing data. Consider a small DataFrame:


In [154]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df


Out[154]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3

Calling DataFrame’s sum method returns a Series containing column sums:


In [155]:
df.sum()


Out[155]:
one    9.25
two   -5.80
dtype: float64

Passing axis=1 sums over the rows instead:


In [156]:
df.sum(axis=1)


Out[156]:
a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

NA values are excluded unless the entire slice (row or column in this case) is NA. This can be disabled using the skipna option:


In [157]:
df.mean(axis=1, skipna=False)


Out[157]:
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

Common options for each reduction method are:

  • axis: Axis to reduce over. 0 for DataFrame’s rows and 1 for columns.
  • skipna: Exclude missing values, True by default.
  • level: Reduce grouped by level if the axis is hierarchically indexed (MultiIndex); see the sketch below.
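
A minimal sketch of the level option, using a hypothetical two-level Series (hierarchical indexing is covered in depth below):


In [ ]:
s = Series([1., 2., 3., 4.],
           index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
s.sum(level=0)    # reduce within each outer-level group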

Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained:


In [158]:
df.idxmax()


Out[158]:
one    b
two    d
dtype: object

Other methods are accumulations:


In [159]:
df.cumsum()


Out[159]:
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8

Another type of method is neither a reduction nor an accumulation. describe is one such example, producing multiple summary statistics in one shot:


In [160]:
df.describe()


Out[160]:
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000

On non-numeric data, describe produces alternate summary statistics:


In [161]:
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()


Out[161]:
count     16
unique     3
top        a
freq       8
dtype: object

Correlation and covariance in finance

Some summary statistics, like correlation and covariance, are computed from pairs of arguments. Let’s consider some DataFrames of stock prices and volumes obtained from Yahoo! Finance:


In [162]:
import pandas.io.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker)

price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.iteritems()})


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/io/data.py:35: FutureWarning: 
The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.
  FutureWarning)

Now, compute percent changes of the prices:


In [201]:
price.head()


Out[201]:
AAPL GOOG IBM MSFT
Date
2010-01-04 28.141855 313.062468 114.283108 26.045432
2010-01-05 28.190509 311.683844 112.902572 26.053846
2010-01-06 27.742101 303.826685 112.169153 25.893956
2010-01-07 27.690818 296.753749 111.780878 25.624666
2010-01-08 27.874915 300.709808 112.902572 25.801387

In [163]:
returns = price.pct_change()
returns.tail()


Out[163]:
AAPL GOOG IBM MSFT
Date
2016-07-25 -0.013379 -0.003999 0.003579 0.002828
2016-07-26 -0.006883 -0.001825 -0.003259 0.000529
2016-07-27 0.064963 0.004537 -0.001789 -0.010042
2016-07-28 0.013502 0.005581 -0.002843 0.000356
2016-07-29 -0.001246 0.030674 -0.004648 0.008362

The corr method of Series computes the correlation of the overlapping, non-NA, aligned-by-index values in two Series. Relatedly, cov computes the covariance:


In [166]:
returns.MSFT.corr(returns.IBM)


Out[166]:
0.50196224207624862

In [167]:
returns.MSFT.cov(returns.IBM)


Out[167]:
9.0865799179417242e-05

DataFrame’s corr and cov methods, on the other hand, return a full correlation or covariance matrix as a DataFrame, respectively:


In [168]:
returns.corr()


Out[168]:
AAPL GOOG IBM MSFT
AAPL 1.000000 0.407796 0.388075 0.391186
GOOG 0.407796 1.000000 0.405451 0.465208
IBM 0.388075 0.405451 1.000000 0.501962
MSFT 0.391186 0.465208 0.501962 1.000000

In [169]:
returns.cov()


Out[169]:
AAPL GOOG IBM MSFT
AAPL 0.000282 0.000110 0.000080 0.000097
GOOG 0.000110 0.000258 0.000079 0.000111
IBM 0.000080 0.000079 0.000149 0.000091
MSFT 0.000097 0.000111 0.000091 0.000220

Using DataFrame’s corrwith method, you can compute pairwise correlations between a DataFrame’s columns or rows with another Series or DataFrame. Passing a Series returns a Series with the correlation value computed for each column:


In [170]:
returns.corrwith(returns.IBM)


Out[170]:
AAPL    0.388075
GOOG    0.405451
IBM     1.000000
MSFT    0.501962
dtype: float64

Passing a DataFrame computes the correlations of matching column names. Here I compute correlations of percent changes with volume:


In [171]:
returns.corrwith(volume)


Out[171]:
AAPL   -0.077827
GOOG   -0.007241
IBM    -0.207137
MSFT   -0.098255
dtype: float64

Passing axis=1 does things row-wise instead. In all cases, the data points are aligned by label before computing the correlation.
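
A sketch of the row-wise version (output omitted, since the downloaded data changes over time):


In [ ]:
returns.corrwith(volume, axis=1)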

Unique values, value counts, and membership

Another class of related methods extracts information about the values contained in a one-dimensional Series.

  • isin: Compute boolean array indicating whether each Series value is contained in the passed sequence of values.
  • unique: Compute array of unique values in a Series, returned in the order observed.
  • value_counts: Return a Series containing unique values as its index and frequencies as its values, ordered by count in descending order.

To illustrate these, consider this example:


In [172]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

The first function is unique, which gives you an array of the unique values in a Series:


In [173]:
uniques = obj.unique()
uniques


Out[173]:
array(['c', 'a', 'd', 'b'], dtype=object)

The unique values are not necessarily returned in sorted order, but could be sorted after the fact if needed (uniques.sort()).
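
For instance, a quick sketch of sorting in place (ndarray.sort modifies the array):


In [ ]:
uniques.sort()
uniques

Relatedly, value_counts computes a Series containing value frequencies: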


In [174]:
obj.value_counts()


Out[174]:
c    3
a    3
b    2
d    1
dtype: int64

The Series is sorted by value in descending order as a convenience. value_counts is also available as a top-level pandas method that can be used with any array or sequence:


In [175]:
pd.value_counts(obj.values, sort=False)


Out[175]:
a    3
c    3
b    2
d    1
dtype: int64

Lastly, isin is responsible for vectorized set membership and can be very useful in filtering a data set down to a subset of values in a Series or column in a DataFrame:


In [176]:
mask = obj.isin(['b', 'c'])
mask


Out[176]:
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [177]:
obj[mask]


Out[177]:
0    c
5    b
6    b
7    c
8    c
dtype: object

In some cases, you may want to compute a histogram on multiple related columns in a DataFrame. Here’s an example:


In [178]:
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
data


Out[178]:
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4

Passing pandas.value_counts to this DataFrame’s apply function gives:


In [179]:
result = data.apply(pd.value_counts).fillna(0)
result


Out[179]:
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 0.0 2.0 1.0
3 2.0 2.0 0.0
4 2.0 0.0 2.0
5 0.0 0.0 1.0

Handling missing data

Missing data is common in most data analysis applications. One of the goals in designing pandas was to make working with missing data as painless as possible. For example, all of the descriptive statistics on pandas objects exclude missing data as you’ve seen earlier.

pandas uses the floating point value NaN (Not a Number) to represent missing data in both floating point and non-floating point arrays. It is just used as a sentinel that can be easily detected:


In [260]:
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data


Out[260]:
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [261]:
string_data.isnull()


Out[261]:
0    False
1    False
2     True
3    False
dtype: bool

The built-in Python None value is also treated as NA in object arrays:


In [182]:
string_data[0] = None
string_data.isnull()


Out[182]:
0     True
1    False
2     True
3    False
dtype: bool

Filtering out missing data

Common NA handling methods are:

  • dropna: Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.
  • fillna: Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.
  • isnull: Return like-type object containing boolean values indicating which values are missing / NA.
  • notnull: Negation of isnull.

You have a number of options for filtering out missing data. While doing it by hand is always an option, dropna can be very helpful. On a Series, it returns the Series with only the non-null data and index values:


In [262]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()


Out[262]:
0    1.0
2    3.5
4    7.0
dtype: float64

Naturally, you could have computed this yourself by boolean indexing:


In [185]:
data[data.notnull()]


Out[185]:
0    1.0
2    3.5
4    7.0
dtype: float64

With DataFrame objects, these are a bit more complex. You may want to drop rows or columns which are all NA or just those containing any NAs. dropna by default drops any row containing a missing value:


In [186]:
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data


Out[186]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0

In [187]:
cleaned


Out[187]:
0 1 2
0 1.0 6.5 3.0

Passing how='all' will only drop rows that are all NA:


In [188]:
data.dropna(how='all')


Out[188]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0

Dropping columns in the same way is only a matter of passing axis=1:


In [189]:
data[4] = NA
data


Out[189]:
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3.0 NaN

In [190]:
data.dropna(axis=1, how='all')


Out[190]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0

A related way to filter out DataFrame rows tends to concern time series data. Suppose you want to keep only rows containing a certain number of observations. You can indicate this with the thresh argument:


In [263]:
df = DataFrame(np.random.randn(7, 3))
df.ix[:4, 1] = NA; df.ix[:2, 2] = NA
df


Out[263]:
0 1 2
0 -0.204708 NaN NaN
1 -0.555730 NaN NaN
2 0.092908 NaN NaN
3 1.246435 NaN -1.296221
4 0.274992 NaN 1.352917
5 0.886429 -2.001637 -0.371843
6 1.669025 -0.438570 -0.539741

In [264]:
df.dropna(thresh=3)


Out[264]:
0 1 2
5 0.886429 -2.001637 -0.371843
6 1.669025 -0.438570 -0.539741

Filling in missing data

Rather than filtering out missing data (and potentially discarding other data along with it), you may want to fill in the “holes” in any number of ways. For most purposes, the fillna method is the workhorse function to use. Calling fillna with a constant replaces missing values with that value:


In [265]:
df.fillna(0)


Out[265]:
0 1 2
0 -0.204708 0.000000 0.000000
1 -0.555730 0.000000 0.000000
2 0.092908 0.000000 0.000000
3 1.246435 0.000000 -1.296221
4 0.274992 0.000000 1.352917
5 0.886429 -2.001637 -0.371843
6 1.669025 -0.438570 -0.539741

Calling fillna with a dict, you can use a different fill value for each column:


In [195]:
df.fillna({1: 0.5, 2: -1})


Out[195]:
0 1 2
0 -0.577087 0.500000 -1.000000
1 0.523772 0.500000 -1.000000
2 -0.713544 0.500000 -1.000000
3 -1.860761 0.500000 0.560145
4 -1.265934 0.500000 -1.063512
5 0.332883 -2.359419 -0.199543
6 -1.541996 -0.970736 -1.307030

fillna returns a new object, but you can modify the existing object in place:


In [196]:
# always returns a reference to the filled object
_ = df.fillna(0, inplace=True)
df


Out[196]:
0 1 2
0 -0.577087 0.000000 0.000000
1 0.523772 0.000000 0.000000
2 -0.713544 0.000000 0.000000
3 -1.860761 0.000000 0.560145
4 -1.265934 0.000000 -1.063512
5 0.332883 -2.359419 -0.199543
6 -1.541996 -0.970736 -1.307030

The same interpolation methods available for reindexing can be used with fillna:


In [269]:
df = DataFrame(np.random.randn(6, 3))
df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
df


Out[269]:
0 1 2
0 -0.831154 -2.370232 -1.860761
1 -0.860757 0.560145 -1.265934
2 0.119827 NaN 0.332883
3 -2.359419 NaN -1.541996
4 -0.970736 NaN NaN
5 0.377984 NaN NaN

In [270]:
df.fillna(method='ffill')


Out[270]:
0 1 2
0 -0.831154 -2.370232 -1.860761
1 -0.860757 0.560145 -1.265934
2 0.119827 0.560145 0.332883
3 -2.359419 0.560145 -1.541996
4 -0.970736 0.560145 -1.541996
5 0.377984 0.560145 -1.541996

In [271]:
df.fillna(method='ffill', limit=2)


Out[271]:
0 1 2
0 -0.831154 -2.370232 -1.860761
1 -0.860757 0.560145 -1.265934
2 0.119827 0.560145 0.332883
3 -2.359419 0.560145 -1.541996
4 -0.970736 NaN -1.541996
5 0.377984 NaN -1.541996

With fillna you can do lots of other things with a little creativity. For example, you might pass the mean or median value of a Series:


In [199]:
data = Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())


Out[199]:
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

Hierarchical indexing

Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form. Let’s start with a simple example; create a Series with a list of lists or arrays as the index:


In [266]:
data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data


Out[266]:
a  1    0.476985
   2    3.248944
   3   -1.021228
b  1   -0.577087
   2    0.124121
   3    0.302614
c  1    0.523772
   2    0.000940
d  2    1.343810
   3   -0.713544
dtype: float64

In [267]:
data.index


Out[267]:
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [ ]:
data['b']                  # partial indexing: everything under outer label 'b'

In [ ]:
data['b':'c']              # slicing on the outer level

In [ ]:
data.ix[['b', 'd']]        # selecting a list of outer labels

In [ ]:
data[:, 2]                 # selecting on the inner level

In [ ]:
data.unstack()             # pivot the inner level into the columns

In [ ]:
data.unstack().stack()     # stack is the inverse of unstack

In [ ]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

In [ ]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

In [ ]:
frame['Ohio']    # the column labels form a MultiIndex as well

A MultiIndex can also be created by itself and then reused:


In [ ]:
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']],
                          names=['state', 'color'])

Reordering and sorting levels

At times you may need to rearrange the order of the levels on an axis or to sort the data by the values in one specific level. swaplevel takes two level numbers or names and returns a new object with the levels interchanged, while sortlevel sorts the data using only the values in a single level:


In [ ]:
frame.swaplevel('key1', 'key2')    # interchange the two row index levels

In [ ]:
frame.sortlevel(1)    # sort the rows by the values in level 1 ('key2')

In [ ]:
frame.swaplevel(0, 1).sortlevel(0)    # swap, then sort by the new outer level

Summary statistics by level

Many descriptive and summary statistics on DataFrame and Series have a level option, with which you can specify the level to aggregate by on a particular axis:


In [ ]:
frame.sum(level='key2')    # sum the rows within each key2 group

In [ ]:
frame.sum(level='color', axis=1)    # sum the columns within each color group

Using a DataFrame's columns

It is common to want to use one or more columns from a DataFrame as the row index, or conversely to move the row index into the DataFrame's columns:


In [ ]:
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
frame

In [ ]:
frame2 = frame.set_index(['c', 'd'])    # move columns c and d into the row index
frame2

In [ ]:
frame.set_index(['c', 'd'], drop=False)    # keep the columns in the frame as well

In [ ]:
frame2.reset_index()    # move the index levels back out into columns

Other pandas topics

Integer indexing

Working with pandas objects indexed by integers often trips up new users, since the indexing semantics differ from built-in Python sequences like lists and tuples:


In [ ]:
ser = Series(np.arange(3.))
ser.iloc[-1]    # with an integer index, use iloc for positional selection

In [ ]:
ser

In [ ]:
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]    # with a non-integer index, [-1] falls back to positional selection

In [ ]:
ser.ix[:1]    # label-based slicing on an integer index includes the endpoint

In [ ]:
ser3 = Series(range(3), index=[-5, 1, 3])
ser3.iloc[2]    # strictly positional, regardless of the index labels

In [ ]:
frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])
frame.iloc[0]    # the first row by position, not the row labeled 0

Panel data

pandas has a Panel data structure, a three-dimensional analogue of DataFrame (Panel was deprecated and later removed in newer pandas versions in favor of hierarchically-indexed DataFrames):


In [ ]:
import pandas.io.data as web

# items = tickers, major axis = dates, minor axis = price/volume fields
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk))
                       for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))

In [ ]:
pdata

In [ ]:
pdata = pdata.swapaxes('items', 'minor')    # make the fields the items
pdata['Adj Close']

In [ ]:
pdata.ix[:, '6/1/2012', :]    # all items for a single date

In [ ]:
pdata.ix['Adj Close', '5/22/2012':, :]    # one item from a given date onward

In [ ]:
# to_frame stacks the panel into a DataFrame with a (date, ticker) MultiIndex
stacked = pdata.ix[:, '5/30/2012':, :].to_frame()
stacked

In [ ]:
stacked.to_panel()    # the inverse of to_frame