Getting started with pandas


In [1]:
from pandas import Series, DataFrame
import pandas as pd

In [2]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas as pd
np.set_printoptions(precision=4)

In [3]:
%pwd


Out[3]:
u'/Users/pmui/datascience/lecture03.more.wrangling'

Introduction to pandas data structures

Series


In [4]:
obj = Series([4, 7, -5, 3])
obj


Out[4]:
0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.name

In [6]:
obj.values


Out[6]:
array([ 4,  7, -5,  3])

In [7]:
obj.index


Out[7]:
RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2


Out[8]:
d    4
b    7
a   -5
c    3
dtype: int64

In [9]:
obj2.index


Out[9]:
Index([u'd', u'b', u'a', u'c'], dtype='object')

In [10]:
obj2.values


Out[10]:
array([ 4,  7, -5,  3])

In [11]:
obj2['a']


Out[11]:
-5

In [12]:
obj2['d'] = 6

In [13]:
obj2[['c', 'a', 'd']]


Out[13]:
c    3
a   -5
d    6
dtype: int64

In [14]:
obj2[obj2 > 0]


Out[14]:
d    6
b    7
c    3
dtype: int64

In [15]:
obj2 * 2


Out[15]:
d    12
b    14
a   -10
c     6
dtype: int64

In [16]:
np.exp(obj2)


Out[16]:
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [17]:
'b' in obj2


Out[17]:
True

In [18]:
'e' in obj2


Out[18]:
False

In [19]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3


Out[19]:
Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [20]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4


Out[20]:
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [21]:
pd.isnull(obj4)


Out[21]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [22]:
pd.notnull(obj4)


Out[22]:
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [13]:
obj4.isnull()


Out[13]:
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [14]:
obj3


Out[14]:
Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [15]:
obj4


Out[15]:
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [16]:
obj3 + obj4


Out[16]:
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [17]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4


Out[17]:
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [23]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj


Out[23]:
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

DataFrame


In [24]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

In [25]:
frame


Out[25]:
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002

In [26]:
DataFrame(data, columns=['year', 'state', 'pop'])


Out[26]:
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9

In [27]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
frame2


Out[27]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN

In [28]:
frame2.columns


Out[28]:
Index([u'year', u'state', u'pop', u'debt'], dtype='object')

A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:


In [29]:
frame2['state']


Out[29]:
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [30]:
frame2.year


Out[30]:
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

Rows can also be retrieved by position or name using a couple of methods, such as the ix indexing field:


In [ ]:
frame2.ix['three']
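
In later pandas versions the ix field is deprecated in favor of loc (label-based) and iloc (position-based); a quick sketch of the equivalents:


In [ ]:
frame2.loc['three']    # the row by label
frame2.iloc[2]         # the same row by position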

Columns can be modified by assignment. For example, the empty 'debt' column could be assigned a scalar value or an array of values:


In [32]:
frame2['debt'] = 16.5
frame2


Out[32]:
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5

In [33]:
frame2['debt'] = np.arange(5.)
frame2


Out[33]:
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0

When assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. If you assign a Series, it will be instead conformed exactly to the DataFrame’s index, inserting missing values in any holes:


In [34]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2


Out[34]:
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7

Assigning a column that doesn’t exist will create a new column.


In [35]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2


Out[35]:
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False

The del keyword will delete columns as with a dict:


In [36]:
del frame2['eastern']
frame2.columns


Out[36]:
Index([u'year', u'state', u'pop', u'debt'], dtype='object')

A nested dict of dicts passed to DataFrame will be interpreted with the outer dict keys as the columns and the inner keys as the row indices:


In [40]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [41]:
frame3 = DataFrame(pop)
frame3


Out[41]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

In [42]:
frame3.T


Out[42]:
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6

The keys in the inner dicts are unioned and sorted to form the index in the result. This isn’t true if an explicit index is specified:


In [43]:
DataFrame(pop, index=[2001, 2002, 2003])


Out[43]:
Nevada Ohio
2001 2.4 1.7
2002 2.9 3.6
2003 NaN NaN

Dicts of Series are treated much in the same way:


In [44]:
frame3['Ohio'][:-1]


Out[44]:
2000    1.5
2001    1.7
Name: Ohio, dtype: float64

In [45]:
frame3['Nevada'][:2]


Out[45]:
2000    NaN
2001    2.4
Name: Nevada, dtype: float64

In [46]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
DataFrame(pdata)


Out[46]:
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7

If a DataFrame’s index and columns have their name attributes set, these will also be displayed:


In [47]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3


Out[47]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

Like Series, the values attribute returns the data contained in the DataFrame as a 2D ndarray:


In [48]:
frame3.values


Out[48]:
array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

If the DataFrame’s columns are different dtypes, the dtype of the values array will be chosen to accommodate all of the columns:


In [49]:
frame2.values


Out[49]:
array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

Index objects

pandas’s Index objects are responsible for holding the axis labels and other metadata (like the axis name or names). Any array or other sequence of labels used when constructing a Series or DataFrame is internally converted to an Index:


In [50]:
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index


Out[50]:
Index([u'a', u'b', u'c'], dtype='object')

In [51]:
index[1:]


Out[51]:
Index([u'b', u'c'], dtype='object')

Index objects are immutable and thus can’t be modified by the user:


In [ ]:
# attempting to modify an Index raises a TypeError,
# since Index objects are immutable

index[1] = 'd'

Immutability is important so that Index objects can be safely shared among data structures:


In [ ]:
index = pd.Index(np.arange(3))
obj2 = Series([1.5, -2.5, 0], index=index)
obj2.index is index

In addition to being array-like, an Index also functions as a fixed-size set:


In [52]:
frame3


Out[52]:
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6

In [53]:
'Ohio' in frame3.columns


Out[53]:
True

In [54]:
2003 in frame3.index


Out[54]:
False
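
Beyond membership tests, an Index also supports set operations such as union, intersection, and difference. A quick sketch:


In [ ]:
idx = pd.Index(['a', 'b', 'c'])
idx.union(['b', 'd'])    # set union; intersection and difference work similarly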

Essential functionality

Reindexing

A critical method on pandas objects is reindex, which means to create a new object with the data conformed to a new index. Consider a simple example from above:


In [55]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj


Out[55]:
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

Calling reindex on this Series rearranges the data according to the new index, introducing missing values if any index values were not already present:


In [56]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2


Out[56]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

For ordered data like time series, it may be desirable to do some interpolation or filling in of values when reindexing. The fill_value option substitutes a constant for any labels that were missing, while the method option allows interpolation with a method such as ffill, which forward fills the values:


In [57]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)


Out[57]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [58]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')


Out[58]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [59]:
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame


Out[59]:
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8

In [60]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2


Out[60]:
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0

The columns can be reindexed using the columns keyword:


In [61]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)


Out[61]:
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8

Both can be reindexed in one shot, though interpolation will only apply row-wise (axis 0):


In [62]:
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',
              columns=states)


Out[62]:
Texas Utah California
a 1 NaN 2
b 1 NaN 2
c 4 NaN 5
d 7 NaN 8

As you’ll see soon, reindexing can be done more succinctly by label-indexing with ix:


In [ ]:
frame.ix[['a', 'b', 'c', 'd'], states]
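
In later pandas, where ix is deprecated and .loc raises a KeyError on labels absent from the index, a plain reindex remains the durable spelling; a sketch of the equivalent:


In [ ]:
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)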

Dropping entries from an axis

Dropping one or more entries from an axis is easy if you have an index array or list without those entries. As that can require a bit of munging and set logic, the drop method will return a new object with the indicated value or values deleted from an axis:


In [63]:
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
new_obj


Out[63]:
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [64]:
obj.drop(['d', 'c'])


Out[64]:
a    0.0
b    1.0
e    4.0
dtype: float64

With DataFrame, index values can be deleted from either axis:


In [65]:
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

In [66]:
data.drop(['Colorado', 'Ohio'])


Out[66]:
one two three four
Utah 8 9 10 11
New York 12 13 14 15

In [69]:
data.drop('Utah', axis=0)


Out[69]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 12 13 14 15

In [67]:
data.drop('two', axis=1)


Out[67]:
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
New York 12 14 15

In [68]:
data.drop(['two', 'four'], axis=1)


Out[68]:
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14

Indexing, selection, and filtering

Series indexing (obj[...]) works analogously to NumPy array indexing, except you can use the Series’s index values instead of only integers. Here are some examples of this:


In [72]:
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj


Out[72]:
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [73]:
obj['b']


Out[73]:
1.0

In [71]:
obj[1]


Out[71]:
1.0

In [74]:
obj[2:4]


Out[74]:
c    2.0
d    3.0
dtype: float64

In [75]:
obj[['b', 'a', 'd']]


Out[75]:
b    1.0
a    0.0
d    3.0
dtype: float64

In [76]:
obj[[1, 3]]


Out[76]:
b    1.0
d    3.0
dtype: float64

In [77]:
obj[obj < 2]


Out[77]:
a    0.0
b    1.0
dtype: float64

Slicing with labels behaves differently than normal Python slicing in that the endpoint is inclusive:


In [78]:
obj['b':'c']


Out[78]:
b    1.0
c    2.0
dtype: float64

Setting using these methods works just as you would expect:


In [79]:
obj['b':'c'] = 5
obj


Out[79]:
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

As you’ve seen above, indexing into a DataFrame retrieves one or more columns with either a single value or a sequence:


In [80]:
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data


Out[80]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [81]:
data['two']


Out[81]:
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [82]:
data[['three', 'one']]


Out[82]:
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12

Indexing like this has a few special cases. First, selecting rows by slicing or with a boolean array:


In [83]:
data[:2]


Out[83]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7

In [84]:
data[data['three'] > 5]


Out[84]:
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [85]:
data < 5


Out[85]:
one two three four
Ohio True True True True
Colorado True False False False
Utah False False False False
New York False False False False

In [86]:
data[data < 5] = 0

In [87]:
data


Out[87]:
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

The ix indexing field enables you to select a subset of the rows and columns from a DataFrame with NumPy-like notation plus axis labels. This is also a less verbose way to do reindexing:


In [88]:
data.ix['Colorado', ['two', 'three']]


Out[88]:
two      5
three    6
Name: Colorado, dtype: int64

In [89]:
data.ix[['Colorado', 'Utah'], [3, 0, 1]]


Out[89]:
four one two
Colorado 7 0 5
Utah 11 8 9

In [90]:
data.ix[2]


Out[90]:
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [91]:
data.ix[:'Utah', 'two']


Out[91]:
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [92]:
data.ix[data.three > 5, :3]


Out[92]:
one two three
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14

Arithmetic and data alignment

One of the most important pandas features is the behavior of arithmetic between objects with different indexes. When adding together objects, if any index pairs are not the same, the respective index in the result will be the union of the index pairs. Let’s look at a simple example:


In [93]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [94]:
s1


Out[94]:
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [95]:
s2


Out[95]:
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [96]:
s1 + s2


Out[96]:
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

The internal data alignment introduces NA values in the indices that don’t overlap. Missing values propagate in arithmetic computations.
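
For example, the NA values introduced by the alignment can be located with isnull:


In [ ]:
(s1 + s2).isnull()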

In the case of DataFrame, alignment is performed on both the rows and the columns:


In [97]:
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1


Out[97]:
b c d
Ohio 0.0 1.0 2.0
Texas 3.0 4.0 5.0
Colorado 6.0 7.0 8.0

In [98]:
df2


Out[98]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0

Adding these together returns a DataFrame whose index and columns are the unions of the ones in each DataFrame:


In [99]:
df1 + df2


Out[99]:
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN

Arithmetic methods with fill values

In arithmetic operations between differently-indexed objects, you might want to fill with a special value, like 0, when an axis label is found in one object but not the other:


In [100]:
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df1


Out[100]:
a b c d
0 0.0 1.0 2.0 3.0
1 4.0 5.0 6.0 7.0
2 8.0 9.0 10.0 11.0

In [101]:
df2


Out[101]:
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 6.0 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0

In [102]:
df1 + df2


Out[102]:
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 11.0 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN

In [103]:
df1.add(df2, fill_value=0)


Out[103]:
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 11.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0

Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill value:


In [104]:
df1.reindex(columns=df2.columns, fill_value=0)


Out[104]:
a b c d e
0 0.0 1.0 2.0 3.0 0
1 4.0 5.0 6.0 7.0 0
2 8.0 9.0 10.0 11.0 0

Operations between DataFrame and Series

As with NumPy arrays, arithmetic between DataFrame and Series is well-defined. First, as a motivating example, consider the difference between a 2D array and one of its rows:


In [105]:
arr = np.arange(12.).reshape((3, 4))
arr


Out[105]:
array([[  0.,   1.,   2.,   3.],
       [  4.,   5.,   6.,   7.],
       [  8.,   9.,  10.,  11.]])

In [106]:
arr[0]


Out[106]:
array([ 0.,  1.,  2.,  3.])

In [107]:
arr - arr[0]


Out[107]:
array([[ 0.,  0.,  0.,  0.],
       [ 4.,  4.,  4.,  4.],
       [ 8.,  8.,  8.,  8.]])

This is referred to as broadcasting. Operations between a DataFrame and a Series are similar:


In [108]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
frame


Out[108]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0

In [109]:
series


Out[109]:
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns, broadcasting down the rows:


In [110]:
frame - series


Out[110]:
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0

If an index value is not found in either the DataFrame’s columns or the Series’s index, the objects will be reindexed to form the union:


In [115]:
series2 = Series(range(3), index=['b', 'e', 'f'])
series2


Out[115]:
b    0
e    1
f    2
dtype: int64

In [116]:
frame + series2


Out[116]:
b d e f
Utah 0.0 NaN 3.0 NaN
Ohio 3.0 NaN 6.0 NaN
Texas 6.0 NaN 9.0 NaN
Oregon 9.0 NaN 12.0 NaN

If you want to instead broadcast over the columns, matching on the rows, you have to use one of the arithmetic methods. For example:


In [112]:
series3 = frame['d']
frame


Out[112]:
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0

In [113]:
series3


Out[113]:
Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [114]:
frame.sub(series3, axis=0)


Out[114]:
b d e
Utah -1.0 0.0 1.0
Ohio -1.0 0.0 1.0
Texas -1.0 0.0 1.0
Oregon -1.0 0.0 1.0

Function application and mapping

NumPy ufuncs (element-wise array methods) work fine with pandas objects:


In [117]:
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [118]:
frame


Out[118]:
b d e
Utah -0.204708 0.478943 -0.519439
Ohio -0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 -1.296221

In [119]:
np.abs(frame)


Out[119]:
b d e
Utah 0.204708 0.478943 0.519439
Ohio 0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 1.296221

In [120]:
f = lambda x: x.max() - x.min()

In [121]:
frame.apply(f)


Out[121]:
b    1.802165
d    1.684034
e    2.689627
dtype: float64

In [122]:
frame.apply(f, axis=1)


Out[122]:
Utah      0.998382
Ohio      2.521511
Texas     0.676115
Oregon    2.542656
dtype: float64

Many of the most common array statistics (like sum and mean) are DataFrame methods, so using apply is not necessary.
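
For instance, the column ranges computed above with apply can be written directly with the built-in reductions; a quick sketch:


In [ ]:
frame.max() - frame.min()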

The function passed to apply need not return a scalar value; it can also return a Series with multiple values:


In [123]:
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)


Out[123]:
b d e
min -0.555730 0.281746 -1.296221
max 1.246435 1.965781 1.393406

Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating point value in frame. You can do this with applymap:


In [124]:
format = lambda x: '%.2f' % x
frame.applymap(format)


Out[124]:
b d e
Utah -0.20 0.48 -0.52
Ohio -0.56 1.97 1.39
Texas 0.09 0.28 0.77
Oregon 1.25 1.01 -1.30

The reason for the name applymap is that Series has a map method for applying an element-wise function:


In [125]:
frame['e'].map(format)


Out[125]:
Utah      -0.52
Ohio       1.39
Texas      0.77
Oregon    -1.30
Name: e, dtype: object

Sorting and ranking

Sorting a data set by some criterion is another important built-in operation. To sort lexicographically by row or column index, use the sort_index method, which returns a new, sorted object:


In [126]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()


Out[126]:
a    1
b    2
c    3
d    0
dtype: int64

With a DataFrame, you can sort by index on either axis:


In [127]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()


Out[127]:
d a b c
one 4 5 6 7
three 0 1 2 3

In [128]:
frame.sort_index(axis=1)


Out[128]:
a b c d
three 1 2 3 0
one 5 6 7 4

The data is sorted in ascending order by default, but can be sorted in descending order, too:


In [129]:
frame.sort_index(axis=1, ascending=False)


Out[129]:
d c b a
three 0 3 2 1
one 4 7 6 5

To sort a Series by its values, use its order method (deprecated in later pandas in favor of sort_values, as the warning below notes):


In [130]:
obj = Series([4, 7, -3, 2])
obj.order()


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:2: FutureWarning: order is deprecated, use sort_values(...)
  from ipykernel import kernelapp as app
Out[130]:
2   -3
3    2
0    4
1    7
dtype: int64

In [131]:
obj.sort_values()


Out[131]:
2   -3
3    2
0    4
1    7
dtype: int64

Any missing values are sorted to the end of the Series by default:


In [133]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()


Out[133]:
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

On DataFrame, you may want to sort by the values in one or more columns. To do so, pass one or more column names to the by option:


In [134]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame


Out[134]:
a b
0 0 4
1 1 7
2 0 -3
3 1 2

In [135]:
frame.sort_index(by='b')


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
  if __name__ == '__main__':
Out[135]:
a b
2 0 -3
3 1 2
0 0 4
1 1 7

In [137]:
frame.sort_values(by='b')


Out[137]:
a b
2 0 -3
3 1 2
0 0 4
1 1 7

In [138]:
frame.sort_index(by=['a', 'b'])


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
  if __name__ == '__main__':
Out[138]:
a b
2 0 -3
0 0 4
3 1 2
1 1 7

In [139]:
frame.sort_values(by=['a', 'b'])


Out[139]:
a b
2 0 -3
0 0 4
3 1 2
1 1 7

Ranking is closely related to sorting, assigning ranks from one through the number of valid data points in an array. It is similar to the indirect sort indices produced by numpy.argsort, except that ties are broken according to a rule. The rank methods for Series and DataFrame are the place to look; by default rank breaks ties by assigning each group the mean rank:


In [141]:
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj


Out[141]:
0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [143]:
obj.rank?

In [142]:
obj.rank()


Out[142]:
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

Ranks can also be assigned according to the order they’re observed in the data:


In [144]:
obj.rank(method='first')


Out[144]:
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

Naturally, you can rank in descending order, too:


In [147]:
obj.rank(ascending=False, method='max')


Out[147]:
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

The method option is used for breaking ties:

  • 'average' Default: assign the average rank to each entry in the equal group.
  • 'min' Use the minimum rank for the whole group.
  • 'max' Use the maximum rank for the whole group.
  • 'first' Assign ranks in the order the values appear in the data (the sketch below compares the rules).
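
A quick sketch comparing the four rules side by side on the obj Series from above:


In [ ]:
DataFrame({how: obj.rank(method=how)
           for how in ['average', 'min', 'max', 'first']})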

In [ ]:
frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
frame

In [ ]:
frame.rank(axis=1)

Axis indexes with duplicate values

While many pandas functions (like reindex) require that the labels be unique, it’s not mandatory. Let’s consider a small Series with duplicate indices:


In [148]:
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj


Out[148]:
a    0
a    1
b    2
b    3
c    4
dtype: int64

In [149]:
obj.index.is_unique


Out[149]:
False

Data selection is one of the main things that behaves differently with duplicates. Indexing a value with multiple entries returns a Series while single entries return a scalar value:


In [150]:
obj['a']


Out[150]:
a    0
a    1
dtype: int64

In [151]:
obj['c']


Out[151]:
4

The same logic extends to indexing rows in a DataFrame:


In [152]:
df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df


Out[152]:
0 1 2
a 0.274992 0.228913 1.352917
a 0.886429 -2.001637 -0.371843
b 1.669025 -0.438570 -0.539741
b 0.476985 3.248944 -1.021228

In [153]:
df.ix['b']


Out[153]:
0 1 2
b 1.669025 -0.438570 -0.539741
b 0.476985 3.248944 -1.021228

Summarizing and computing descriptive statistics

pandas objects are equipped with a set of common mathematical and statistical methods. Most of these fall into the category of reductions or summary statistics, methods that extract a single value (like the sum or mean) from a Series or a Series of values from the rows or columns of a DataFrame. Compared with the equivalent methods of vanilla NumPy arrays, they are all built from the ground up to exclude missing data. Consider a small DataFrame:


In [154]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df


Out[154]:
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3

Calling DataFrame’s sum method returns a Series containing column sums:


In [155]:
df.sum()


Out[155]:
one    9.25
two   -5.80
dtype: float64

Passing axis=1 sums over the rows instead:


In [156]:
df.sum(axis=1)


Out[156]:
a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

NA values are excluded unless the entire slice (row or column in this case) is NA. This can be disabled using the skipna option:


In [157]:
df.mean(axis=1, skipna=False)


Out[157]:
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

Common options for each reduction method are:

  • axis: Axis to reduce over. 0 for DataFrame’s rows and 1 for columns.
  • skipna: Exclude missing values, True by default.
  • level: Reduce grouped by level if the axis is hierarchically indexed (MultiIndex); see the sketch below.
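
A minimal sketch of the level option, using a hypothetical two-level Series (hierarchical indexing is covered in depth below):


In [ ]:
s = Series([1., 2., 3., 4.],
           index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
s.sum(level=0)    # reduce within each outer-level group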

Some methods, like idxmin and idxmax, return indirect statistics like the index value where the minimum or maximum values are attained:


In [158]:
df.idxmax()


Out[158]:
one    b
two    d
dtype: object

Other methods are accumulations:


In [159]:
df.cumsum()


Out[159]:
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8

Another type of method is neither a reduction nor an accumulation. describe is one such example, producing multiple summary statistics in one shot:


In [160]:
df.describe()


Out[160]:
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000

On non-numeric data, describe produces alternate summary statistics:


In [161]:
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()


Out[161]:
count     16
unique     3
top        a
freq       8
dtype: object

Correlation and covariance in finance

Some summary statistics, like correlation and covariance, are computed from pairs of arguments. Let’s consider some DataFrames of stock prices and volumes obtained from Yahoo! Finance:


In [162]:
import pandas.io.data as web

all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker)

price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.iteritems()})


/Users/pmui/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/pandas/io/data.py:35: FutureWarning: 
The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.
  FutureWarning)

Now, compute percent changes of the prices:


In [201]:
price.head()


Out[201]:
AAPL GOOG IBM MSFT
Date
2010-01-04 28.141855 313.062468 114.283108 26.045432
2010-01-05 28.190509 311.683844 112.902572 26.053846
2010-01-06 27.742101 303.826685 112.169153 25.893956
2010-01-07 27.690818 296.753749 111.780878 25.624666
2010-01-08 27.874915 300.709808 112.902572 25.801387

In [163]:
returns = price.pct_change()
returns.tail()


Out[163]:
AAPL GOOG IBM MSFT
Date
2016-07-25 -0.013379 -0.003999 0.003579 0.002828
2016-07-26 -0.006883 -0.001825 -0.003259 0.000529
2016-07-27 0.064963 0.004537 -0.001789 -0.010042
2016-07-28 0.013502 0.005581 -0.002843 0.000356
2016-07-29 -0.001246 0.030674 -0.004648 0.008362

The corr method of Series computes the correlation of the overlapping, non-NA, aligned-by-index values in two Series. Relatedly, cov computes the covariance:


In [166]:
returns.MSFT.corr(returns.IBM)


Out[166]:
0.50196224207624862

In [167]:
returns.MSFT.cov(returns.IBM)


Out[167]:
9.0865799179417242e-05

DataFrame’s corr and cov methods, on the other hand, return a full correlation or covariance matrix as a DataFrame, respectively:


In [168]:
returns.corr()


Out[168]:
AAPL GOOG IBM MSFT
AAPL 1.000000 0.407796 0.388075 0.391186
GOOG 0.407796 1.000000 0.405451 0.465208
IBM 0.388075 0.405451 1.000000 0.501962
MSFT 0.391186 0.465208 0.501962 1.000000

In [169]:
returns.cov()


Out[169]:
AAPL GOOG IBM MSFT
AAPL 0.000282 0.000110 0.000080 0.000097
GOOG 0.000110 0.000258 0.000079 0.000111
IBM 0.000080 0.000079 0.000149 0.000091
MSFT 0.000097 0.000111 0.000091 0.000220

Using DataFrame’s corrwith method, you can compute pairwise correlations between a DataFrame’s columns or rows with another Series or DataFrame. Passing a Series returns a Series with the correlation value computed for each column:


In [170]:
returns.corrwith(returns.IBM)


Out[170]:
AAPL    0.388075
GOOG    0.405451
IBM     1.000000
MSFT    0.501962
dtype: float64

Passing a DataFrame computes the correlations of matching column names. Here I compute correlations of percent changes with volume:


In [171]:
returns.corrwith(volume)


Out[171]:
AAPL   -0.077827
GOOG   -0.007241
IBM    -0.207137
MSFT   -0.098255
dtype: float64

Passing axis=1 does things row-wise instead. In all cases, the data points are aligned by label before computing the correlation.
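
A sketch of the row-wise version (output omitted, since the downloaded data changes over time):


In [ ]:
returns.corrwith(volume, axis=1)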

Unique values, value counts, and membership

Another class of related methods extracts information about the values contained in a one-dimensional Series.

  • isin: Compute boolean array indicating whether each Series value is contained in the passed sequence of values.
  • unique: Compute array of unique values in a Series, returned in the order observed.
  • value_counts: Return a Series containing unique values as its index and frequencies as its values, ordered by count in descending order.

To illustrate these, consider this example:


In [172]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

The first function is unique, which gives you an array of the unique values in a Series:


In [173]:
uniques = obj.unique()
uniques


Out[173]:
array(['c', 'a', 'd', 'b'], dtype=object)

The unique values are not necessarily returned in sorted order, but could be sorted after the fact if needed (uniques.sort()).
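
For instance, a quick sketch of sorting in place (ndarray.sort modifies the array):


In [ ]:
uniques.sort()
uniques

Relatedly, value_counts computes a Series containing value frequencies: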


In [174]:
obj.value_counts()


Out[174]:
c    3
a    3
b    2
d    1
dtype: int64

The Series is sorted by value in descending order as a convenience. value_counts is also available as a top-level pandas method that can be used with any array or sequence:


In [175]:
pd.value_counts(obj.values, sort=False)


Out[175]:
a    3
c    3
b    2
d    1
dtype: int64

Lastly, isin is responsible for vectorized set membership and can be very useful in filtering a data set down to a subset of values in a Series or column in a DataFrame:


In [176]:
mask = obj.isin(['b', 'c'])
mask


Out[176]:
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [177]:
obj[mask]


Out[177]:
0    c
5    b
6    b
7    c
8    c
dtype: object

In some cases, you may want to compute a histogram on multiple related columns in a DataFrame. Here’s an example:


In [178]:
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
data


Out[178]:
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4

Passing pandas.value_counts to this DataFrame’s apply function gives:


In [179]:
result = data.apply(pd.value_counts).fillna(0)
result


Out[179]:
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 0.0 2.0 1.0
3 2.0 2.0 0.0
4 2.0 0.0 2.0
5 0.0 0.0 1.0

Handling missing data

Missing data is common in most data analysis applications. One of the goals in designing pandas was to make working with missing data as painless as possible. For example, all of the descriptive statistics on pandas objects exclude missing data as you’ve seen earlier.

pandas uses the floating point value NaN (Not a Number) to represent missing data in both floating point and non-floating point arrays. It is just used as a sentinel that can be easily detected:


In [260]:
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data


Out[260]:
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [261]:
string_data.isnull()


Out[261]:
0    False
1    False
2     True
3    False
dtype: bool

The built-in Python None value is also treated as NA in object arrays:


In [182]:
string_data[0] = None
string_data.isnull()


Out[182]:
0     True
1    False
2     True
3    False
dtype: bool

Filtering out missing data

Common NA handling methods are:

  • dropna: Filter axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate.
  • fillna: Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'.
  • isnull: Return like-type object containing boolean values indicating which values are missing / NA.
  • notnull: Negation of isnull.

You have a number of options for filtering out missing data. While doing it by hand is always an option, dropna can be very helpful. On a Series, it returns the Series with only the non-null data and index values:


In [262]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()


Out[262]:
0    1.0
2    3.5
4    7.0
dtype: float64

Naturally, you could have computed this yourself by boolean indexing:


In [185]:
data[data.notnull()]


Out[185]:
0    1.0
2    3.5
4    7.0
dtype: float64

With DataFrame objects, these are a bit more complex. You may want to drop rows or columns which are all NA or just those containing any NAs. dropna by default drops any row containing a missing value:


In [186]:
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data


Out[186]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0

In [187]:
cleaned


Out[187]:
0 1 2
0 1.0 6.5 3.0

Passing how='all' will only drop rows that are all NA:


In [188]:
data.dropna(how='all')


Out[188]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0

Dropping columns in the same way is only a matter of passing axis=1:


In [189]:
data[4] = NA
data


Out[189]:
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3.0 NaN

In [190]:
data.dropna(axis=1, how='all')


Out[190]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0

A related way to filter out DataFrame rows tends to concern time series data. Suppose you want to keep only rows containing a certain number of observations. You can indicate this with the thresh argument:


In [263]:
df = DataFrame(np.random.randn(7, 3))
df.ix[:4, 1] = NA; df.ix[:2, 2] = NA
df


Out[263]:
0 1 2
0 -0.204708 NaN NaN
1 -0.555730 NaN NaN
2 0.092908 NaN NaN
3 1.246435 NaN -1.296221
4 0.274992 NaN 1.352917
5 0.886429 -2.001637 -0.371843
6 1.669025 -0.438570 -0.539741

In [264]:
df.dropna(thresh=3)


Out[264]:
0 1 2
5 0.886429 -2.001637 -0.371843
6 1.669025 -0.438570 -0.539741

Filling in missing data

Rather than filtering out missing data (and potentially discarding other data along with it), you may want to fill in the “holes” in any number of ways. For most purposes, the fillna method is the workhorse function to use. Calling fillna with a constant replaces missing values with that value:


In [265]:
df.fillna(0)


Out[265]:
0 1 2
0 -0.204708 0.000000 0.000000
1 -0.555730 0.000000 0.000000
2 0.092908 0.000000 0.000000
3 1.246435 0.000000 -1.296221
4 0.274992 0.000000 1.352917
5 0.886429 -2.001637 -0.371843
6 1.669025 -0.438570 -0.539741

Calling fillna with a dict, you can use a different fill value for each column:


In [195]:
df.fillna({1: 0.5, 2: -1})


Out[195]:
0 1 2
0 -0.577087 0.500000 -1.000000
1 0.523772 0.500000 -1.000000
2 -0.713544 0.500000 -1.000000
3 -1.860761 0.500000 0.560145
4 -1.265934 0.500000 -1.063512
5 0.332883 -2.359419 -0.199543
6 -1.541996 -0.970736 -1.307030

fillna returns a new object, but you can modify the existing object in place:


In [196]:
# always returns a reference to the filled object
_ = df.fillna(0, inplace=True)
df


Out[196]:
0 1 2
0 -0.577087 0.000000 0.000000
1 0.523772 0.000000 0.000000
2 -0.713544 0.000000 0.000000
3 -1.860761 0.000000 0.560145
4 -1.265934 0.000000 -1.063512
5 0.332883 -2.359419 -0.199543
6 -1.541996 -0.970736 -1.307030

The same interpolation methods available for reindexing can be used with fillna:


In [269]:
df = DataFrame(np.random.randn(6, 3))
df.ix[2:, 1] = NA; df.ix[4:, 2] = NA
df


Out[269]:
0 1 2
0 -0.831154 -2.370232 -1.860761
1 -0.860757 0.560145 -1.265934
2 0.119827 NaN 0.332883
3 -2.359419 NaN -1.541996
4 -0.970736 NaN NaN
5 0.377984 NaN NaN

In [270]:
df.fillna(method='ffill')


Out[270]:
0 1 2
0 -0.831154 -2.370232 -1.860761
1 -0.860757 0.560145 -1.265934
2 0.119827 0.560145 0.332883
3 -2.359419 0.560145 -1.541996
4 -0.970736 0.560145 -1.541996
5 0.377984 0.560145 -1.541996

In [271]:
df.fillna(method='ffill', limit=2)


Out[271]:
0 1 2
0 -0.831154 -2.370232 -1.860761
1 -0.860757 0.560145 -1.265934
2 0.119827 0.560145 0.332883
3 -2.359419 0.560145 -1.541996
4 -0.970736 NaN -1.541996
5 0.377984 NaN -1.541996

With fillna you can do lots of other things with a little creativity. For example, you might pass the mean or median value of a Series:


In [199]:
data = Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())


Out[199]:
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

Hierarchical indexing

Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form. Let’s start with a simple example; create a Series with a list of lists or arrays as the index:


In [266]:
data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data


Out[266]:
a  1    0.476985
   2    3.248944
   3   -1.021228
b  1   -0.577087
   2    0.124121
   3    0.302614
c  1    0.523772
   2    0.000940
d  2    1.343810
   3   -0.713544
dtype: float64

In [267]:
data.index


Out[267]:
MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [ ]:
data['b']                  # partial indexing: everything under outer label 'b'

In [ ]:
data['b':'c']              # slicing on the outer level

In [ ]:
data.ix[['b', 'd']]        # selecting a list of outer labels

In [ ]:
data[:, 2]                 # selecting on the inner level

In [ ]:
data.unstack()             # pivot the inner level into the columns

In [ ]:
data.unstack().stack()     # stack is the inverse of unstack

In [ ]:
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame

In [ ]:
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

In [ ]:
frame['Ohio']    # the column labels form a MultiIndex as well

A MultiIndex can also be created by itself and then reused:


In [ ]:
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']],
                          names=['state', 'color'])

Reordering and sorting levels

At times you may need to rearrange the order of the levels on an axis or to sort the data by the values in one specific level. swaplevel takes two level numbers or names and returns a new object with the levels interchanged, while sortlevel sorts the data using only the values in a single level:


In [ ]:
frame.swaplevel('key1', 'key2')    # interchange the two row index levels

In [ ]:
frame.sortlevel(1)    # sort the rows by the values in level 1 ('key2')

In [ ]:
frame.swaplevel(0, 1).sortlevel(0)    # swap, then sort by the new outer level

Summary statistics by level

Many descriptive and summary statistics on DataFrame and Series have a level option, with which you can specify the level to aggregate by on a particular axis:


In [ ]:
frame.sum(level='key2')    # sum the rows within each key2 group

In [ ]:
frame.sum(level='color', axis=1)    # sum the columns within each color group

Using a DataFrame's columns

It is common to want to use one or more columns from a DataFrame as the row index, or conversely to move the row index into the DataFrame's columns:


In [ ]:
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
frame

In [ ]:
frame2 = frame.set_index(['c', 'd'])    # move columns c and d into the row index
frame2

In [ ]:
frame.set_index(['c', 'd'], drop=False)    # keep the columns in the frame as well

In [ ]:
frame2.reset_index()    # move the index levels back out into columns

Other pandas topics

Integer indexing

Working with pandas objects indexed by integers often trips up new users, since the indexing semantics differ from built-in Python sequences like lists and tuples:


In [ ]:
ser = Series(np.arange(3.))
ser.iloc[-1]    # with an integer index, use iloc for positional selection

In [ ]:
ser

In [ ]:
ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]    # with a non-integer index, [-1] falls back to positional selection

In [ ]:
ser.ix[:1]    # label-based slicing on an integer index includes the endpoint

In [ ]:
ser3 = Series(range(3), index=[-5, 1, 3])
ser3.iloc[2]    # strictly positional, regardless of the index labels

In [ ]:
frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])
frame.iloc[0]    # the first row by position, not the row labeled 0

Panel data

pandas has a Panel data structure, a three-dimensional analogue of DataFrame (Panel was deprecated and later removed in newer pandas versions in favor of hierarchically-indexed DataFrames):


In [ ]:
import pandas.io.data as web

# items = tickers, major axis = dates, minor axis = price/volume fields
pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk))
                       for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))

In [ ]:
pdata

In [ ]:
pdata = pdata.swapaxes('items', 'minor')    # make the fields the items
pdata['Adj Close']

In [ ]:
pdata.ix[:, '6/1/2012', :]    # all items for a single date

In [ ]:
pdata.ix['Adj Close', '5/22/2012':, :]    # one item from a given date onward

In [ ]:
# to_frame stacks the panel into a DataFrame with a (date, ticker) MultiIndex
stacked = pdata.ix[:, '5/30/2012':, :].to_frame()
stacked

In [ ]:
stacked.to_panel()    # the inverse of to_frame