Pandas

Chapter 5 Intro to data Structure


In [1]:
#initialize
%matplotlib inline 
%qtconsole

In [ ]:


In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
from pandas import Timestamp
randn = np.random.randn
from pandas import *

5.1 Series


In [3]:
#s = Series(data, index=index)
s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e'])
s
s.index


Out[3]:
Index([u'a', u'b', u'c', u'd', u'e'], dtype=object)

In [4]:
#From dict
d = {'a' : 0.0, 'b' : 1.1, 'c' : 2.2 }
Series(d)
Series(d, index = ['a','b'])
Series(d, index = ['a','b','z'])
Series(5, index = ['a', 'b', 'c', 'd'])


Out[4]:
a    5
b    5
c    5
d    5
dtype: int64

5.1.1 Series is ndarray-like


In [64]:
s['a']
s[:3]
s[[4,3,1]]
s[s> s.median()]
s>s.median()


Out[64]:
a     True
b     True
c    False
d    False
e    False
dtype: bool

In [62]:
np.exp(s)


Out[62]:
a    2.760369
b    1.487827
c    0.183412
d    0.583944
e    0.603313
dtype: float64

5.1.2 Series is dict-like


In [66]:
s['a'] = 12
s


Out[66]:
a    12.000000
b     0.397317
c    -1.696023
d    -0.537950
e    -0.505318
dtype: float64

In [68]:
'ee' in s
s['ee']


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-68-45038a6cba50> in <module>()
      1 'ee' in s
----> 2 s['ee']

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    611     def __getitem__(self, key):
    612         try:
--> 613             return self.index.get_value(self, key)
    614         except InvalidIndexError:
    615             pass

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/index.pyc in get_value(self, series, key)
    775                     raise InvalidIndexError(key)
    776                 else:
--> 777                     raise e1
    778             except Exception:  # pragma: no cover
    779                 raise e1

KeyError: 'ee'

In [74]:
s.get('a')
s.get('f', np.Inf)


Out[74]:
inf

5.1.3 Vectorized operations and label alignment with Series


In [80]:
s + s
s*2
np.exp(s)
s[1:] + s[:-1]


Out[80]:
a         NaN
b    0.794634
c   -3.392045
d   -1.075901
e         NaN
dtype: float64

5.1.4 Name attribute


In [85]:
s = Series(np.random.randn(5), dtype = int32, name = 'whatever')
type(s)


Out[85]:
pandas.core.series.Series

5.2 DataFrame

  • DataFrame accepts many different kinds of inputs:

    • Dict of 1D ndarrays, lists, dists, or Series
    • 2-D numpy.ndarray
    • Structured or record ndarray
    • A Series
    • Another DataFrame

5.2.1 From dict of Series or dicts


In [61]:
d = {'one' : Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

In [62]:
df = DataFrame(d)
df


Out[62]:
one two
a 1 1
b 2 2
c 3 3
d NaN 4

In [24]:
d2 = DataFrame(d, index=['d', 'b', 'a'])
d2['one']['d']
#d2[1][2]??


Out[24]:
nan

In [27]:
df.index
df.columns


Out[27]:
Index([one, two], dtype=object)

5.2.2 From dict of ndarrays /lists


In [30]:
d = {'one' : [1., 2., 3., 4.],
     'two' : [4., 3., 2., 1.]}

In [32]:
DataFrame(d, index=['a', 'b', 'c', 'd'])


Out[32]:
one two
a 1 4
b 2 3
c 3 2
d 4 1

5.2.3 From structured or record array


In [14]:
#This case is handled identically to a dict of arrays
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'),('C','a10')])

In [15]:
data[:] = [(1,2.,'Hello'),(2,3.,"World")]

In [16]:
DataFrame(data)


Out[16]:
A B C
0 1 2 Hello
1 2 3 World

In [22]:
DataFrame(data, index=['first', 'second'])


Out[22]:
A B C
first 1 2 Hello
second 2 3 World

In [24]:
DataFrame(data,columns=['C','A','B'])


Out[24]:
C A B
0 Hello 1 2
1 World 2 3

5.2.4 From a list of dicts


In [25]:
data2 = [{'a':1, 'b':2}, {'a':5,'b':10, 'c':20}]

In [36]:
DataFrame(data2, index = ['first','second'])


Out[36]:
a b c
first 1 2 NaN
second 5 10 20

In [37]:
DataFrame(data2, columns=['a','b']) # A selection


Out[37]:
a b
0 1 2
1 5 10

5.2.5 From a Series

5.2.7 Column Selection, addition, deletion


In [42]:
df['one']


Out[42]:
a     1
b     2
c     3
d   NaN
Name: one, dtype: float64

In [43]:
df['three'] = df['one']*df['two']

In [49]:
boolindex = df['two']>1
boolindex


Out[49]:
a    False
b     True
c     True
d     True
Name: two, dtype: bool

In [50]:
df[boolindex]


Out[50]:
one two three
b 2 2 4
c 3 3 9
d NaN 4 NaN

In [52]:
df['b']


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-52-f2a266ed0ffd> in <module>()
----> 1 df['b']

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1926         else:
   1927             # get column
-> 1928             return self._get_item_cache(key)
   1929 
   1930     def _getitem_slice(self, key):

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    568             return cache[item]
    569         except Exception:
--> 570             values = self._data.get(item)
    571             res = self._box_item_values(item, values)
    572             cache[item] = res

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
   1381 
   1382     def get(self, item):
-> 1383         _, block = self._find_block(item)
   1384         return block.get(item)
   1385 

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in _find_block(self, item)
   1523 
   1524     def _find_block(self, item):
-> 1525         self._check_have(item)
   1526         for i, block in enumerate(self.blocks):
   1527             if item in block:

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in _check_have(self, item)
   1530     def _check_have(self, item):
   1531         if item not in self.items:
-> 1532             raise KeyError('no item named %s' % com.pprint_thing(item))
   1533 
   1534     def reindex_axis(self, new_axis, method=None, axis=0, copy=True):

KeyError: u'no item named b'

In [53]:
del df['two']

In [54]:
df


Out[54]:
one three
a 1 1
b 2 4
c 3 9
d NaN NaN

In [55]:
df['new'] = 1

In [56]:
df


Out[56]:
one three new
a 1 1 1
b 2 4 1
c 3 9 1
d NaN NaN 1

In [59]:
df.ix[3]


Out[59]:
one     NaN
three   NaN
new       1
Name: d, dtype: float64

5.2.8 Indexing/Selection

|Operation | Syntax | Result |Select column | df[col] | Series |Select row by lable | df.xs(lable) or df.ix[lable] | Series |Select row by location(int) | df.ix[loc] | Series |Slice rows | df[5:10] | DataFrame |Selection rows by boolean vector | df[bool_vec] | DataFrame

In [64]:
df[1]


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-64-dc0ea82142cb> in <module>()
----> 1 df[1]

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1926         else:
   1927             # get column
-> 1928             return self._get_item_cache(key)
   1929 
   1930     def _getitem_slice(self, key):

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    568             return cache[item]
    569         except Exception:
--> 570             values = self._data.get(item)
    571             res = self._box_item_values(item, values)
    572             cache[item] = res

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
   1381 
   1382     def get(self, item):
-> 1383         _, block = self._find_block(item)
   1384         return block.get(item)
   1385 

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in _find_block(self, item)
   1523 
   1524     def _find_block(self, item):
-> 1525         self._check_have(item)
   1526         for i, block in enumerate(self.blocks):
   1527             if item in block:

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in _check_have(self, item)
   1530     def _check_have(self, item):
   1531         if item not in self.items:
-> 1532             raise KeyError('no item named %s' % com.pprint_thing(item))
   1533 
   1534     def reindex_axis(self, new_axis, method=None, axis=0, copy=True):

KeyError: u'no item named 1'

5.2.9 DataTypes


In [75]:
df['interger'] = 1
df['int32'] = df['interger'].astype('int32')
df['float32'] = Series([1.0]*len(df),dtype='float32')
df['timestamp'] = Timestamp('20010102')

In [77]:
df.dtypes


Out[77]:
one                 float64
two                 float64
interger              int64
int32                 int32
float32             float32
timestamp    datetime64[ns]
dtype: object

5.2.11 Data alignment and arithmetic


In [3]:
index_df = date_range('1', periods=10)  #use data range as index, otherwise, Boom!!
df = DataFrame(randn(10, 4), index=index_df, columns=['A', 'B', 'C', 'D'])
df - df['A']
(df.T - df['A']).T


/Users/chenchen/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:3619: FutureWarning: TimeSeries broadcasting along DataFrame index by default is deprecated. Please use DataFrame.<op> to explicitly broadcast arithmetic operations along the index
  FutureWarning)
Out[3]:
A B C D
2013-11-01 0 0.070616 1.764970 0.483617
2013-11-02 0 -2.154749 -0.160766 1.531038
2013-11-03 0 -0.590357 0.202836 1.108110
2013-11-04 0 -0.219865 -0.262338 -1.444890
2013-11-05 0 0.258523 -1.696840 -1.297896
2013-11-06 0 -1.112081 0.808770 -1.233935
2013-11-07 0 1.497496 -0.272568 1.900655
2013-11-08 0 1.618075 0.784253 0.442696
2013-11-09 0 0.170048 1.366825 0.277063
2013-11-10 0 0.676875 0.121673 -0.862354

In [7]:
df - df['A']
df * 5 + 2
1/df
df ** 4


Out[7]:
A B C D
2013-11-01 8.056042 6.787938 0.000041 2.081309
2013-11-02 0.000116 17.694768 0.000011 7.142781
2013-11-03 0.000301 0.271847 0.000026 0.908869
2013-11-04 0.022124 0.000756 0.000231 1.258766
2013-11-05 0.314838 1.030715 0.806888 0.090728
2013-11-06 0.000267 0.938527 0.769471 1.496948
2013-11-07 0.281678 0.349674 1.004335 1.887639
2013-11-08 2.014804 0.033142 0.027480 0.314228
2013-11-09 0.019255 0.001680 0.977456 0.000083
2013-11-10 2.155483 0.081802 1.411590 18.503734

5.2.12 Transposing


In [8]:
df[:5]


Out[8]:
A B C D
2013-11-01 -1.684730 -1.614115 0.080239 -1.201114
2013-11-02 0.103770 -2.050979 -0.056996 1.634808
2013-11-03 -0.131716 -0.722073 0.071120 0.976394
2013-11-04 0.385670 0.165805 0.123332 -1.059220
2013-11-05 0.749069 1.007592 -0.947771 -0.548827

In [9]:
df[:5].T


Out[9]:
2013-11-01 00:00:00 2013-11-02 00:00:00 2013-11-03 00:00:00 2013-11-04 00:00:00 2013-11-05 00:00:00
A -1.684730 0.103770 -0.131716 0.385670 0.749069
B -1.614115 -2.050979 -0.722073 0.165805 1.007592
C 0.080239 -0.056996 0.071120 0.123332 -0.947771
D -1.201114 1.634808 0.976394 -1.059220 -0.548827

5.2.13DataFrame Interoperability with NumPy functions


In [10]:
np.exp(df)


Out[10]:
A B C D
2013-11-01 0.185494 0.199067 1.083546 0.300859
2013-11-02 1.109346 0.128609 0.944598 5.128475
2013-11-03 0.876590 0.485744 1.073710 2.654867
2013-11-04 1.470599 1.180343 1.131260 0.346726
2013-11-05 2.115030 2.738997 0.387604 0.577627
2013-11-06 1.136345 0.373714 2.551258 0.330841
2013-11-07 0.482625 2.157568 0.367482 3.228897
2013-11-08 0.303795 1.532153 0.665545 0.472978
2013-11-09 0.689003 0.816718 2.702874 0.908967
2013-11-10 0.297698 0.585786 0.336216 0.125678

In [11]:
np.asarray(df)


Out[11]:
array([[-1.68473046, -1.61411495,  0.08023923, -1.20111378],
       [ 0.10377037, -2.05097901, -0.05699605,  1.63480831],
       [-0.13171584, -0.72207297,  0.07112033,  0.97639441],
       [ 0.38567009,  0.16580471,  0.12333169, -1.05922022],
       [ 0.74906882,  1.00759175, -0.94777076, -0.54882724],
       [ 0.12781657, -0.98426416,  0.93658673, -1.10611851],
       [-0.72851447,  0.76898165, -1.0010821 ,  1.1721406 ],
       [-1.1914016 ,  0.42667381, -0.40714827, -0.74870594],
       [-0.37250927, -0.20246151,  0.99431559, -0.09544651],
       [-1.21167503, -0.53480017, -1.09000165, -2.07402908]])

In [13]:
df.T.dot(df)


Out[13]:
A B C D
A 7.149707 2.949391 1.471583 3.690284
B 2.949391 10.445165 -2.482248 -0.048738
C 1.471583 -2.482248 5.150122 0.530655
D 3.690284 -0.048738 0.530655 13.960456

In [14]:
s1 = Series(np.arange(5,10))
s1.dot(s1)


Out[14]:
255

5.2.14 Consol Display


In [15]:
#baseball = read_csv('a_large_data_set.csv')
#print baseball (only have summary)
#print baseball.ix[-20:, :12].to_string() represant as a tabulate formate

In [21]:
DataFrame(randn(3,12))


Out[21]:
&ltclass 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 2
Data columns (total 12 columns):
0     3  non-null values
1     3  non-null values
2     3  non-null values
3     3  non-null values
4     3  non-null values
5     3  non-null values
6     3  non-null values
7     3  non-null values
8     3  non-null values
9     3  non-null values
10    3  non-null values
11    3  non-null values
dtypes: float64(12)

In [24]:
df.C


Out[24]:
2013-11-01    0.080239
2013-11-02   -0.056996
2013-11-03    0.071120
2013-11-04    0.123332
2013-11-05   -0.947771
2013-11-06    0.936587
2013-11-07   -1.001082
2013-11-08   -0.407148
2013-11-09    0.994316
2013-11-10   -1.090002
Freq: D, Name: C, dtype: float64

5.3 Panel : 3D Data

Chapter Six: Essential Basic Functionality

6.1 Head and Tail


In [30]:
long_series = Series(randn(1000))
long_series.head()
long_series.tail(3)


Out[30]:
997    1.102883
998   -0.304486
999    0.998342
dtype: float64

In [1]:
long_series.values;


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-1fab067e9e82> in <module>()
----> 1 long_series.values;

NameError: name 'long_series' is not defined

6.4 Descriptive Statistics


In [33]:
df


Out[33]:
A B C D
2013-11-01 -1.684730 -1.614115 0.080239 -1.201114
2013-11-02 0.103770 -2.050979 -0.056996 1.634808
2013-11-03 -0.131716 -0.722073 0.071120 0.976394
2013-11-04 0.385670 0.165805 0.123332 -1.059220
2013-11-05 0.749069 1.007592 -0.947771 -0.548827
2013-11-06 0.127817 -0.984264 0.936587 -1.106119
2013-11-07 -0.728514 0.768982 -1.001082 1.172141
2013-11-08 -1.191402 0.426674 -0.407148 -0.748706
2013-11-09 -0.372509 -0.202462 0.994316 -0.095447
2013-11-10 -1.211675 -0.534800 -1.090002 -2.074029

In [37]:
df.mean(0)


Out[37]:
A   -0.395422
B   -0.373964
C   -0.129741
D   -0.305012
dtype: float64

In [38]:
df.mean(1)


Out[38]:
2013-11-01   -1.104930
2013-11-02   -0.092349
2013-11-03    0.048431
2013-11-04   -0.096103
2013-11-05    0.065016
2013-11-06   -0.256495
2013-11-07    0.052881
2013-11-08   -0.480146
2013-11-09    0.080975
2013-11-10   -1.227626
Freq: D, dtype: float64

In [39]:
df.sum(0, skipna=True)


Out[39]:
A   -3.954221
B   -3.739641
C   -1.297405
D   -3.050118
dtype: float64

In [42]:
df.std()


Out[42]:
A    0.787833
B    1.002590
C    0.743998
D    1.203243
dtype: float64

In [46]:
df.cumsum()


Out[46]:
A B C D
2013-11-01 -1.684730 -1.614115 0.080239 -1.201114
2013-11-02 -1.580960 -3.665094 0.023243 0.433695
2013-11-03 -1.712676 -4.387167 0.094364 1.410089
2013-11-04 -1.327006 -4.221362 0.217695 0.350869
2013-11-05 -0.577937 -3.213770 -0.730076 -0.197959
2013-11-06 -0.450120 -4.198035 0.206511 -1.304077
2013-11-07 -1.178635 -3.429053 -0.794571 -0.131936
2013-11-08 -2.370037 -3.002379 -1.201719 -0.880642
2013-11-09 -2.742546 -3.204841 -0.207404 -0.976089
2013-11-10 -3.954221 -3.739641 -1.297405 -3.050118
  • Other function:
    • count
    • sum
    • mean
    • median
    • min
    • max
    • abs
    • prod
    • std
    • var
    • skew
    • kurt
    • quantile
    • cumsum
    • cumprod
    • cummax
    • cummin

In [48]:
long_series.describe()


Out[48]:
count    1000.000000
mean        0.012889
std         1.017901
min        -2.997970
25%        -0.633205
50%         0.048765
75%         0.693814
max         3.278543
dtype: float64

In [49]:
long_series.idxmax()


Out[49]:
992

In [50]:
long_series.idxmin()


Out[50]:
229

In [52]:
long_series.ix[229]


Out[52]:
-2.9979700475667403

In [56]:
df.idxmax(axis=1)


Out[56]:
2013-11-01    C
2013-11-02    D
2013-11-03    D
2013-11-04    A
2013-11-05    B
2013-11-06    C
2013-11-07    D
2013-11-08    B
2013-11-09    C
2013-11-10    B
Freq: D, dtype: object

In [57]:
df.apply(np.mean)


Out[57]:
A   -0.395422
B   -0.373964
C   -0.129741
D   -0.305012
dtype: float64

6.6 Reindexing and altering lables

Chapter 7 Indexing and Selecting

7.1 Basics


In [65]:
df = DataFrame(randn(8,4), index=range(1, 9), columns=range(1,9,2))

In [66]:
df.ix[1]


Out[66]:
1    0.718618
3   -0.597387
5   -1.231868
7    1.122448
Name: 1, dtype: float64

In [67]:
df


Out[67]:
1 3 5 7
1 0.718618 -0.597387 -1.231868 1.122448
2 0.427829 -0.903850 0.666879 1.836270
3 0.202233 1.554029 -0.127751 0.814677
4 0.568951 1.646502 -1.925555 -2.665528
5 0.576323 0.986997 -1.285871 2.839785
6 0.321969 0.081332 0.813703 0.785385
7 0.487467 1.266642 0.171523 0.995347
8 -0.151233 -0.779178 -0.465204 -0.505634

In [68]:
df.index=range(8)

In [69]:
df


Out[69]:
1 3 5 7
0 0.718618 -0.597387 -1.231868 1.122448
1 0.427829 -0.903850 0.666879 1.836270
2 0.202233 1.554029 -0.127751 0.814677
3 0.568951 1.646502 -1.925555 -2.665528
4 0.576323 0.986997 -1.285871 2.839785
5 0.321969 0.081332 0.813703 0.785385
6 0.487467 1.266642 0.171523 0.995347
7 -0.151233 -0.779178 -0.465204 -0.505634

In [ ]: