Pandas

Chapter 5 Intro to data Structure



In [1]:

    
#initialize
%matplotlib inline 
%qtconsole



In [ ]:



In [2]:

    
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series
from pandas import Timestamp
randn = np.random.randn
from pandas import *

5.1 Series



In [3]:

    
#s = Series(data, index=index)
s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e'])
s
s.index









    Out[3]:





Index([u'a', u'b', u'c', u'd', u'e'], dtype=object)



In [4]:

    
#From dict
d = {'a' : 0.0, 'b' : 1.1, 'c' : 2.2 }
Series(d)
Series(d, index = ['a','b'])
Series(d, index = ['a','b','z'])
Series(5, index = ['a', 'b', 'c', 'd'])









    Out[4]:





a    5
b    5
c    5
d    5
dtype: int64

5.1.1 Series is ndarray-like



In [64]:

    
s['a']
s[:3]
s[[4,3,1]]
s[s> s.median()]
s>s.median()









    Out[64]:





a     True
b     True
c    False
d    False
e    False
dtype: bool



In [62]:

    
np.exp(s)









    Out[62]:





a    2.760369
b    1.487827
c    0.183412
d    0.583944
e    0.603313
dtype: float64

5.1.2 Series is dict-like



In [66]:

    
s['a'] = 12
s









    Out[66]:





a    12.000000
b     0.397317
c    -1.696023
d    -0.537950
e    -0.505318
dtype: float64



In [68]:

    
'ee' in s
s['ee']









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-68-45038a6cba50> in <module>()
      1 'ee' in s
----> 2 s['ee']

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    611     def __getitem__(self, key):
    612         try:
--> 613             return self.index.get_value(self, key)
    614         except InvalidIndexError:
    615             pass

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/index.pyc in get_value(self, series, key)
    775                     raise InvalidIndexError(key)
    776                 else:
--> 777                     raise e1
    778             except Exception:  # pragma: no cover
    779                 raise e1

KeyError: 'ee'



In [74]:

    
s.get('a')
s.get('f', np.Inf)









    Out[74]:





inf

5.1.3 Vectorized operations and label alignment with Series



In [80]:

    
s + s
s*2
np.exp(s)
s[1:] + s[:-1]









    Out[80]:





a         NaN
b    0.794634
c   -3.392045
d   -1.075901
e         NaN
dtype: float64

5.1.4 Name attribute



In [85]:

    
s = Series(np.random.randn(5), dtype = int32, name = 'whatever')
type(s)









    Out[85]:





pandas.core.series.Series

5.2 DataFrame

DataFrame accepts many different kinds of inputs:
- Dict of 1D ndarrays, lists, dists, or Series
- 2-D numpy.ndarray
- Structured or record ndarray
- A Series
- Another DataFrame

5.2.1 From dict of Series or dicts



In [61]:

    
d = {'one' : Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}



In [62]:

    
df = DataFrame(d)
df



In [24]:

    
d2 = DataFrame(d, index=['d', 'b', 'a'])
d2['one']['d']
#d2[1][2]??









    Out[24]:





nan



In [27]:

    
df.index
df.columns









    Out[27]:





Index([one, two], dtype=object)

5.2.2 From dict of ndarrays /lists



In [30]:

    
d = {'one' : [1., 2., 3., 4.],
     'two' : [4., 3., 2., 1.]}



In [32]:

    
DataFrame(d, index=['a', 'b', 'c', 'd'])

5.2.3 From structured or record array



In [14]:

    
#This case is handled identically to a dict of arrays
data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'),('C','a10')])



In [15]:

    
data[:] = [(1,2.,'Hello'),(2,3.,"World")]



In [16]:

    
DataFrame(data)



In [22]:

    
DataFrame(data, index=['first', 'second'])



In [24]:

    
DataFrame(data,columns=['C','A','B'])

5.2.4 From a list of dicts



In [25]:

    
data2 = [{'a':1, 'b':2}, {'a':5,'b':10, 'c':20}]



In [36]:

    
DataFrame(data2, index = ['first','second'])



In [37]:

    
DataFrame(data2, columns=['a','b']) # A selection

5.2.5 From a Series

5.2.7 Column Selection, addition, deletion



In [42]:

    
df['one']









    Out[42]:





a     1
b     2
c     3
d   NaN
Name: one, dtype: float64



In [43]:

    
df['three'] = df['one']*df['two']



In [49]:

    
boolindex = df['two']>1
boolindex









    Out[49]:





a    False
b     True
c     True
d     True
Name: two, dtype: bool



In [50]:

    
df[boolindex]



In [52]:

    
df['b']









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-52-f2a266ed0ffd> in <module>()
----> 1 df['b']

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1926         else:
   1927             # get column
-> 1928             return self._get_item_cache(key)
   1929 
   1930     def _getitem_slice(self, key):

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    568             return cache[item]
    569         except Exception:
--> 570             values = self._data.get(item)
    571             res = self._box_item_values(item, values)
    572             cache[item] = res

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
   1381 
   1382     def get(self, item):
-> 1383         _, block = self._find_block(item)
   1384         return block.get(item)
   1385 

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in _find_block(self, item)
   1523 
   1524     def _find_block(self, item):
-> 1525         self._check_have(item)
   1526         for i, block in enumerate(self.blocks):
   1527             if item in block:

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in _check_have(self, item)
   1530     def _check_have(self, item):
   1531         if item not in self.items:
-> 1532             raise KeyError('no item named %s' % com.pprint_thing(item))
   1533 
   1534     def reindex_axis(self, new_axis, method=None, axis=0, copy=True):

KeyError: u'no item named b'



In [53]:

    
del df['two']



In [54]:

    
df



In [55]:

    
df['new'] = 1



In [56]:

    
df



In [59]:

    
df.ix[3]









    Out[59]:





one     NaN
three   NaN
new       1
Name: d, dtype: float64

5.2.8 Indexing/Selection



In [64]:

    
df[1]









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-64-dc0ea82142cb> in <module>()
----> 1 df[1]

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1926         else:
   1927             # get column
-> 1928             return self._get_item_cache(key)
   1929 
   1930     def _getitem_slice(self, key):

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    568             return cache[item]
    569         except Exception:
--> 570             values = self._data.get(item)
    571             res = self._box_item_values(item, values)
    572             cache[item] = res

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
   1381 
   1382     def get(self, item):
-> 1383         _, block = self._find_block(item)
   1384         return block.get(item)
   1385 

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in _find_block(self, item)
   1523 
   1524     def _find_block(self, item):
-> 1525         self._check_have(item)
   1526         for i, block in enumerate(self.blocks):
   1527             if item in block:

/Users/chenchen/anaconda/python.app/Contents/lib/python2.7/site-packages/pandas/core/internals.pyc in _check_have(self, item)
   1530     def _check_have(self, item):
   1531         if item not in self.items:
-> 1532             raise KeyError('no item named %s' % com.pprint_thing(item))
   1533 
   1534     def reindex_axis(self, new_axis, method=None, axis=0, copy=True):

KeyError: u'no item named 1'

5.2.9 DataTypes



In [75]:

    
df['interger'] = 1
df['int32'] = df['interger'].astype('int32')
df['float32'] = Series([1.0]*len(df),dtype='float32')
df['timestamp'] = Timestamp('20010102')



In [77]:

    
df.dtypes









    Out[77]:





one                 float64
two                 float64
interger              int64
int32                 int32
float32             float32
timestamp    datetime64[ns]
dtype: object

5.2.11 Data alignment and arithmetic



In [3]:

    
index_df = date_range('1', periods=10)  #use data range as index, otherwise, Boom!!
df = DataFrame(randn(10, 4), index=index_df, columns=['A', 'B', 'C', 'D'])
df - df['A']
(df.T - df['A']).T









    



/Users/chenchen/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:3619: FutureWarning: TimeSeries broadcasting along DataFrame index by default is deprecated. Please use DataFrame.<op> to explicitly broadcast arithmetic operations along the index
  FutureWarning)






    Out[3]:






  
    
      
      A
      B
      C
      D
    
  
  
    
      2013-11-01
       0
       0.070616
       1.764970
       0.483617
    
    
      2013-11-02
       0
      -2.154749
      -0.160766
       1.531038
    
    
      2013-11-03
       0
      -0.590357
       0.202836
       1.108110
    
    
      2013-11-04
       0
      -0.219865
      -0.262338
      -1.444890
    
    
      2013-11-05
       0
       0.258523
      -1.696840
      -1.297896
    
    
      2013-11-06
       0
      -1.112081
       0.808770
      -1.233935
    
    
      2013-11-07
       0
       1.497496
      -0.272568
       1.900655
    
    
      2013-11-08
       0
       1.618075
       0.784253
       0.442696
    
    
      2013-11-09
       0
       0.170048
       1.366825
       0.277063
    
    
      2013-11-10
       0
       0.676875
       0.121673
      -0.862354



In [7]:

    
df - df['A']
df * 5 + 2
1/df
df ** 4

5.2.12 Transposing



In [8]:

    
df[:5]



In [9]:

    
df[:5].T









    Out[9]:






  
    
      
      2013-11-01 00:00:00
      2013-11-02 00:00:00
      2013-11-03 00:00:00
      2013-11-04 00:00:00
      2013-11-05 00:00:00
    
  
  
    
      A
      -1.684730
       0.103770
      -0.131716
       0.385670
       0.749069
    
    
      B
      -1.614115
      -2.050979
      -0.722073
       0.165805
       1.007592
    
    
      C
       0.080239
      -0.056996
       0.071120
       0.123332
      -0.947771
    
    
      D
      -1.201114
       1.634808
       0.976394
      -1.059220
      -0.548827

5.2.13DataFrame Interoperability with NumPy functions



In [10]:

    
np.exp(df)



In [11]:

    
np.asarray(df)









    Out[11]:





array([[-1.68473046, -1.61411495,  0.08023923, -1.20111378],
       [ 0.10377037, -2.05097901, -0.05699605,  1.63480831],
       [-0.13171584, -0.72207297,  0.07112033,  0.97639441],
       [ 0.38567009,  0.16580471,  0.12333169, -1.05922022],
       [ 0.74906882,  1.00759175, -0.94777076, -0.54882724],
       [ 0.12781657, -0.98426416,  0.93658673, -1.10611851],
       [-0.72851447,  0.76898165, -1.0010821 ,  1.1721406 ],
       [-1.1914016 ,  0.42667381, -0.40714827, -0.74870594],
       [-0.37250927, -0.20246151,  0.99431559, -0.09544651],
       [-1.21167503, -0.53480017, -1.09000165, -2.07402908]])



In [13]:

    
df.T.dot(df)



In [14]:

    
s1 = Series(np.arange(5,10))
s1.dot(s1)









    Out[14]:





255

5.2.14 Consol Display



In [15]:

    
#baseball = read_csv('a_large_data_set.csv')
#print baseball (only have summary)
#print baseball.ix[-20:, :12].to_string() represant as a tabulate formate



In [21]:

    
DataFrame(randn(3,12))









    Out[21]:




&ltclass 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 2
Data columns (total 12 columns):
0     3  non-null values
1     3  non-null values
2     3  non-null values
3     3  non-null values
4     3  non-null values
5     3  non-null values
6     3  non-null values
7     3  non-null values
8     3  non-null values
9     3  non-null values
10    3  non-null values
11    3  non-null values
dtypes: float64(12)



In [24]:

    
df.C









    Out[24]:





2013-11-01    0.080239
2013-11-02   -0.056996
2013-11-03    0.071120
2013-11-04    0.123332
2013-11-05   -0.947771
2013-11-06    0.936587
2013-11-07   -1.001082
2013-11-08   -0.407148
2013-11-09    0.994316
2013-11-10   -1.090002
Freq: D, Name: C, dtype: float64

5.3 Panel : 3D Data

Chapter Six: Essential Basic Functionality

6.1 Head and Tail



In [30]:

    
long_series = Series(randn(1000))
long_series.head()
long_series.tail(3)









    Out[30]:





997    1.102883
998   -0.304486
999    0.998342
dtype: float64



In [1]:

    
long_series.values;









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-1fab067e9e82> in <module>()
----> 1 long_series.values;

NameError: name 'long_series' is not defined

6.4 Descriptive Statistics



In [33]:

    
df



In [37]:

    
df.mean(0)









    Out[37]:





A   -0.395422
B   -0.373964
C   -0.129741
D   -0.305012
dtype: float64



In [38]:

    
df.mean(1)









    Out[38]:





2013-11-01   -1.104930
2013-11-02   -0.092349
2013-11-03    0.048431
2013-11-04   -0.096103
2013-11-05    0.065016
2013-11-06   -0.256495
2013-11-07    0.052881
2013-11-08   -0.480146
2013-11-09    0.080975
2013-11-10   -1.227626
Freq: D, dtype: float64



In [39]:

    
df.sum(0, skipna=True)









    Out[39]:





A   -3.954221
B   -3.739641
C   -1.297405
D   -3.050118
dtype: float64



In [42]:

    
df.std()









    Out[42]:





A    0.787833
B    1.002590
C    0.743998
D    1.203243
dtype: float64



In [46]:

    
df.cumsum()

Other function:
- count
- sum
- mean
- median
- min
- max
- abs
- prod
- std
- var
- skew
- kurt
- quantile
- cumsum
- cumprod
- cummax
- cummin



In [48]:

    
long_series.describe()









    Out[48]:





count    1000.000000
mean        0.012889
std         1.017901
min        -2.997970
25%        -0.633205
50%         0.048765
75%         0.693814
max         3.278543
dtype: float64



In [49]:

    
long_series.idxmax()









    Out[49]:





992



In [50]:

    
long_series.idxmin()









    Out[50]:





229



In [52]:

    
long_series.ix[229]









    Out[52]:





-2.9979700475667403



In [56]:

    
df.idxmax(axis=1)









    Out[56]:





2013-11-01    C
2013-11-02    D
2013-11-03    D
2013-11-04    A
2013-11-05    B
2013-11-06    C
2013-11-07    D
2013-11-08    B
2013-11-09    C
2013-11-10    B
Freq: D, dtype: object



In [57]:

    
df.apply(np.mean)









    Out[57]:





A   -0.395422
B   -0.373964
C   -0.129741
D   -0.305012
dtype: float64

6.6 Reindexing and altering lables

Chapter 7 Indexing and Selecting

7.1 Basics



In [65]:

    
df = DataFrame(randn(8,4), index=range(1, 9), columns=range(1,9,2))



In [66]:

    
df.ix[1]









    Out[66]:





1    0.718618
3   -0.597387
5   -1.231868
7    1.122448
Name: 1, dtype: float64



In [67]:

    
df



In [68]:

    
df.index=range(8)



In [69]:

    
df



In [ ]:

	A	B	C	D
2013-11-01	8.056042	6.787938	0.000041	2.081309
2013-11-02	0.000116	17.694768	0.000011	7.142781
2013-11-03	0.000301	0.271847	0.000026	0.908869
2013-11-04	0.022124	0.000756	0.000231	1.258766
2013-11-05	0.314838	1.030715	0.806888	0.090728
2013-11-06	0.000267	0.938527	0.769471	1.496948
2013-11-07	0.281678	0.349674	1.004335	1.887639
2013-11-08	2.014804	0.033142	0.027480	0.314228
2013-11-09	0.019255	0.001680	0.977456	0.000083
2013-11-10	2.155483	0.081802	1.411590	18.503734

	A	B	C	D
2013-11-01	0.185494	0.199067	1.083546	0.300859
2013-11-02	1.109346	0.128609	0.944598	5.128475
2013-11-03	0.876590	0.485744	1.073710	2.654867
2013-11-04	1.470599	1.180343	1.131260	0.346726
2013-11-05	2.115030	2.738997	0.387604	0.577627
2013-11-06	1.136345	0.373714	2.551258	0.330841
2013-11-07	0.482625	2.157568	0.367482	3.228897
2013-11-08	0.303795	1.532153	0.665545	0.472978
2013-11-09	0.689003	0.816718	2.702874	0.908967
2013-11-10	0.297698	0.585786	0.336216	0.125678

	A	B	C	D
A	7.149707	2.949391	1.471583	3.690284
B	2.949391	10.445165	-2.482248	-0.048738
C	1.471583	-2.482248	5.150122	0.530655
D	3.690284	-0.048738	0.530655	13.960456

	A	B	C	D
2013-11-01	-1.684730	-1.614115	0.080239	-1.201114
2013-11-02	-1.580960	-3.665094	0.023243	0.433695
2013-11-03	-1.712676	-4.387167	0.094364	1.410089
2013-11-04	-1.327006	-4.221362	0.217695	0.350869
2013-11-05	-0.577937	-3.213770	-0.730076	-0.197959
2013-11-06	-0.450120	-4.198035	0.206511	-1.304077
2013-11-07	-1.178635	-3.429053	-0.794571	-0.131936
2013-11-08	-2.370037	-3.002379	-1.201719	-0.880642
2013-11-09	-2.742546	-3.204841	-0.207404	-0.976089
2013-11-10	-3.954221	-3.739641	-1.297405	-3.050118

	1	3	5	7
1	0.718618	-0.597387	-1.231868	1.122448
2	0.427829	-0.903850	0.666879	1.836270
3	0.202233	1.554029	-0.127751	0.814677
4	0.568951	1.646502	-1.925555	-2.665528
5	0.576323	0.986997	-1.285871	2.839785
6	0.321969	0.081332	0.813703	0.785385
7	0.487467	1.266642	0.171523	0.995347
8	-0.151233	-0.779178	-0.465204	-0.505634

	B	C	D
2013-11-01	0.070616	1.764970	0.483617
2013-11-02	-2.154749	-0.160766	1.531038
2013-11-03	-0.590357	0.202836	1.108110
2013-11-04	-0.219865	-0.262338	-1.444890
2013-11-05	0.258523	-1.696840	-1.297896
2013-11-06	-1.112081	0.808770	-1.233935
2013-11-07	1.497496	-0.272568	1.900655
2013-11-08	1.618075	0.784253	0.442696
2013-11-09	0.170048	1.366825	0.277063
2013-11-10	0.676875	0.121673	-0.862354

	2013-11-01 00:00:00	2013-11-02 00:00:00	2013-11-03 00:00:00	2013-11-04 00:00:00	2013-11-05 00:00:00
A	-1.684730	0.103770	-0.131716	0.385670	0.749069
B	-1.614115	-2.050979	-0.722073	0.165805	1.007592
C	0.080239	-0.056996	0.071120	0.123332	-0.947771
D	-1.201114	1.634808	0.976394	-1.059220	-0.548827