Indexing,selection,and filtering


In [1]:
%matplotlib inline
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
import scipy as sy

In [5]:
data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])

In [6]:
data


Out[6]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [10]:
data['two']


Out[10]:
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [11]:
data[['three','one']]


Out[11]:
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12

In [12]:
data[:2]#按行选择


Out[12]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7

In [13]:
data[data['three']>5]


Out[13]:
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [16]:
data[data<5]=0
data


Out[16]:
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

In [20]:
data.ix['Colorado',['one','two']]


Out[20]:
one    0
two    5
Name: Colorado, dtype: int32

In [21]:
data.ix[['Colorado','Utah'],[3,0,1]]#行、列的顺序


Out[21]:
four one two
Colorado 7 0 5
Utah 11 8 9

Function application and mapping


In [23]:
frame = DataFrame(np.random.randn(4,3),columns=list('bde'),
                  index=('Utah','Ohio','Texas','Oregon'))
frame


Out[23]:
b d e
Utah -0.484636 1.144450 0.767644
Ohio -1.829228 0.098372 -0.651730
Texas 0.034598 -1.363106 0.108255
Oregon -2.036759 1.017694 0.558499

In [24]:
np.abs(frame)


Out[24]:
b d e
Utah 0.484636 1.144450 0.767644
Ohio 1.829228 0.098372 0.651730
Texas 0.034598 1.363106 0.108255
Oregon 2.036759 1.017694 0.558499

In [27]:
f = lambda x: x.max()-x.min()

In [28]:
frame.apply(f)


Out[28]:
b    2.071357
d    2.507556
e    1.419374
dtype: float64

In [33]:
frame.apply(f,axis=1)#应用到行上


Out[33]:
Utah      1.629086
Ohio      1.927600
Texas     1.471361
Oregon    3.054453
dtype: float64

In [39]:
def f(x):
    return Series([x.min()-x.max()],index=['min','max'])
frame.apply(f)


Out[39]:
b d e
min -2.071357 -2.507556 -1.419374
max -2.071357 -2.507556 -1.419374

In [40]:
frame.apply(f,axis=1).T


Out[40]:
Utah Ohio Texas Oregon
min -1.629086 -1.9276 -1.471361 -3.054453
max -1.629086 -1.9276 -1.471361 -3.054453

In [43]:
format = lambda x:'%.3f' % x 
frame.applymap(format)


Out[43]:
b d e
Utah -0.485 1.144 0.768
Ohio -1.829 0.098 -0.652
Texas 0.035 -1.363 0.108
Oregon -2.037 1.018 0.558

Sorting and ranking


In [45]:
obj = Series(range(4),index=['d','a','b','c'])
obj


Out[45]:
d    0
a    1
b    2
c    3
dtype: int32

In [47]:
obj.sort_index()


Out[47]:
a    1
b    2
c    3
d    0
dtype: int32

In [48]:
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
           columns=['d', 'a', 'b', 'c'])
frame


Out[48]:
d a b c
three 0 1 2 3
one 4 5 6 7

In [49]:
frame.sort_index()


Out[49]:
d a b c
one 4 5 6 7
three 0 1 2 3

In [50]:
frame.sort_index(axis=1)#frame.sort_index(axis=1, ascending=False)


Out[50]:
a b c d
three 1 2 3 0
one 5 6 7 4

In [51]:
#按照多行或者多列排序
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame


Out[51]:
a b
0 0 4
1 1 7
2 0 -3
3 1 2

In [52]:
frame.sort_index(by='b')


Out[52]:
a b
2 0 -3
3 1 2
0 0 4
1 1 7

In [53]:
frame.sort_index(by=['a','b'])


Out[53]:
a b
2 0 -3
0 0 4
3 1 2
1 1 7

In [55]:
frame.rank(axis=1)


Out[55]:
a b
0 1 2
1 1 2
2 2 1
3 1 2

Correlation and Covariance


In [56]:
import pandas.io.data as web

In [57]:
all_data={}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker,'1/1/2010','1/1/2015')

In [67]:
price = DataFrame({tic:data['Close'] for tic,data in all_data.items()})

In [69]:
volume = DataFrame({tic:data['Volume'] for tic,data in all_data.items()})

In [70]:
returns = price.pct_change()#百分比变化
returns.tail()


Out[70]:
AAPL GOOG IBM MSFT
Date
2014-12-24 -0.004709 -0.003430 -0.002589 -0.006398
2014-12-26 0.017677 0.009948 0.003213 -0.005401
2014-12-29 -0.000702 -0.006928 -0.011273 -0.008981
2014-12-30 -0.012203 0.000170 -0.002866 -0.009062
2014-12-31 -0.019019 -0.007579 0.002437 -0.012122

In [71]:
#属性之间的相关性计算
returns.MSFT.corr(returns.IBM)


Out[71]:
0.4958317076413653

In [72]:
#协方差计算
returns.MSFT.cov(returns.IBM)


Out[72]:
8.105815265355002e-05

In [73]:
#相关矩阵
returns.corr()


Out[73]:
AAPL GOOG IBM MSFT
AAPL 1.000000 0.010326 0.215261 0.204335
GOOG 0.010326 1.000000 0.319901 0.480289
IBM 0.215261 0.319901 1.000000 0.495832
MSFT 0.204335 0.480289 0.495832 1.000000

In [74]:
#协方差矩阵
returns.cov()


Out[74]:
AAPL GOOG IBM MSFT
AAPL 0.000865 0.000009 0.000074 0.000084
GOOG 0.000009 0.000175 0.000045 0.000072
IBM 0.000074 0.000045 0.000137 0.000081
MSFT 0.000084 0.000072 0.000081 0.000195

In [75]:
returns.corrwith(returns.IBM)#计算IBM与它股票之间两两的相关性


Out[75]:
AAPL    0.215261
GOOG    0.319901
IBM     1.000000
MSFT    0.495832
dtype: float64

Handling Missing Data


In [77]:
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data.dropna()


Out[77]:
0    1.0
2    3.5
4    7.0
dtype: float64

In [78]:
data = DataFrame([[1., 6.5, 3.], [1., NA, NA],
                  [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data


Out[78]:
0 1 2
0 1 6.5 3
1 1 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3

In [79]:
cleaned


Out[79]:
0 1 2
0 1 6.5 3

Filling in Missing Data


In [82]:
df = DataFrame(np.random.randn(7, 3))
df.ix[:4, 1] = NA; df.ix[:2, 2] = NA
df


Out[82]:
0 1 2
0 -0.847640 NaN NaN
1 -2.698985 NaN NaN
2 0.791130 NaN NaN
3 0.441995 NaN -1.071578
4 0.358731 NaN -0.765573
5 0.119484 -0.243186 -0.085614
6 0.712221 0.147730 -2.709724

In [83]:
df.fillna(0)


Out[83]:
0 1 2
0 -0.847640 0.000000 0.000000
1 -2.698985 0.000000 0.000000
2 0.791130 0.000000 0.000000
3 0.441995 0.000000 -1.071578
4 0.358731 0.000000 -0.765573
5 0.119484 -0.243186 -0.085614
6 0.712221 0.147730 -2.709724

In [ ]: