In [1]:
%matplotlib inline
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
import scipy as sy
In [5]:
# 4x4 demo frame holding the integers 0..15, with state names as row labels
# and ordinal words as column labels.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
In [6]:
# Display the frame as the cell output.
data
Out[6]:
In [10]:
# Select a single column by its label.
data['two']
Out[10]:
In [11]:
# A list of labels selects several columns, returned in the order listed.
data[['three','one']]
Out[11]:
In [12]:
# Slicing the frame directly selects by row (here the first two rows).
data[:2]
Out[12]:
In [13]:
# Boolean mask built from one column: keep only rows where 'three' is greater than 5.
data[data['three']>5]
Out[13]:
In [16]:
# Element-wise boolean mask: every value below 5 is overwritten with 0.
# This assigns into `data` itself, so later cells see the modified frame.
data[data<5]=0
data
Out[16]:
In [20]:
# Label-based selection: row 'Colorado', columns 'one' and 'two'.
# `.loc` is used because the old `.ix` indexer no longer exists in pandas.
data.loc['Colorado', ['one', 'two']]
Out[20]:
In [21]:
# Rows are picked by label and columns by position; the result follows the
# order given for both rows and columns.
# `.ix` (which allowed mixing labels and positions) no longer exists in pandas,
# so the positions are translated to labels via `data.columns` and `.loc` is used.
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]
Out[21]:
In [23]:
# 4x3 frame of standard-normal samples; columns 'b', 'd', 'e' and one row per state.
# No seed is set, so the values differ on every run.
frame = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
frame
Out[23]:
In [24]:
# NumPy's element-wise absolute value applied directly to the DataFrame.
np.abs(frame)
Out[24]:
In [27]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest value."""
    return x.max() - x.min()
In [28]:
# Call f once per column (apply's default axis).
frame.apply(f)
Out[28]:
In [33]:
# axis=1 applies f to each row instead of each column.
frame.apply(f,axis=1)
Out[33]:
In [39]:
def f(x):
    """Return the minimum and maximum of ``x`` as a two-element Series.

    Parameters
    ----------
    x : object exposing ``min()`` and ``max()`` (e.g. a pandas Series)

    Returns
    -------
    Series
        Labelled ``'min'`` and ``'max'``, in that order.
    """
    # Two separate values, one per index label. The previous expression
    # subtracted them, yielding a single value for a two-label index.
    return Series([x.min(), x.max()], index=['min', 'max'])
# Apply f to each column; f returns a Series labelled 'min'/'max' for every column.
frame.apply(f)
Out[39]:
In [40]:
# Apply f to each row, then transpose the result.
frame.apply(f,axis=1).T
Out[40]:
In [43]:
def to_three_decimals(x):
    """Format a single number as a string with three decimal places."""
    return '%.3f' % x


# Format every element of the frame individually.
# The formatter has its own name on purpose: binding it to `format` replaced
# the built-in format() for every cell executed afterwards.
# NOTE(review): recent pandas releases deprecate `applymap` in favour of
# `DataFrame.map` — confirm against the installed version.
frame.applymap(to_three_decimals)
Out[43]:
In [45]:
# Series 0..3 whose index labels are deliberately out of alphabetical order.
obj = pd.Series(data=range(4), index=list('dabc'))
obj
Out[45]:
In [47]:
# Return a new Series ordered by index label.
obj.sort_index()
Out[47]:
In [48]:
# 2x4 frame holding 0..7 with unsorted row and column labels, used to demonstrate sorting.
frame = pd.DataFrame(
    data=np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=list('dabc'),
)
frame
Out[48]:
In [49]:
# Sort the rows by their index labels.
frame.sort_index()
Out[49]:
In [50]:
# Sort the columns by label; add ascending=False to reverse the order.
frame.sort_index(axis=1)
Out[50]:
In [51]:
# Frame used to demonstrate sorting by the values of one or several columns.
frame = pd.DataFrame(data={'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
Out[51]:
In [52]:
# Order the rows by the values in column 'b'.
# Sorting by column values is done with `sort_values`; `sort_index` no longer
# accepts a `by` argument.
frame.sort_values(by='b')
Out[52]:
In [53]:
# Order the rows by column 'a' first, breaking ties with column 'b'.
# Sorting by column values is done with `sort_values`; `sort_index` no longer
# accepts a `by` argument.
frame.sort_values(by=['a', 'b'])
Out[53]:
In [55]:
# Rank the values within each row (axis=1) rather than within each column.
frame.rank(axis=1)
Out[55]:
In [56]:
# Remote data access helpers, used below as `web.get_data_yahoo`.
# NOTE(review): `pandas.io.data` was split out into the separate pandas-datareader
# package and does not ship with current pandas — confirm the installed version
# before running this cell.
import pandas.io.data as web
In [57]:
# Fetch one result per ticker for the window 1/1/2010 - 1/1/2015 and keep them
# in a dict keyed by ticker symbol. Later cells read the 'Close' and 'Volume'
# columns from each entry.
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    # The assignment must be indented to belong to the loop body.
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2010', '1/1/2015')
In [67]:
# One column per ticker, holding that ticker's 'Close' values.
price = pd.DataFrame(
    {symbol: history['Close'] for symbol, history in all_data.items()}
)
In [69]:
# One column per ticker, holding that ticker's 'Volume' values.
volume = pd.DataFrame(
    {symbol: history['Volume'] for symbol, history in all_data.items()}
)
In [70]:
# Percentage change of each price column from one row to the next.
returns = price.pct_change()
# Show only the last few rows.
returns.tail()
Out[70]:
In [71]:
#属性之间的相关性计算
# Correlation between two columns (MSFT and IBM returns).
returns.MSFT.corr(returns.IBM)
Out[71]:
In [72]:
#协方差计算
# Covariance between the MSFT and IBM returns.
returns.MSFT.cov(returns.IBM)
Out[72]:
In [73]:
#相关矩阵
# Correlation matrix across all return columns.
returns.corr()
Out[73]:
In [74]:
#协方差矩阵
# Covariance matrix across all return columns.
returns.cov()
Out[74]:
In [75]:
# Pairwise correlation of the IBM returns with each column of the frame.
returns.corrwith(returns.IBM)
Out[75]:
In [77]:
from numpy import nan as NA

# Series with two missing entries; dropna() returns a copy without them
# and leaves `data` itself unchanged.
data = pd.Series(data=[1, NA, 3.5, NA, 7])
data.dropna()
Out[77]:
In [78]:
# 4x3 frame with missing values scattered over three of its four rows.
data = pd.DataFrame(data=[
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
# Keep only rows without any missing value ('any' is dropna's default mode).
cleaned = data.dropna(how='any')
data
Out[78]:
In [79]:
# Display the result of dropna(): only fully populated rows remain.
cleaned
Out[79]:
In [82]:
# 7x3 frame of standard-normal samples with default integer row/column labels.
df = DataFrame(np.random.randn(7, 3))
# Blank out the top of columns 1 and 2. `.loc` slices by label and includes the
# end label, so rows 0-4 of column 1 and rows 0-2 of column 2 become NaN.
# `.loc` is used because the old `.ix` indexer no longer exists in pandas.
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA
df
Out[82]:
In [83]:
# Return a copy with every missing value replaced by 0; `df` itself is not reassigned.
df.fillna(0)
Out[83]:
In [ ]: