[TOC]
In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s
Out[2]:
通过传递一个numpy array,日期index以及column标签来创建一个DataFrame:
In [3]:
dates = pd.date_range('20160101',periods=6)
dates
Out[3]:
In [4]:
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
Out[4]:
通过传递一个dictionary来创建一个DataFrame:
In [5]:
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20160102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3]*4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
df2
Out[5]:
查看每一列的数据类型:
In [6]:
df2.dtypes
Out[6]:
In [7]:
df.head()
Out[7]:
In [8]:
df.tail(3)
Out[8]:
显示index,columns以及底层的numpy数据:
In [9]:
df.index
Out[9]:
In [10]:
df.columns
Out[10]:
In [11]:
df.values
Out[11]:
describe()方法能对数据做一个快速统计汇总:
In [12]:
df.describe()
Out[12]:
对数据进行转置:
In [13]:
df.T
Out[13]:
按某一坐标轴进行排序:
In [14]:
df.sort_index(axis=1,ascending=False)
Out[14]:
按值进行排序:
In [15]:
df.sort_values(by='B')
Out[15]:
In [16]:
df['A']
Out[16]:
使用[],对行进行切片选取。
注意,[]内必须是一个范围,而不能仅仅是一个值。否则报错:
In [24]:
df[0:3] # 选取范围是[),前闭合,后开
Out[24]:
In [25]:
df['20160102':'20160104'] # 选取范围是[], 前闭合,后闭合
Out[25]:
In [40]:
# []内必须是一个范围,而不能仅仅是一个值。否则报错
# df[1] # 报错
# df['20160102'] # 报错
# df[dates[0]] # 报错
In [19]:
df.loc[dates[0]]
Out[19]:
使用label对多个轴进行选取:
In [32]:
df.loc[:, ['A', 'B']]
Out[32]:
根据label进行切片选择,注意切片范围是前后闭合:
In [34]:
df.loc['20160102':'20160104',['A','B']]
Out[34]:
In [36]:
# .loc[] 内的值只能是label,如果传入整数下标,报错
# df.loc[0:3, ['A','B']]
In [43]:
df.loc['20160102',['A','B']]
Out[43]:
获取一个标量值:
In [44]:
df.loc[dates[0],'A']
Out[44]:
In [45]:
df.loc['20160101', 'A']
Out[45]:
快速获取一个标量(与上面方法效果相同)
In [46]:
df.at[dates[0],'A']
Out[46]:
传入单个整形下标:
In [48]:
df.iloc[3]
Out[48]:
传入整型的下标切片,与python/numpy形式相同
In [50]:
df.iloc[3:5,0:2]
Out[50]:
传入list
In [51]:
df.iloc[[1,2,4], [0,2]]
Out[51]:
只对行进行切片
In [52]:
df.iloc[1:3,:]
Out[52]:
只对列进行切片
In [53]:
df.iloc[:,1:3]
Out[53]:
只获取某个值
In [55]:
df.iloc[1,1]
Out[55]:
快速获取某个值(与上面的方法等价)
In [56]:
df.iat[1,1]
Out[56]:
用某列的值来选取数据
In [57]:
df[df.A > 0]
Out[57]:
用where操作来选取数据
In [59]:
df[df > 0]
Out[59]:
用isin()方法来过滤数据
In [60]:
df2 = df.copy()
In [61]:
df2['E'] = ['one','one','two','three','four','three']
df2
Out[61]:
In [62]:
df2[df2['E'].isin(['two','four'])]
Out[62]:
In [63]:
df2[df2['E'].isin(['one','one','two'])]
Out[63]:
In [65]:
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20160102',periods=6))
In [66]:
s1
Out[66]:
In [67]:
df['F'] = s1
s1
Out[67]:
In [69]:
df # 注意F列
Out[69]:
通过label赋值
In [70]:
df.at[dates[0],'A'] = 0
df
Out[70]:
通过下标赋值
In [71]:
df.iat[0,1] = 0
df
Out[71]:
通过传递numpy array赋值
In [72]:
df.loc[:,'D'] = np.array([5] * len(df))
In [73]:
df
Out[73]:
In [74]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2
Out[74]:
reindex()允许你修改、增加、删除指定轴上的index,并返回数据副本。
In [75]:
df1 = df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1],'E'] = 1
df1
Out[75]:
剔除所有包含缺失值的行
In [76]:
df1.dropna(how='any')
Out[76]:
填充缺失值
In [77]:
df1.fillna(value=5)
Out[77]:
获取值是否为nan的布尔标记
In [78]:
pd.isnull(df1)
Out[78]:
In [79]:
df
Out[79]:
In [80]:
df.mean()
Out[80]:
In [82]:
df.sum()
Out[82]:
对其他轴进行同样的运算
In [81]:
df.mean(1)
Out[81]:
对于拥有不同维度的对象进行运算时需要对齐。除此之外,pandas会自动沿着指定维度广播。
In [83]:
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
s
Out[83]:
In [84]:
df.sub(s,axis='index')
Out[84]:
In [88]:
df.apply(np.cumsum)
Out[88]:
In [90]:
df.apply(lambda x: x.max()-x.min())
Out[90]:
In [91]:
s = pd.Series(np.random.randint(0,7,size=10))
s
Out[91]:
In [92]:
s.value_counts()
Out[92]:
In [93]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
Out[93]:
通过pd.concat()来连接pandas对象:
In [96]:
df = pd.DataFrame(np.random.randn(10,4))
df
Out[96]:
In [97]:
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pieces
Out[97]:
In [98]:
pd.concat(pieces)
Out[98]:
In [99]:
left = pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
left
Out[99]:
In [100]:
right = pd.DataFrame({'key':['foo','foo'],'lval':[4,5]})
right
Out[100]:
In [101]:
pd.merge(left, right, on='key')
Out[101]:
pd.merge()的另一个例子:
In [102]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
left
Out[102]:
In [103]:
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
right
Out[103]:
In [104]:
pd.merge(left, right, on='key')
Out[104]:
In [105]:
df =pd.DataFrame(np.random.randn(8,4),columns=['A','B','C','D'])
df
Out[105]:
In [106]:
s = df.iloc[3]
s
Out[106]:
In [107]:
df.append(s,ignore_index=True)
Out[107]:
In [108]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'bar'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
Out[108]:
分组并对每个分组应用sum函数
In [109]:
df.groupby('A').sum()
Out[109]:
按多个column分组形成层级index,然后应用sun函数
In [110]:
df.groupby(['A','B']).sum()
Out[110]:
In [112]:
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two',
'one', 'two', 'one', 'two']]))
tuples
Out[112]:
In [113]:
index = pd.MultiIndex.from_tuples(tuples,names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df2 = df[:4]
df2
Out[113]:
stack()方法对DataFrame的列“压缩”一个层级
In [114]:
stacked = df2.stack()
stacked
Out[114]:
对于一个“stacked” DataFrame或者Series(拥有MultiIndex作为索引),stack()的逆操作是unstack(),默认反堆叠到上一个层级
In [115]:
stacked.unstack()
Out[115]:
In [116]:
stacked.unstack(1)
Out[116]:
In [117]:
stacked.unstack(0)
Out[117]:
In [118]:
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
df
Out[118]:
In [119]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
Out[119]:
In [86]:
rng = pd.date_range('1/1/2012',periods=100,freq='S')
rng
Out[86]:
In [87]:
ts = pd.Series(np.random.randint(0,500,len(rng)),index=rng)
ts
Out[87]:
In [88]:
ts.resample('5Min',how='sum')
Out[88]:
时区表示
In [89]:
rng = pd.date_range('3/6/2012',periods=5,freq='D')
rng
Out[89]:
In [90]:
ts = pd.Series(np.random.randn(len(rng)),index=rng)
ts
Out[90]:
In [91]:
ts_utc = ts.tz_localize('UTC')
ts_utc
Out[91]:
时区转换
In [92]:
ts_utc.tz_convert('US/Eastern')
Out[92]:
时间跨度转换
In [93]:
rng = pd.date_range('1/1/2012', periods=5, freq='M')
rng
Out[93]:
In [94]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
Out[94]:
In [96]:
ps = ts.to_period()
ps
Out[96]:
In [97]:
ps.to_timestamp()
Out[97]:
日期与时间戳之间的转换使得可以使用一些方便的算术函数。例如,我们把以11月为年底的季度数据转换为当前季度末月底为始的数据
In [98]:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
prng
Out[98]:
In [99]:
ts = pd.Series(np.random.randn(len(prng)), index = prng)
ts
Out[99]:
In [100]:
ts.index = (prng.asfreq('M', 'end') ) .asfreq('H', 'start') +9
ts
Out[100]:
In [120]:
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'e', 'e']})
df
Out[120]:
把raw_grade转换为categorical类型
In [121]:
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
Out[121]:
对category进行重命名
In [122]:
df["grade"].cat.categories = ["very good","good","very bad"]
对分类重新排序,并添加缺失的分类
In [108]:
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]
Out[108]:
排序是按照分类的顺序进行的,而不是字典序
In [109]:
df.sort_index(by="grade")
Out[109]:
按分类分组时,也会显示空的分类
In [110]:
df.groupby("grade").size()
Out[110]:
In [125]:
ts = pd.Series(np.random.randn(1000),index=pd.date_range('1/1/2000',periods=1000))
ts = ts.cumsum()
ts.plot()
Out[125]:
对于DataFrame类型,plot()能很方便地画出所有column及其label
In [126]:
df = pd.DataFrame(np.random.randn(1000,4),index=ts.index,columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
df.plot()
Out[126]:
In [127]:
df.to_csv('foo.csv')
从一个csv文件读入
In [128]:
pd.read_csv('foo.csv')
Out[128]:
In [120]:
df.to_hdf('foo.h5','df')
从一个HDF5 Store读入
In [121]:
pd.read_hdf('foo.h5','df')
Out[121]:
In [122]:
df.to_excel('foo.xlsx', sheet_name='Sheet1')
从一个Excel文件读入
In [123]:
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
Out[123]: