In [3]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
ser1 = Series([1,2,3,4])
print(ser1)
print(ser1[0:2])
print(ser1[[0,1,2]])
In [2]:
sdata = {'python' : 1, 'pandas' : 2, 'matplotlib' : 3, 'numpy' : 4, 'scipy' : 5}
ser2 = Series(sdata)
print(ser2)
print(sdata.keys())
print(sdata.items())
print(sdata.values())
'python' in sdata.keys()
Out[2]:
In [3]:
ser3 = Series(sdata, index = ['apple', 'python', 'numpy', 'jupyter', 'scipy'])
print(ser3)
print('index:', ser3.index)
print('value:', ser3.values)
print(ser3[0:2])
print(ser3['apple':'numpy'])
print(ser3[['apple', 'numpy']])
ser3.name = 'try'
print(ser3)
In [4]:
data = {'packages':['pandas', 'numpy', 'matplotlib', 'pandas', 'pandas'],
'id':[1, 2, 3, 4, 5],
'num':[1.2, 3.4, 4, 5.6, 7.0]}
frame = DataFrame(data)
#DataFrame(data, columns=['id', 'num', 'packages'], index = ['A', 'B', 'C', 'D', 'E'])
#frame.sort_values('num', ascending = False)
#print(frame)
#print(frame[['id','num']])
#print(frame.loc[0:3,:])
#print(frame.loc[frame.id<4,:])
#print(type(frame.id))
#print(type(frame.loc[:,'id']))
frame.describe
Out[4]:
In [5]:
data = DataFrame(np.arange(16).reshape(4,4))
data[(data/2-1)%3==0] = np.nan
print(data)
#丢弃na
print(data.dropna())
print(data.dropna(how = 'all'))
print(data.dropna(axis = 1))
#填充na
print(data.fillna(0))
print(data.fillna({0:1, 2:10}))
print(data.fillna(method = 'bfill', limit = 1))
print(data.fillna(method = 'ffill', limit = 1))
In [6]:
frame = DataFrame(np.arange(16).reshape(4,4),
index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns = [['pandas', 'pandas', 'numpy', 'numpy'],
['Series', 'DataFrame', 'array', 'array']])
print(frame)
print(frame['numpy'])
print(frame.loc[:,'numpy'].loc[:,'array'])
print(frame.unstack()) #关于索引展开
In [7]:
df = DataFrame({'key':['b', 'b', 'a', 'c', 'a', 'b'], 'data1':range(6)})
dummies = pd.get_dummies(df['key'], prefix = 'key')
df_with_dummy = df[['data1']].join(dummies)
#print(df_with_dummy)
# 结合使用dummy和cut
df = DataFrame({'name':['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], 'score':np.arange(10)})
#print(df)
dummies = pd.get_dummies(pd.cut(df['score'], [0, 3, 6, 10]))
df_with_dummy = df[['name']].join(dummies)
print(df_with_dummy)
In [8]:
# 数据框一个单元格放置列表
df = DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df.set_value(0, 'key1', ['a', 'b'])
print(df)
In [2]:
# groupby对数据进行分组
df = DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
df['data1'].groupby(df['key1']).mean()
df['data1'].groupby([df['key1'], df['key2']]).min()
# 语法糖
df.groupby('key1')[['data1']].max()
df.groupby(['key1', 'key2'])[['data1']].count()
# 函数:
# + min, max, first, last
# + sum, mean, count, prop
# + size, median, std, var
In [4]:
df = DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
for name,group in df.groupby('key1'):
print(name)
print(group)
for (key1, key2),group in df.groupby(['key1', 'key2']):
print(key1, key2)
print(group)
In [11]:
# 对列进行分组
people = DataFrame(np.random.randn(5,5),
columns = ['a','b','c','d','e'],
index = ['Joe','Steve','Wes','Jim','Travis'])
people.loc[2:3,['b','c']] = np.nan
#print(people)
mapping = {'a':'red','b':'red','c':'blue',
'd':'blue','e':'red','f':'orange'}
people.groupby(mapping, axis=1).sum()
people.groupby(len).sum()
key_list = ['one','one','one','two','two']
people.groupby([len, key_list]).sum()
Out[11]:
In [12]:
# 分层索引的聚合
# 方法1:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'CN', 'CN'],[1,3,5,1,3]],names=['cty','tenor'])
hier_df = DataFrame(np.random.randn(4,5), columns = columns)
# 方法2:
hier_df = DataFrame(np.random.randn(4,5), columns = [['US','US','US','CN','CN'],[1,3,5,1,3]])
hier_df.columns.names = ['city', 'tenor']
#hier_df.index.names = ['name1', 'name2]
print(hier_df)
hier_df.groupby(level='city',axis=1).count()
Out[12]:
In [13]:
# agg应用自定义函数
# 自定义极差函数
def peak_to_peak(arr):
result = arr.max() - arr.min()
return result
tips = pd.read_csv('tips.csv')
# 添加“小费占总额百分比”的列
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]
# 应用自定义函数
tips.groupby(['sex','smoker'])[['tip_pct']].agg(peak_to_peak)
# 对多列应用多种函数
tips.groupby(['sex','smoker'])[['tip','tip_pct']].agg(['mean','max',peak_to_peak])
Out[13]:
In [20]:
# 分组级运算和转换
df = DataFrame({'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randn(5),
'data2':np.random.randn(5)})
k1_means = df.groupby('key1').mean().add_prefix('mean_')
pd.merge(df, k1_means, left_on='key1', right_index = True)
df.groupby('key1').transform(np.mean) # 同 group_by %>% mutate %>% ungroup
Out[20]:
In [25]:
def top(df,n=5,column='tip_pct'):
return df.sort_values(by=column)[-n:]
top(tips,n=6)
# 分组应用函数
tips.groupby('smoker').apply(top)
# 分组应用函数并传参
tips.groupby(['sex','smoker']).apply(top, n = 2, column = 'total_bill')
Out[25]:
In [34]:
# 分位数
frame=DataFrame({'data1':np.random.randn(1000),
'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1, 4)
factor[:10]
def get_status(group):
return {
'min':group.min(),
'max':group.max(),
'count':group.count(),
'mean':group.mean()
}
frame.data2.groupby(factor).apply(get_status).unstack()
Out[34]:
In [36]:
# 透视表
tips.pivot_table(index = ['sex','smoker'])
Out[36]:
In [39]:
tips.pivot_table(['tip_pct','size'], index = ['sex','day'], columns='smoker')
# 加入分项小计, 默认 aggfunc 为 mean
tips.pivot_table(['tip_pct','size'], index = ['sex','day'], columns='smoker', margins=True)
Out[39]:
In [42]:
tips.groupby(['sex','day'])['tip_pct','size'].mean()
tips.pivot_table('tip_pct', index = ['sex', 'smoker'], columns='day', aggfunc=len, margins=True, fill_value=0)
Out[42]:
In [44]:
# 交叉表
pd.crosstab(tips.sex, tips.smoker, margins=True)
pd.crosstab([tips.sex, tips.smoker], tips.day, margins=True)
Out[44]:
In [6]:
# 读写文件
df.to_excel('try.xlsx', sheet_name = 'Sheet1')
frame = pd.read_excel('try.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
frame
Out[6]: