In [3]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

ser1 = Series([1,2,3,4])
print(ser1)
print(ser1[0:2])
print(ser1[[0,1,2]])


0    1
1    2
2    3
3    4
dtype: int64
0    1
1    2
dtype: int64
0    1
1    2
2    3
dtype: int64

In [2]:
sdata = {'python' : 1,  'pandas' : 2,  'matplotlib' : 3,  'numpy' : 4,  'scipy' : 5}
ser2 = Series(sdata)
print(ser2)
print(sdata.keys())
print(sdata.items())
print(sdata.values())
'python' in sdata.keys()


matplotlib    3
numpy         4
pandas        2
python        1
scipy         5
dtype: int64
dict_keys(['python', 'pandas', 'scipy', 'numpy', 'matplotlib'])
dict_items([('python', 1), ('pandas', 2), ('scipy', 5), ('numpy', 4), ('matplotlib', 3)])
dict_values([1, 2, 5, 4, 3])
Out[2]:
True

In [3]:
ser3 = Series(sdata, index = ['apple', 'python', 'numpy', 'jupyter', 'scipy'])
print(ser3)
print('index:', ser3.index)
print('value:', ser3.values)
print(ser3[0:2])
print(ser3['apple':'numpy'])
print(ser3[['apple', 'numpy']])
ser3.name = 'try'
print(ser3)


apple      NaN
python     1.0
numpy      4.0
jupyter    NaN
scipy      5.0
dtype: float64
index: Index(['apple', 'python', 'numpy', 'jupyter', 'scipy'], dtype='object')
value: [ nan   1.   4.  nan   5.]
apple     NaN
python    1.0
dtype: float64
apple     NaN
python    1.0
numpy     4.0
dtype: float64
apple    NaN
numpy    4.0
dtype: float64
apple      NaN
python     1.0
numpy      4.0
jupyter    NaN
scipy      5.0
Name: try, dtype: float64

In [4]:
data = {'packages':['pandas', 'numpy', 'matplotlib', 'pandas', 'pandas'],
        'id':[1, 2, 3, 4, 5],
        'num':[1.2, 3.4, 4, 5.6, 7.0]}
frame = DataFrame(data)
#DataFrame(data, columns=['id', 'num', 'packages'], index = ['A', 'B', 'C', 'D', 'E'])
#frame.sort_values('num', ascending = False)
#print(frame)
#print(frame[['id','num']])
#print(frame.loc[0:3,:])
#print(frame.loc[frame.id<4,:])
#print(type(frame.id))
#print(type(frame.loc[:,'id']))
frame.describe


Out[4]:
<bound method NDFrame.describe of    id  num    packages
0   1  1.2      pandas
1   2  3.4       numpy
2   3  4.0  matplotlib
3   4  5.6      pandas
4   5  7.0      pandas>

In [5]:
data = DataFrame(np.arange(16).reshape(4,4))
data[(data/2-1)%3==0] = np.nan
print(data)

#丢弃na
print(data.dropna())
print(data.dropna(how = 'all'))
print(data.dropna(axis = 1))

#填充na
print(data.fillna(0))
print(data.fillna({0:1, 2:10}))
print(data.fillna(method = 'bfill', limit = 1))
print(data.fillna(method = 'ffill', limit = 1))


      0   1     2   3
0   0.0   1   NaN   3
1   4.0   5   6.0   7
2   NaN   9  10.0  11
3  12.0  13   NaN  15
     0  1    2  3
1  4.0  5  6.0  7
      0   1     2   3
0   0.0   1   NaN   3
1   4.0   5   6.0   7
2   NaN   9  10.0  11
3  12.0  13   NaN  15
    1   3
0   1   3
1   5   7
2   9  11
3  13  15
      0   1     2   3
0   0.0   1   0.0   3
1   4.0   5   6.0   7
2   0.0   9  10.0  11
3  12.0  13   0.0  15
      0   1     2   3
0   0.0   1  10.0   3
1   4.0   5   6.0   7
2   1.0   9  10.0  11
3  12.0  13  10.0  15
      0   1     2   3
0   0.0   1   6.0   3
1   4.0   5   6.0   7
2  12.0   9  10.0  11
3  12.0  13   NaN  15
      0   1     2   3
0   0.0   1   NaN   3
1   4.0   5   6.0   7
2   4.0   9  10.0  11
3  12.0  13  10.0  15

In [6]:
frame = DataFrame(np.arange(16).reshape(4,4), 
                 index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                 columns = [['pandas', 'pandas', 'numpy', 'numpy'],
                           ['Series', 'DataFrame', 'array', 'array']])
print(frame)
print(frame['numpy'])
print(frame.loc[:,'numpy'].loc[:,'array'])
print(frame.unstack()) #关于索引展开


    pandas           numpy      
    Series DataFrame array array
a 1      0         1     2     3
  2      4         5     6     7
b 1      8         9    10    11
  2     12        13    14    15
     array  array
a 1      2      3
  2      6      7
b 1     10     11
  2     14     15
     array  array
a 1      2      3
  2      6      7
b 1     10     11
  2     14     15
  pandas                   numpy            
  Series     DataFrame     array            
       1   2         1   2     1   2   1   2
a      0   4         1   5     2   6   3   7
b      8  12         9  13    10  14  11  15

In [7]:
df = DataFrame({'key':['b', 'b', 'a', 'c', 'a', 'b'], 'data1':range(6)})
dummies = pd.get_dummies(df['key'], prefix = 'key')
df_with_dummy = df[['data1']].join(dummies)
#print(df_with_dummy)

# 结合使用dummy和cut
df = DataFrame({'name':['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], 'score':np.arange(10)})
#print(df)
dummies = pd.get_dummies(pd.cut(df['score'], [0, 3, 6, 10]))
df_with_dummy = df[['name']].join(dummies)
print(df_with_dummy)


  name  (0, 3]  (3, 6]  (6, 10]
0    a     0.0     0.0      0.0
1    b     1.0     0.0      0.0
2    c     1.0     0.0      0.0
3    d     1.0     0.0      0.0
4    e     0.0     1.0      0.0
5    f     0.0     1.0      0.0
6    g     0.0     1.0      0.0
7    h     0.0     0.0      1.0
8    i     0.0     0.0      1.0
9    j     0.0     0.0      1.0

In [8]:
# 数据框一个单元格放置列表

df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})
df.set_value(0, 'key1', ['a', 'b'])
print(df)


      data1     data2    key1 key2
0  1.710266 -1.670090  [a, b]  one
1  0.997657  3.048344       a  two
2  1.195870  0.312211       b  one
3  1.606072  2.942407       b  two
4  0.413874  0.278038       a  one

In [2]:
# groupby对数据进行分组

df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})

df['data1'].groupby(df['key1']).mean()
df['data1'].groupby([df['key1'], df['key2']]).min()

# 语法糖

df.groupby('key1')[['data1']].max()
df.groupby(['key1', 'key2'])[['data1']].count()

# 函数: 
# + min, max, first, last 
# + sum, mean, count, prop 
# + size, median, std, var


-----------------------------------------------------------------------
NameError                             Traceback (most recent call last)
<ipython-input-2-52f68c9d87bf> in <module>()
      1 # groupby对数据进行分组
      2 
----> 3 df = DataFrame({'key1':['a','a','b','b','a'],
      4                 'key2':['one','two','one','two','one'],
      5                 'data1':np.random.randn(5),

NameError: name 'DataFrame' is not defined

In [4]:
df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})

for name,group in df.groupby('key1'):
    print(name)
    print(group)
    
for (key1, key2),group in df.groupby(['key1', 'key2']):
    print(key1, key2)
    print(group)


a
      data1     data2 key1 key2
0 -0.301597 -0.576311    a  one
1  1.196534 -0.461711    a  two
4  1.374061 -1.194680    a  one
b
      data1     data2 key1 key2
2  0.200715  0.195816    b  one
3  0.335171  0.969410    b  two
a one
      data1     data2 key1 key2
0 -0.301597 -0.576311    a  one
4  1.374061 -1.194680    a  one
a two
      data1     data2 key1 key2
1  1.196534 -0.461711    a  two
b one
      data1     data2 key1 key2
2  0.200715  0.195816    b  one
b two
      data1    data2 key1 key2
3  0.335171  0.96941    b  two

In [11]:
# 对列进行分组

people = DataFrame(np.random.randn(5,5),
                   columns = ['a','b','c','d','e'],
                   index = ['Joe','Steve','Wes','Jim','Travis'])

people.loc[2:3,['b','c']] = np.nan
#print(people)
mapping = {'a':'red','b':'red','c':'blue',
           'd':'blue','e':'red','f':'orange'}
people.groupby(mapping, axis=1).sum()
people.groupby(len).sum()

key_list = ['one','one','one','two','two']
people.groupby([len, key_list]).sum()


Out[11]:
a b c d e
3 one 1.619607 1.016793 1.802532 -1.415808 -2.470669
two 0.232754 0.833052 0.403825 -0.768159 0.254402
5 one 0.768526 -1.839609 -0.515561 -1.760685 0.441773
6 two 0.521294 -0.574520 -0.118423 0.771838 1.244679

In [12]:
# 分层索引的聚合

# 方法1:

columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'CN', 'CN'],[1,3,5,1,3]],names=['cty','tenor'])
hier_df = DataFrame(np.random.randn(4,5), columns = columns)

# 方法2:

hier_df = DataFrame(np.random.randn(4,5), columns = [['US','US','US','CN','CN'],[1,3,5,1,3]])
hier_df.columns.names = ['city', 'tenor']
#hier_df.index.names = ['name1', 'name2]
print(hier_df)

hier_df.groupby(level='city',axis=1).count()


city         US                            CN          
tenor         1         3         5         1         3
0      0.582234  0.577590  1.330080 -0.456415  0.937492
1     -0.130928  2.706319 -0.967555  2.631627 -0.617691
2     -0.703914  0.801095  0.812124 -0.793731  0.364345
3      0.226949 -0.889552  1.222218 -0.377916 -0.806176
Out[12]:
city CN US
0 2 3
1 2 3
2 2 3
3 2 3

In [13]:
# agg应用自定义函数 

# 自定义极差函数
def peak_to_peak(arr):
    result = arr.max() - arr.min()
    return result

tips = pd.read_csv('tips.csv')

# 添加“小费占总额百分比”的列
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]

# 应用自定义函数
tips.groupby(['sex','smoker'])[['tip_pct']].agg(peak_to_peak)

# 对多列应用多种函数
tips.groupby(['sex','smoker'])[['tip','tip_pct']].agg(['mean','max',peak_to_peak])


Out[13]:
tip tip_pct
mean max peak_to_peak mean max peak_to_peak
sex smoker
Female No 2.914000 5.20 4.19 0.155622 0.252672 0.195876
Yes 3.046667 5.00 4.00 0.213473 0.325733 0.208830
Male No 3.112642 7.58 6.33 0.160993 0.228050 0.156246
Yes 2.888000 5.00 3.24 0.154222 0.205577 0.126650

In [20]:
# 分组级运算和转换

df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})

k1_means = df.groupby('key1').mean().add_prefix('mean_')
pd.merge(df, k1_means, left_on='key1', right_index = True)
df.groupby('key1').transform(np.mean) # 同 group_by %>% mutate %>% ungroup


Out[20]:
data1 data2
0 0.069861 0.06751
1 0.069861 0.06751
2 0.191093 -0.06703
3 0.191093 -0.06703
4 0.069861 0.06751

In [25]:
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]

top(tips,n=6)

# 分组应用函数
tips.groupby('smoker').apply(top)

# 分组应用函数并传参
tips.groupby(['sex','smoker']).apply(top, n = 2, column = 'total_bill')


Out[25]:
total_bill tip sex smoker day time size tip_pct
sex smoker
Female No 85 34.83 5.17 Female No Thur Lunch 4 0.148435
11 35.26 5.00 Female No Sun Dinner 4 0.141804
Yes 73 25.28 5.00 Female Yes Sat Dinner 2 0.197785
72 26.86 3.14 Female Yes Sat Dinner 2 0.116902
Male No 23 39.42 7.58 Male No Sat Dinner 4 0.192288
59 48.27 6.73 Male No Sat Dinner 4 0.139424
Yes 83 32.68 5.00 Male Yes Thur Lunch 2 0.152999
56 38.01 3.00 Male Yes Sat Dinner 4 0.078927

In [34]:
# 分位数

frame=DataFrame({'data1':np.random.randn(1000),
                 'data2':np.random.randn(1000)})

factor = pd.cut(frame.data1, 4)
factor[:10]

def get_status(group):
    return {
        'min':group.min(),
        'max':group.max(),
        'count':group.count(),
        'mean':group.mean()
    }
frame.data2.groupby(factor).apply(get_status).unstack()


Out[34]:
count max mean min
data1
(-3.441, -1.82] 35.0 2.674004 -0.106755 -2.363754
(-1.82, -0.205] 400.0 3.227679 0.009128 -2.788863
(-0.205, 1.409] 479.0 3.147800 -0.001439 -2.961218
(1.409, 3.0239] 86.0 2.439445 -0.052254 -2.709633

In [36]:
# 透视表

tips.pivot_table(index = ['sex','smoker'])


Out[36]:
size tip tip_pct total_bill
sex smoker
Female No 2.650000 2.914000 0.155622 19.601000
Yes 1.666667 3.046667 0.213473 18.403333
Male No 2.641509 3.112642 0.160993 19.440000
Yes 2.400000 2.888000 0.154222 19.771000

In [39]:
tips.pivot_table(['tip_pct','size'], index = ['sex','day'], columns='smoker')

# 加入分项小计, 默认 aggfunc 为 mean
tips.pivot_table(['tip_pct','size'], index = ['sex','day'], columns='smoker', margins=True)


Out[39]:
tip_pct size
smoker No Yes All No Yes All
sex day
Female Sat 0.146062 0.213473 0.161618 2.400000 1.666667 2.230769
Sun 0.165208 NaN 0.165208 3.000000 NaN 3.000000
Thur 0.165082 NaN 0.165082 2.500000 NaN 2.500000
Male Sat 0.153424 0.154363 0.153674 2.727273 2.500000 2.666667
Sun 0.168870 NaN 0.168870 2.615385 NaN 2.615385
Thur 0.153344 0.153660 0.153435 2.400000 2.000000 2.285714
All 0.159522 0.167896 0.160788 2.643836 2.230769 2.581395

In [42]:
tips.groupby(['sex','day'])['tip_pct','size'].mean() 
tips.pivot_table('tip_pct', index = ['sex', 'smoker'], columns='day', aggfunc=len, margins=True, fill_value=0)


Out[42]:
day Sat Sun Thur All
sex smoker
Female No 10.0 8.0 2.0 20.0
Yes 3.0 0.0 0.0 3.0
Male No 22.0 26.0 5.0 53.0
Yes 8.0 0.0 2.0 10.0
All 43.0 34.0 9.0 86.0

In [44]:
# 交叉表

pd.crosstab(tips.sex, tips.smoker, margins=True)
pd.crosstab([tips.sex, tips.smoker], tips.day, margins=True)


Out[44]:
day Sat Sun Thur All
sex smoker
Female No 10 8 2 20
Yes 3 0 0 3
Male No 22 26 5 53
Yes 8 0 2 10
All 43 34 9 86

In [6]:
# 读写文件

df.to_excel('try.xlsx', sheet_name = 'Sheet1')
frame = pd.read_excel('try.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
frame


Out[6]:
data1 data2 key1 key2
0 -0.301597 -0.576311 a one
1 1.196534 -0.461711 a two
2 0.200715 0.195816 b one
3 0.335171 0.969410 b two
4 1.374061 -1.194680 a one