notebook.community

Edit and run



In [3]:

    
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

ser1 = Series([1,2,3,4])
print(ser1)
print(ser1[0:2])
print(ser1[[0,1,2]])









    



0    1
1    2
2    3
3    4
dtype: int64
0    1
1    2
dtype: int64
0    1
1    2
2    3
dtype: int64



In [2]:

    
sdata = {'python' : 1,  'pandas' : 2,  'matplotlib' : 3,  'numpy' : 4,  'scipy' : 5}
ser2 = Series(sdata)
print(ser2)
print(sdata.keys())
print(sdata.items())
print(sdata.values())
'python' in sdata.keys()









    



matplotlib    3
numpy         4
pandas        2
python        1
scipy         5
dtype: int64
dict_keys(['python', 'pandas', 'scipy', 'numpy', 'matplotlib'])
dict_items([('python', 1), ('pandas', 2), ('scipy', 5), ('numpy', 4), ('matplotlib', 3)])
dict_values([1, 2, 5, 4, 3])






    Out[2]:





True



In [3]:

    
ser3 = Series(sdata, index = ['apple', 'python', 'numpy', 'jupyter', 'scipy'])
print(ser3)
print('index:', ser3.index)
print('value:', ser3.values)
print(ser3[0:2])
print(ser3['apple':'numpy'])
print(ser3[['apple', 'numpy']])
ser3.name = 'try'
print(ser3)









    



apple      NaN
python     1.0
numpy      4.0
jupyter    NaN
scipy      5.0
dtype: float64
index: Index(['apple', 'python', 'numpy', 'jupyter', 'scipy'], dtype='object')
value: [ nan   1.   4.  nan   5.]
apple     NaN
python    1.0
dtype: float64
apple     NaN
python    1.0
numpy     4.0
dtype: float64
apple    NaN
numpy    4.0
dtype: float64
apple      NaN
python     1.0
numpy      4.0
jupyter    NaN
scipy      5.0
Name: try, dtype: float64



In [4]:

    
data = {'packages':['pandas', 'numpy', 'matplotlib', 'pandas', 'pandas'],
        'id':[1, 2, 3, 4, 5],
        'num':[1.2, 3.4, 4, 5.6, 7.0]}
frame = DataFrame(data)
#DataFrame(data, columns=['id', 'num', 'packages'], index = ['A', 'B', 'C', 'D', 'E'])
#frame.sort_values('num', ascending = False)
#print(frame)
#print(frame[['id','num']])
#print(frame.loc[0:3,:])
#print(frame.loc[frame.id<4,:])
#print(type(frame.id))
#print(type(frame.loc[:,'id']))
frame.describe









    Out[4]:





<bound method NDFrame.describe of    id  num    packages
0   1  1.2      pandas
1   2  3.4       numpy
2   3  4.0  matplotlib
3   4  5.6      pandas
4   5  7.0      pandas>



In [5]:

    
data = DataFrame(np.arange(16).reshape(4,4))
data[(data/2-1)%3==0] = np.nan
print(data)

#丢弃na
print(data.dropna())
print(data.dropna(how = 'all'))
print(data.dropna(axis = 1))

#填充na
print(data.fillna(0))
print(data.fillna({0:1, 2:10}))
print(data.fillna(method = 'bfill', limit = 1))
print(data.fillna(method = 'ffill', limit = 1))









    



      0   1     2   3
0   0.0   1   NaN   3
1   4.0   5   6.0   7
2   NaN   9  10.0  11
3  12.0  13   NaN  15
     0  1    2  3
1  4.0  5  6.0  7
      0   1     2   3
0   0.0   1   NaN   3
1   4.0   5   6.0   7
2   NaN   9  10.0  11
3  12.0  13   NaN  15
    1   3
0   1   3
1   5   7
2   9  11
3  13  15
      0   1     2   3
0   0.0   1   0.0   3
1   4.0   5   6.0   7
2   0.0   9  10.0  11
3  12.0  13   0.0  15
      0   1     2   3
0   0.0   1  10.0   3
1   4.0   5   6.0   7
2   1.0   9  10.0  11
3  12.0  13  10.0  15
      0   1     2   3
0   0.0   1   6.0   3
1   4.0   5   6.0   7
2  12.0   9  10.0  11
3  12.0  13   NaN  15
      0   1     2   3
0   0.0   1   NaN   3
1   4.0   5   6.0   7
2   4.0   9  10.0  11
3  12.0  13  10.0  15



In [6]:

    
frame = DataFrame(np.arange(16).reshape(4,4), 
                 index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                 columns = [['pandas', 'pandas', 'numpy', 'numpy'],
                           ['Series', 'DataFrame', 'array', 'array']])
print(frame)
print(frame['numpy'])
print(frame.loc[:,'numpy'].loc[:,'array'])
print(frame.unstack()) #关于索引展开









    



    pandas           numpy      
    Series DataFrame array array
a 1      0         1     2     3
  2      4         5     6     7
b 1      8         9    10    11
  2     12        13    14    15
     array  array
a 1      2      3
  2      6      7
b 1     10     11
  2     14     15
     array  array
a 1      2      3
  2      6      7
b 1     10     11
  2     14     15
  pandas                   numpy            
  Series     DataFrame     array            
       1   2         1   2     1   2   1   2
a      0   4         1   5     2   6   3   7
b      8  12         9  13    10  14  11  15



In [7]:

    
df = DataFrame({'key':['b', 'b', 'a', 'c', 'a', 'b'], 'data1':range(6)})
dummies = pd.get_dummies(df['key'], prefix = 'key')
df_with_dummy = df[['data1']].join(dummies)
#print(df_with_dummy)

# 结合使用dummy和cut
df = DataFrame({'name':['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], 'score':np.arange(10)})
#print(df)
dummies = pd.get_dummies(pd.cut(df['score'], [0, 3, 6, 10]))
df_with_dummy = df[['name']].join(dummies)
print(df_with_dummy)









    



  name  (0, 3]  (3, 6]  (6, 10]
0    a     0.0     0.0      0.0
1    b     1.0     0.0      0.0
2    c     1.0     0.0      0.0
3    d     1.0     0.0      0.0
4    e     0.0     1.0      0.0
5    f     0.0     1.0      0.0
6    g     0.0     1.0      0.0
7    h     0.0     0.0      1.0
8    i     0.0     0.0      1.0
9    j     0.0     0.0      1.0



In [8]:

    
# 数据框一个单元格放置列表

df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})
df.set_value(0, 'key1', ['a', 'b'])
print(df)









    



      data1     data2    key1 key2
0  1.710266 -1.670090  [a, b]  one
1  0.997657  3.048344       a  two
2  1.195870  0.312211       b  one
3  1.606072  2.942407       b  two
4  0.413874  0.278038       a  one



In [2]:

    
# groupby对数据进行分组

df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})

df['data1'].groupby(df['key1']).mean()
df['data1'].groupby([df['key1'], df['key2']]).min()

# 语法糖

df.groupby('key1')[['data1']].max()
df.groupby(['key1', 'key2'])[['data1']].count()

# 函数： 
# + min, max, first, last 
# + sum, mean, count, prop 
# + size, median, std, var









    



-----------------------------------------------------------------------
NameError                             Traceback (most recent call last)
<ipython-input-2-52f68c9d87bf> in <module>()
      1 # groupby对数据进行分组
      2 
----> 3 df = DataFrame({'key1':['a','a','b','b','a'],
      4                 'key2':['one','two','one','two','one'],
      5                 'data1':np.random.randn(5),

NameError: name 'DataFrame' is not defined



In [4]:

    
df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})

for name,group in df.groupby('key1'):
    print(name)
    print(group)
    
for (key1, key2),group in df.groupby(['key1', 'key2']):
    print(key1, key2)
    print(group)









    



a
      data1     data2 key1 key2
0 -0.301597 -0.576311    a  one
1  1.196534 -0.461711    a  two
4  1.374061 -1.194680    a  one
b
      data1     data2 key1 key2
2  0.200715  0.195816    b  one
3  0.335171  0.969410    b  two
a one
      data1     data2 key1 key2
0 -0.301597 -0.576311    a  one
4  1.374061 -1.194680    a  one
a two
      data1     data2 key1 key2
1  1.196534 -0.461711    a  two
b one
      data1     data2 key1 key2
2  0.200715  0.195816    b  one
b two
      data1    data2 key1 key2
3  0.335171  0.96941    b  two



In [11]:

    
# 对列进行分组

people = DataFrame(np.random.randn(5,5),
                   columns = ['a','b','c','d','e'],
                   index = ['Joe','Steve','Wes','Jim','Travis'])

people.loc[2:3,['b','c']] = np.nan
#print(people)
mapping = {'a':'red','b':'red','c':'blue',
           'd':'blue','e':'red','f':'orange'}
people.groupby(mapping, axis=1).sum()
people.groupby(len).sum()

key_list = ['one','one','one','two','two']
people.groupby([len, key_list]).sum()



In [12]:

    
# 分层索引的聚合

# 方法1：

columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'CN', 'CN'],[1,3,5,1,3]],names=['cty','tenor'])
hier_df = DataFrame(np.random.randn(4,5), columns = columns)

# 方法2：

hier_df = DataFrame(np.random.randn(4,5), columns = [['US','US','US','CN','CN'],[1,3,5,1,3]])
hier_df.columns.names = ['city', 'tenor']
#hier_df.index.names = ['name1', 'name2]
print(hier_df)

hier_df.groupby(level='city',axis=1).count()









    



city         US                            CN          
tenor         1         3         5         1         3
0      0.582234  0.577590  1.330080 -0.456415  0.937492
1     -0.130928  2.706319 -0.967555  2.631627 -0.617691
2     -0.703914  0.801095  0.812124 -0.793731  0.364345
3      0.226949 -0.889552  1.222218 -0.377916 -0.806176






    Out[12]:






  
    
      city
      CN
      US
    
  
  
    
      0
      2
      3
    
    
      1
      2
      3
    
    
      2
      2
      3
    
    
      3
      2
      3



In [13]:

    
# agg应用自定义函数 

# 自定义极差函数
def peak_to_peak(arr):
    result = arr.max() - arr.min()
    return result

tips = pd.read_csv('tips.csv')

# 添加“小费占总额百分比”的列
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]

# 应用自定义函数
tips.groupby(['sex','smoker'])[['tip_pct']].agg(peak_to_peak)

# 对多列应用多种函数
tips.groupby(['sex','smoker'])[['tip','tip_pct']].agg(['mean','max',peak_to_peak])









    Out[13]:






  
    
      
      
      tip
      tip_pct
    
    
      
      
      mean
      max
      peak_to_peak
      mean
      max
      peak_to_peak
    
    
      sex
      smoker
      
      
      
      
      
      
    
  
  
    
      Female
      No
      2.914000
      5.20
      4.19
      0.155622
      0.252672
      0.195876
    
    
      Yes
      3.046667
      5.00
      4.00
      0.213473
      0.325733
      0.208830
    
    
      Male
      No
      3.112642
      7.58
      6.33
      0.160993
      0.228050
      0.156246
    
    
      Yes
      2.888000
      5.00
      3.24
      0.154222
      0.205577
      0.126650



In [20]:

    
# 分组级运算和转换

df = DataFrame({'key1':['a','a','b','b','a'],
                'key2':['one','two','one','two','one'],
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)})

k1_means = df.groupby('key1').mean().add_prefix('mean_')
pd.merge(df, k1_means, left_on='key1', right_index = True)
df.groupby('key1').transform(np.mean) # 同 group_by %>% mutate %>% ungroup



In [25]:

    
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]

top(tips,n=6)

# 分组应用函数
tips.groupby('smoker').apply(top)

# 分组应用函数并传参
tips.groupby(['sex','smoker']).apply(top, n = 2, column = 'total_bill')



In [34]:

    
# 分位数

frame=DataFrame({'data1':np.random.randn(1000),
                 'data2':np.random.randn(1000)})

factor = pd.cut(frame.data1, 4)
factor[:10]

def get_status(group):
    return {
        'min':group.min(),
        'max':group.max(),
        'count':group.count(),
        'mean':group.mean()
    }
frame.data2.groupby(factor).apply(get_status).unstack()









    Out[34]:






  
    
      
      count
      max
      mean
      min
    
    
      data1
      
      
      
      
    
  
  
    
      (-3.441, -1.82]
      35.0
      2.674004
      -0.106755
      -2.363754
    
    
      (-1.82, -0.205]
      400.0
      3.227679
      0.009128
      -2.788863
    
    
      (-0.205, 1.409]
      479.0
      3.147800
      -0.001439
      -2.961218
    
    
      (1.409, 3.0239]
      86.0
      2.439445
      -0.052254
      -2.709633



In [36]:

    
# 透视表

tips.pivot_table(index = ['sex','smoker'])



In [39]:

    
tips.pivot_table(['tip_pct','size'], index = ['sex','day'], columns='smoker')

# 加入分项小计， 默认 aggfunc 为 mean
tips.pivot_table(['tip_pct','size'], index = ['sex','day'], columns='smoker', margins=True)



In [42]:

    
tips.groupby(['sex','day'])['tip_pct','size'].mean() 
tips.pivot_table('tip_pct', index = ['sex', 'smoker'], columns='day', aggfunc=len, margins=True, fill_value=0)



In [44]:

    
# 交叉表

pd.crosstab(tips.sex, tips.smoker, margins=True)
pd.crosstab([tips.sex, tips.smoker], tips.day, margins=True)



In [6]:

    
# 读写文件

df.to_excel('try.xlsx', sheet_name = 'Sheet1')
frame = pd.read_excel('try.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
frame

		a	b	c	d	e
3	one	1.619607	1.016793	1.802532	-1.415808	-2.470669
3	two	0.232754	0.833052	0.403825	-0.768159	0.254402
5	one	0.768526	-1.839609	-0.515561	-1.760685	0.441773
6	two	0.521294	-0.574520	-0.118423	0.771838	1.244679

	data1	data2
0	0.069861	0.06751
1	0.069861	0.06751
2	0.191093	-0.06703
3	0.191093	-0.06703
4	0.069861	0.06751

		tip_pct			size
	smoker	No	Yes	All	No	Yes	All
sex	day
Female	Sat	0.146062	0.213473	0.161618	2.400000	1.666667	2.230769
	Sun	0.165208	NaN	0.165208	3.000000	NaN	3.000000
	Thur	0.165082	NaN	0.165082	2.500000	NaN	2.500000
Male	Sat	0.153424	0.154363	0.153674	2.727273	2.500000	2.666667
	Sun	0.168870	NaN	0.168870	2.615385	NaN	2.615385
	Thur	0.153344	0.153660	0.153435	2.400000	2.000000	2.285714
All		0.159522	0.167896	0.160788	2.643836	2.230769	2.581395

		tip			tip_pct
		mean	max	peak_to_peak	mean	max	peak_to_peak
sex	smoker
Female	No	2.914000	5.20	4.19	0.155622	0.252672	0.195876
Female	Yes	3.046667	5.00	4.00	0.213473	0.325733	0.208830
Male	No	3.112642	7.58	6.33	0.160993	0.228050	0.156246
Male	Yes	2.888000	5.00	3.24	0.154222	0.205577	0.126650

			total_bill	tip	sex	smoker	day	time	size	tip_pct
sex	smoker
Female	No	85	34.83	5.17	Female	No	Thur	Lunch	4	0.148435
	No	11	35.26	5.00	Female	No	Sun	Dinner	4	0.141804
	Yes	73	25.28	5.00	Female	Yes	Sat	Dinner	2	0.197785
	Yes	72	26.86	3.14	Female	Yes	Sat	Dinner	2	0.116902
Male	No	23	39.42	7.58	Male	No	Sat	Dinner	4	0.192288
	No	59	48.27	6.73	Male	No	Sat	Dinner	4	0.139424
	Yes	83	32.68	5.00	Male	Yes	Thur	Lunch	2	0.152999
	Yes	56	38.01	3.00	Male	Yes	Sat	Dinner	4	0.078927

	count	max	mean	min
data1
(-3.441, -1.82]	35.0	2.674004	-0.106755	-2.363754
(-1.82, -0.205]	400.0	3.227679	0.009128	-2.788863
(-0.205, 1.409]	479.0	3.147800	-0.001439	-2.961218
(1.409, 3.0239]	86.0	2.439445	-0.052254	-2.709633

		size	tip	tip_pct	total_bill
sex	smoker
Female	No	2.650000	2.914000	0.155622	19.601000
Female	Yes	1.666667	3.046667	0.213473	18.403333
Male	No	2.641509	3.112642	0.160993	19.440000
Male	Yes	2.400000	2.888000	0.154222	19.771000

	day	Sat	Sun	Thur	All
sex	smoker
Female	No	10.0	8.0	2.0	20.0
Female	Yes	3.0	0.0	0.0	3.0
Male	No	22.0	26.0	5.0	53.0
Male	Yes	8.0	0.0	2.0	10.0
All		43.0	34.0	9.0	86.0

	data1	data2	key1	key2
0	-0.301597	-0.576311	a	one
1	1.196534	-0.461711	a	two
2	0.200715	0.195816	b	one
3	0.335171	0.969410	b	two
4	1.374061	-1.194680	a	one