notebook.community

Edit and run



In [ ]:

    
import pandas as pd
import numpy as np



In [ ]:

    
# Outlier detection and filtering.
randframe = pd.DataFrame(np.random.random((1000, 3)))
print(randframe)
# describe() is not the last expression of the cell, so it has to be printed
# explicitly or its summary is silently discarded.
print(randframe.describe())
print(randframe.std())

# Filter the DataFrame using the standard deviation of each column: keep the
# rows in which at least one value exceeds three times its column's standard
# deviation. any(axis=1) evaluates the condition row by row; `axis` is passed
# by keyword because pandas 2.0+ no longer accepts it positionally.
outliers = randframe[(np.abs(randframe) > (3 * randframe.std())).any(axis=1)]
outliers



In [ ]:

    
# Reordering rows.
# numpy.random.permutation() returns the positions 0..n-1 in random order;
# handing that sequence to take() rearranges the rows of a Series or DataFrame.
nframe = pd.DataFrame(np.arange(25).reshape(5, 5))
nframe
new_order = np.random.permutation(5)  # shuffled row positions
new_order
nframe.take(new_order)  # take() picks rows by position

# take() works just as well with an explicit subset of positions.
new_order = [3, 4, 2]
nframe.take(new_order)



In [ ]:

    
# Random sampling: draw three row positions between 0 and len(nframe) - 1
# (each drawn independently) and fetch the corresponding rows.
sample = np.random.randint(low=0, high=len(nframe), size=3)
sample
nframe.take(sample)



In [ ]:

    
# String handling: break the address apart at the comma.
text = '16 Bolton Avenue , Boston'
text.split(sep=',')



In [ ]:

    
# Split at the comma and trim the whitespace around every piece.
tokens = list(map(str.strip, text.split(',')))
tokens



In [ ]:

    
# Unpack the two trimmed pieces straight into named variables.
address, city = (piece.strip() for piece in text.split(','))
address
city



In [ ]:

    
# A more practical way to concatenate: join() glues the list items together
# with the separator it is called on.
strings = ['A+', 'A', 'A-', 'B', 'BB', 'BBB', 'C+']
separator = ';'
separator.join(strings)



In [ ]:

    
# Locating a substring: index() and find() both give the offset of its first
# occurrence in the text.
needle = 'Boston'
text.index(needle)
text.find(needle)



In [ ]:

    
# index() raises an error when the substring is absent, so that call stays
# commented out:
#     text.index('New York')
# find() signals the same situation by returning -1 instead.
text.find('New York')



In [ ]:

    
# Count how often a single character, or a longer substring, occurs in the text.
text.count('e')       # a single character
text.count('Avenue')  # a whole word



In [ ]:

    
# replace() returns a copy of the text with one substring swapped for another;
# an empty replacement simply deletes every occurrence.
text.replace('Avenue', 'Street')
text.replace('1', '')



In [ ]:

    
import re
# Regular expressions.
# \s+ matches a run of one or more whitespace characters. The pattern is a raw
# string so that the backslash reaches the regex engine untouched; in a normal
# string literal '\s' is an invalid escape sequence that Python warns about.
text = "This is     an\t odd \n text!"
re.split(r'\s+', text)



In [ ]:

    
# Find every word that starts with a capital "A". The pattern is a raw string
# so that \w is not treated as an (invalid) string escape sequence.
text = "This is my address: 16 Bolton Avenue, Boston"
re.findall(r'A\w+', text)



In [ ]:

    
# Find every word starting with "A" or "a". Inside a character class the
# letters are listed without separators: a comma there would itself become a
# character to match. Raw string keeps \w intact for the regex engine.
re.findall(r'[Aa]\w+', text)



In [ ]:

    
# search() returns a match object for the first word starting with "A" or "a";
# start() and end() give the offsets of that match inside the text.
# The character class lists only the two letters (a comma inside the brackets
# would be matched literally) and the pattern is a raw string.
ret = re.search(r'[Aa]\w+', text)
ret.start()
ret.end()



In [ ]:

    
# match() only tries the pattern at the very beginning of the text, so it
# returns None unless the text itself starts with "A" or "a".
# Raw-string pattern; the character class holds just the two letters.
re.match(r'[Aa]\w+', text)



In [ ]:

    
# Match a word starting with "T" at the beginning of the text, then slice the
# matched portion out using the offsets reported by the match object.
# Raw-string pattern so that \w is not read as a string escape sequence.
match = re.match(r'T\w+', text)
text[match.start():match.end()]



In [37]:

    
# Sample data: one row per item, with its colour, kind and two prices.
frame = pd.DataFrame(
    [
        ('white', 'pen', 5.56, 4.75),
        ('red', 'pencil', 4.20, 4.12),
        ('green', 'pencil', 1.30, 1.60),
        ('red', 'ashtray', 0.56, 0.75),
        ('green', 'pen', 2.75, 3.15),
    ],
    columns=['color', 'object', 'price1', 'price2'],
)
frame









    Out[37]:







  
    
      
      color
      object
      price1
      price2
    
  
  
    
      0
      white
      pen
      5.56
      4.75
    
    
      1
      red
      pencil
      4.20
      4.12
    
    
      2
      green
      pencil
      1.30
      1.60
    
    
      3
      red
      ashtray
      0.56
      0.75
    
    
      4
      green
      pen
      2.75
      3.15



In [38]:

    
# Step 1: grouping. Split the price1 values according to the colour of each row.
color_key = frame['color']
group = frame['price1'].groupby(color_key)
group.groups









    Out[38]:





{'green': Int64Index([2, 4], dtype='int64'),
 'red': Int64Index([1, 3], dtype='int64'),
 'white': Int64Index([0], dtype='int64')}



In [42]:

    
# Step 2: apply a function to every group.
group.mean()
group.sum()
# Flag the colours that occur more than once, as a one-column DataFrame.
new = group.count().gt(1).to_frame()
new









    Out[42]:







  
    
      
      price1
    
    
      color
      
    
  
  
    
      green
      True
    
    
      red
      True
    
    
      white
      False



In [ ]:

    
# Hierarchical grouping: group price1 by colour first, then by object.
group_keys = [frame['color'], frame['object']]
ggroup = frame['price1'].groupby(group_keys)
ggroup.groups



In [ ]:

    
# Average of both price columns for each colour.
prices = frame[['price1', 'price2']]
prices.groupby(frame['color']).mean()



In [ ]:

    
# Per-colour mean of the whole frame. The frame also holds the text column
# 'object', which cannot be averaged: pandas 2.0+ raises a TypeError instead of
# silently dropping it, so the reduction is restricted to numeric columns.
frame.groupby(frame['color']).mean(numeric_only=True)



In [ ]:

    
# Group iteration.
# A GroupBy object also supports iteration: it yields a sequence of tuples,
# each made of a group's name and the rows belonging to that group.
# The loop variable is deliberately not called `group`, so the notebook-level
# `group` GroupBy object from an earlier cell is not replaced by a DataFrame.
for name, members in frame.groupby('color'):
    print(name)
    print(members)



In [ ]:

    
# Grouping a single column and reducing it; type() shows what kind of object
# comes back.
price1_by_color = frame['price1'].groupby(frame['color'])
result1 = price1_by_color.mean()
type(result1)



In [ ]:

    
# Grouping the whole frame and reducing it; type() shows what kind of object
# comes back. numeric_only=True skips the text column 'object', which pandas
# 2.0+ would otherwise refuse to average (TypeError).
result2 = frame.groupby(frame['color']).mean(numeric_only=True)
type(result2)



In [ ]:

    
# Select the column on the GroupBy object first, then compute the mean.
by_color = frame.groupby(frame['color'])
by_color['price1'].mean()



In [ ]:

    
# Compute the per-colour means first, then pick the price1 column from the
# result. numeric_only=True keeps the text column 'object' out of the mean,
# which pandas 2.0+ would otherwise reject with a TypeError.
frame.groupby(frame['color']).mean(numeric_only=True)['price1']



In [ ]:

    
# Per-colour means with every resulting column name prefixed by 'mean_'.
# numeric_only=True restricts the mean to the numeric columns; the frame also
# holds the text column 'object', which pandas 2.0+ will not average.
means = frame.groupby('color').mean(numeric_only=True).add_prefix('mean_')
means



In [ ]:

    
# Functions on groups.
group = frame.groupby('color')
# 0.6 quantile of price1 inside every colour group.
group['price1'].quantile(q=0.6)



In [ ]:

    
def range(series):
    """Return the spread of ``series``: its maximum minus its minimum.

    NOTE: this name shadows Python's built-in ``range`` for the rest of the
    notebook session.
    """
    highest = series.max()
    lowest = series.min()
    return highest - lowest

# Aggregate price1 in every group with the custom range() function.
price1_groups = group['price1']
price1_groups.aggregate(range)



In [ ]:

    
# Several aggregation functions at once: built-in ones are named by string,
# the custom range() function is passed as an object.
aggregations = ['mean', 'std', range]
group['price1'].aggregate(aggregations)



In [ ]:

    
# Rebuild the sample data without the 'object' column.
frame = pd.DataFrame(
    [
        ('white', 5.56, 4.75),
        ('red', 4.20, 4.12),
        ('green', 1.30, 1.60),
        ('red', 0.56, 0.75),
        ('green', 2.75, 3.15),
    ],
    columns=['color', 'price1', 'price2'],
)
frame
# Per-colour totals; every resulting column gets the prefix 'tot_'.
totals_by_color = frame.groupby('color').sum()
sums = totals_by_color.add_prefix('tot_')
sums



In [ ]:

    
# Attach the per-colour totals to every row: the 'color' column of frame is
# matched against the index of sums.
frame.merge(sums, left_on='color', right_index=True)



In [ ]:

    
# transform() broadcasts each group's total back onto the rows of that group,
# giving the per-row totals without a separate merge step. The aggregation is
# named by the string 'sum': handing the NumPy function to transform() is
# deprecated in recent pandas releases and triggers a FutureWarning.
frame.groupby('color').transform('sum').add_prefix('tot_')



In [ ]:

    
# New sample data: a colour, a status flag and two measurements per row.
frame = pd.DataFrame(
    [
        ('white', 'up', 12.33, 11.33),
        ('black', 'up', 14.55, 31.80),
        ('white', 'down', 22.34, 29.99),
        ('white', 'down', 34.27, 31.18),
        ('black', 'down', 23.40, 18.25),
        ('black', 'up', 18.33, 22.44),
    ],
    columns=['color', 'status', 'value1', 'value2'],
)
frame



In [ ]:

    
# Group by colour and status, then take the column-wise maximum of every
# group through apply().
by_color_status = frame.groupby(['color', 'status'])
by_color_status.apply(lambda part: part.max())



In [ ]:

    
# rename() relabels rows and columns through old-label -> new-label mappings.
# The two mappings were never defined anywhere in the notebook, so the cell
# failed with a NameError on a fresh kernel.
# NOTE(review): the labels below are illustrative stand-ins chosen to fit the
# current frame; replace them with the originally intended ones if they differ.
reindex = {0: 'first', 1: 'second', 2: 'third',
           3: 'fourth', 4: 'fifth', 5: 'sixth'}
recolumn = {'value1': 'price1', 'value2': 'price2'}
frame.rename(index=reindex, columns=recolumn)



In [ ]:

    
# Ten consecutive hourly timestamps beginning on 1 January 2015.
temp = pd.date_range(start='1/1/2015', periods=10, freq='h')
temp



In [ ]:

    
# A Series of ten random values indexed by the hourly timestamps.
timeseries = pd.Series(data=np.random.rand(10), index=temp)
timeseries



In [ ]:

    
# A DataFrame holding the timestamps as a regular column next to two columns
# of random values (value1 is drawn first, then value2).
first_values = np.random.rand(10)
second_values = np.random.rand(10)
timetable = pd.DataFrame({
    'date': temp,
    'value1': first_values,
    'value2': second_values,
})
timetable



In [ ]:

    
# Add a reference column that assigns a category label to every row.
categories = ['up', 'down', 'left', 'left', 'up',
              'up', 'down', 'right', 'right', 'up']
timetable['cat'] = categories
timetable



In [36]:









    



---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-36-f6171da2c842> in <module>()
----> 1 import md5
      2 md5

ModuleNotFoundError: No module named 'md5'



In [ ]:

	color	object	price1	price2
0	white	pen	5.56	4.75
1	red	pencil	4.20	4.12
2	green	pencil	1.30	1.60
3	red	ashtray	0.56	0.75
4	green	pen	2.75	3.15