In [ ]:
import pandas as pd
import numpy as np

In [ ]:
# Outlier detection and filtering
randframe = pd.DataFrame(np.random.random((1000, 3)))
print(randframe.head())
print(randframe.describe())
print(randframe.std())

# Flag every element lying more than three standard deviations away from the
# mean of its own column. Centering on the mean matters because
# np.random.random draws from [0, 1), which is not centered on zero.
deviation = (randframe - randframe.mean()).abs()
is_outlier = deviation > 3 * randframe.std()

# any(axis=1) keeps each row that holds at least one flagged element; the
# axis is passed by keyword because recent pandas rejects it positionally.
# For uniform data on [0, 1) no value can sit that far from the mean, so the
# result is expected to be empty.
outliers = randframe[is_outlier.any(axis=1)]
outliers

In [ ]:
# Reordering rows
# numpy.random.permutation() yields a random ordering of integers that can be
# used to rearrange the rows of a Series or DataFrame.
nframe = pd.DataFrame(np.arange(25).reshape(5, 5))
nframe
new_order = np.random.permutation(len(nframe))  # random row order
new_order
nframe.take(new_order)  # take() picks rows by position

# An explicit list of positions selects just those rows, in that order
new_order = [3, 4, 2]
nframe.take(new_order)

In [ ]:
# Random sampling: draw three row positions (repeats are possible) and take
# the corresponding rows
sample = np.random.randint(len(nframe), size=3)
sample
nframe.take(sample)

In [ ]:
# String handling: break the text apart at the comma
text = '16 Bolton Avenue , Boston'
text.split(sep=',')

In [ ]:
# Split at the comma and trim surrounding whitespace from every piece
tokens = list(map(str.strip, text.split(',')))
tokens

In [ ]:
# Unpack the two trimmed pieces straight into named variables
address, city = (part.strip() for part in text.split(','))
address
city

In [ ]:
# A more practical way to concatenate strings: join them with a separator
strings = ['A+', 'A', 'A-', 'B', 'BB', 'BBB', 'C+']
separator = ';'
separator.join(strings)

In [ ]:
# Finding a substring: index() and find() both return its starting offset
text.index('Boston')
text.find('Boston')

In [ ]:
#text.index('New York') # would raise an error
# find() reports a missing substring with -1 instead of raising
text.find('New York')

In [ ]:
# Count how many times a character or a substring occurs in the text
text.count('e')
text.count('Avenue')

In [ ]:
# Substitution with replace(); replacing with '' removes the substring
text.replace('Avenue', 'Street')
text.replace('1', '')

In [ ]:
import re
# Regular expressions
# \s+ matches a run of one or more whitespace characters
text = "This is     an\t odd \n text!"
# The pattern is a raw string so the backslash reaches the regex engine
# untouched; a plain '\s' literal is an invalid escape sequence that newer
# Python versions warn about.
words = re.split(r'\s+', text)
words

In [ ]:
text = "This is my address: 16 Bolton Avenue, Boston"
# Every run of word characters that starts with a capital 'A'. The pattern is
# a raw string so that \w is passed to the regex engine unchanged.
capital_a_words = re.findall(r'A\w+', text)
capital_a_words

In [ ]:
# Runs of word characters starting with 'A' or 'a'. Inside a character class a
# comma is a literal character, so the class is written [Aa] (not [A,a]) to
# match only the letter in either case. Raw string keeps \w intact.
re.findall(r'[Aa]\w+', text)

In [ ]:
# search() returns only the first match; span() reports its (start, end)
# offsets together so that neither value is computed and then discarded.
ret = re.search(r'[Aa]\w+', text)
ret.span()

In [ ]:
# match() only succeeds at the very beginning of the string, so it returns
# None whenever the text does not start with 'A' or 'a'.
re.match(r'[Aa]\w+', text)

In [ ]:
# A pattern beginning with 'T' is tried at position 0 of the text
match = re.match(r'T\w+', text)
# group() returns the matched substring, i.e. text[match.start():match.end()]
match.group()

In [37]:
# Sample data: one row per item, built from (color, object, price1, price2)
frame = pd.DataFrame(
    [
        ('white', 'pen', 5.56, 4.75),
        ('red', 'pencil', 4.20, 4.12),
        ('green', 'pencil', 1.30, 1.60),
        ('red', 'ashtray', 0.56, 0.75),
        ('green', 'pen', 2.75, 3.15),
    ],
    columns=['color', 'object', 'price1', 'price2'],
)
frame


Out[37]:
color object price1 price2
0 white pen 5.56 4.75
1 red pencil 4.20 4.12
2 green pencil 1.30 1.60
3 red ashtray 0.56 0.75
4 green pen 2.75 3.15

In [38]:
# Step 1: split -- group the price1 values by the color of each row
group = frame['price1'].groupby(by=frame['color'])
group.groups


Out[38]:
{'green': Int64Index([2, 4], dtype='int64'),
 'red': Int64Index([1, 3], dtype='int64'),
 'white': Int64Index([0], dtype='int64')}

In [42]:
# Step 2: apply a function to every group
group.mean()
group.sum()
# One-column frame telling which groups hold more than one value
new = (group.count() > 1).to_frame()
new


Out[42]:
price1
color
green True
red True
white False

In [ ]:
# Hierarchical grouping: group price1 by color and object together
ggroup = frame['price1'].groupby(by=[frame['color'], frame['object']])
ggroup.groups

In [ ]:
# Mean of both price columns within each color
price_columns = ['price1', 'price2']
frame[price_columns].groupby(frame['color']).mean()

In [ ]:
# Mean of every numeric column within each color. numeric_only=True leaves out
# the string column 'object', which recent pandas refuses to average
# (TypeError) instead of silently dropping.
frame.groupby(frame['color']).mean(numeric_only=True)

In [ ]:
# Iterating over groups
# A GroupBy object can be iterated; each step yields a tuple made of the
# group's name and the rows belonging to it.
# The loop uses its own variable names so that it does not rebind the
# notebook-level `group` GroupBy object used by other cells.
for color_name, color_rows in frame.groupby('color'):
    print(color_name)
    print(color_rows)

In [ ]:
# Aggregating the groups of a single column: inspect the type of the result
result1 = frame['price1'].groupby(by=frame['color']).mean()
type(result1)

In [ ]:
# Aggregating the groups of the whole frame: inspect the type of the result.
# numeric_only=True leaves out the string column 'object', which recent
# pandas refuses to average (TypeError).
result2 = frame.groupby(frame['color']).mean(numeric_only=True)
type(result2)

In [ ]:
# Select the price1 column on the grouped frame first, then take the mean
frame.groupby(frame['color'])['price1'].mean()

In [ ]:
# Take the mean of the numeric columns first, then pick price1 from the
# result. numeric_only=True leaves out the string column 'object', which
# recent pandas refuses to average (TypeError).
frame.groupby(frame['color']).mean(numeric_only=True)['price1']

In [ ]:
# Per-color means of the numeric columns, each column renamed with a 'mean_'
# prefix. numeric_only=True leaves out the string column 'object', which
# recent pandas refuses to average (TypeError).
means = frame.groupby('color').mean(numeric_only=True).add_prefix('mean_')
means

In [ ]:
# Functions on groups
group = frame.groupby('color')
# 0.6 quantile of price1 within each color
group['price1'].quantile(q=0.6)

In [ ]:
def range(series):
    """Return the spread of *series*: its maximum minus its minimum.

    NOTE(review): this name shadows the built-in ``range`` once the cell has
    run; it is kept because other cells pass ``range`` to ``agg``.
    """
    highest = series.max()
    lowest = series.min()
    return highest - lowest

# Aggregate price1 within each group using the custom function defined above
group['price1'].agg(range)

In [ ]:
group['price1'].agg(['mean', 'std', range]) # apply several aggregation functions at once

In [ ]:
# Rebuild the sample frame with only the color and the two price columns
frame = pd.DataFrame(
    [
        ('white', 5.56, 4.75),
        ('red', 4.20, 4.12),
        ('green', 1.30, 1.60),
        ('red', 0.56, 0.75),
        ('green', 2.75, 3.15),
    ],
    columns=['color', 'price1', 'price2'],
)
frame
# Per-color column totals, each column renamed with a 'tot_' prefix
sums = frame.groupby('color').sum().add_prefix('tot_')
sums

In [ ]:
# Attach the per-color totals to every row: match the 'color' column of frame
# against the index of sums
frame.merge(sums, left_on='color', right_index=True)

In [ ]:
# Broadcast each group's column totals back onto the rows of that group and
# prefix the column names with 'tot_'. The string alias 'sum' is used because
# passing np.sum triggers a FutureWarning in recent pandas.
frame.groupby('color').transform('sum').add_prefix('tot_')

In [ ]:
# New sample frame, built from (color, status, value1, value2) rows
frame = pd.DataFrame(
    [
        ('white', 'up', 12.33, 11.33),
        ('black', 'up', 14.55, 31.80),
        ('white', 'down', 22.34, 29.99),
        ('white', 'down', 34.27, 31.18),
        ('black', 'down', 23.40, 18.25),
        ('black', 'up', 18.33, 22.44),
    ],
    columns=['color', 'status', 'value1', 'value2'],
)
frame

In [ ]:
# Column-wise maximum within every (color, status) group
frame.groupby(['color', 'status']).apply(lambda rows: rows.max())

In [ ]:
# Relabel the index and the columns of frame using `reindex` and `recolumn`.
# NOTE(review): neither `reindex` nor `recolumn` is defined in any cell
# visible here -- on a fresh kernel this line raises NameError unless they
# are defined earlier. TODO confirm where they come from.
frame.rename(index=reindex, columns=recolumn)

In [ ]:
# Ten consecutive hourly timestamps starting on 1 January 2015
temp = pd.date_range(start='1/1/2015', periods=10, freq='h')
temp

In [ ]:
# Ten random values indexed by the timestamps in temp
timeseries = pd.Series(data=np.random.rand(10), index=temp)
timeseries

In [ ]:
# Frame with the timestamps as a regular column next to two random columns
timetable = pd.DataFrame(dict(
    date=temp,
    value1=np.random.rand(10),
    value2=np.random.rand(10),
))
timetable

In [ ]:
# Add a reference column holding one label per row
cat_labels = [
    'up', 'down', 'left', 'left', 'up',
    'up', 'down', 'right', 'right', 'up',
]
timetable['cat'] = cat_labels
timetable

In [36]:



---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-36-f6171da2c842> in <module>()
----> 1 import md5
      2 md5

ModuleNotFoundError: No module named 'md5'

In [ ]: