In [2]:
import pandas as pd
# Read the catering sales workbook, using the '日期' (date) column as the index.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
data = pd.read_excel('/home/jeff/python_data/chapter3/chapter3/demo/data/catering_sale.xls', index_col=u'日期')

# Compare two ways of handling missing values: forward-fill vs. dropping incomplete rows.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# Neither call modifies `data`; both return new frames.
summary_ffill = data.ffill().describe()
summary_dropna = data.dropna().describe()

# Only the last expression of a cell is rendered, so put both summaries in one
# table instead of silently discarding the forward-fill result.
pd.concat([summary_ffill, summary_dropna], axis=1, keys=['ffill', 'dropna'])
Out[2]:
In [3]:
# Inspect outliers with a box plot and label every flier (outlier point) with its value.
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # font used so the Chinese column labels can render
plt.rcParams['axes.unicode_minus'] = False    # avoid broken minus signs with that font

fig, ax = plt.subplots()
# return_type='dict' is required: by default DataFrame.boxplot returns the Axes,
# which cannot be indexed with 'fliers'.
box = data.boxplot(ax=ax, return_type='dict')

# Fliers of the first plotted column.
flier_line = box['fliers'][0]
# Sort the (x, y) points together by y. sorted() builds a new list, so the
# arrays owned by the plotted line are not mutated and x/y stay paired.
points = sorted(zip(flier_line.get_xdata(), flier_line.get_ydata()), key=lambda pt: pt[1])

for i, (px, py) in enumerate(points):
    # Distance to the previous (smaller) outlier; 0 for the first point.
    gap = py - points[i - 1][1] if i > 0 else 0
    if gap > 0:
        # Closer neighbours get pushed further left so their labels overlap less.
        shift = 0.05 - 0.8 / gap
    else:
        # First point, or a duplicate value (which would otherwise divide by zero).
        shift = 0.08
    ax.annotate(py, xy=(px, py), xytext=(px + shift, py))

plt.show()
In [5]:
import numpy as np
# Spread (max - min) of the sales column.
print(data['销量'].max() - data['销量'].min())

# Drop incomplete rows and bucket sales into 4 equal-width bins.
# The column is added with assign() on the fresh frame returned by dropna(),
# rather than by item assignment on it, which avoids chained-assignment warnings.
# `data` is still rebound so later cells see the cleaned frame with `sale_group`.
data = data.dropna().assign(sale_group=lambda frame: pd.cut(frame['销量'], 4))

# Number of rows per bin. observed=False keeps empty bins in the result and
# pins the behaviour explicitly, since the default for categorical keys is
# changing between pandas versions.
data.groupby('sale_group', observed=False)['销量'].count()
Out[5]: