In [ ]:
import pandas as pd
import numpy as np
import math as math
In [ ]:
from plotly.offline import plot
from plotly.graph_objs import *
In [ ]:
import statsmodels.api as sm
#from statsmodels.graphics.api import qqplot
#from scipy import stats
#from sklearn import linear_model
#from sklearn.cross_validation import train_test_split
#from sklearn.linear_model import LinearRegression
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
# Number of item columns iterated when forecasting (one prediction per item);
# presumably the distinct item count in the dataset — TODO confirm.
NSize=963
In [ ]:
def read_data():
    """Load the item-level feature CSV and return the columns used downstream.

    Returns a DataFrame indexed by parsed 'date' with columns:
    item_id, cate_id, collect_uv, qty_alipay_njhs, num_alipay_njhs.
    """
    col_names = ['date', 'item_id', 'cate_id', 'cate_level_id', 'brand_id',
                 'supplier_id', 'pv_ipv', 'pv_uv', 'cart_ipv', 'cart_uv',
                 'collect_uv', 'num_gmv', 'amt_gmv', 'qty_gmv', 'unum_gmv',
                 'amt_alipay', 'num_alipay', 'qty_alipay', 'unum_alipay',
                 'ztc_pv_ipv', 'tbk_pv_ipv', 'ss_pv_ipv', 'jhs_pv_ipv',
                 'ztc_pv_uv', 'tbk_pv_uv', 'ss_pv_uv', 'jhs_pv_uv',
                 'num_alipay_njhs', 'amt_alipay_njhs', 'qty_alipay_njhs',
                 'unum_alipay_njhs']
    sales = pd.read_csv("G:\\TianChi\\logistic\\CAINIAO\\data\\item_feature2.csv",
                        names=col_names, header=None, index_col=0,
                        parse_dates=['date'])
    keep = ['item_id', 'cate_id', 'collect_uv', 'qty_alipay_njhs',
            'num_alipay_njhs']
    return sales[keep]
def read_data_district():
    """Load the per-store item feature CSV and return the columns used downstream.

    Returns a DataFrame indexed by parsed 'date' with columns:
    item_id, store_code, cate_id, collect_uv, qty_alipay_njhs, num_alipay_njhs.
    """
    col_names = ['date', 'item_id', 'store_code', 'cate_id', 'cate_level_id',
                 'brand_id', 'supplier_id', 'pv_ipv', 'pv_uv', 'cart_ipv',
                 'cart_uv', 'collect_uv', 'num_gmv', 'amt_gmv', 'qty_gmv',
                 'unum_gmv', 'amt_alipay', 'num_alipay', 'qty_alipay',
                 'unum_alipay', 'ztc_pv_ipv', 'tbk_pv_ipv', 'ss_pv_ipv',
                 'jhs_pv_ipv', 'ztc_pv_uv', 'tbk_pv_uv', 'ss_pv_uv',
                 'jhs_pv_uv', 'num_alipay_njhs', 'amt_alipay_njhs',
                 'qty_alipay_njhs', 'unum_alipay_njhs']
    sales = pd.read_csv("G:\\TianChi\\logistic\\CAINIAO\\data\\item_store_feature2.csv",
                        names=col_names, header=None, index_col=0,
                        parse_dates=['date'])
    keep = ['item_id', 'store_code', 'cate_id', 'collect_uv',
            'qty_alipay_njhs', 'num_alipay_njhs']
    return sales[keep]
def data_ETL_district(sales_info):
    """Aggregate per-(store, item) sales into 14-day buckets.

    Parameters
    ----------
    sales_info : DataFrame indexed by date, containing at least
        'store_code', 'item_id' and 'qty_alipay_njhs' columns
        (as produced by read_data_district()).

    Returns
    -------
    DataFrame with a (14-day period, store_code) MultiIndex and one
    column per item_id; missing combinations are filled with 0.
    """
    # pd.TimeGrouper was deprecated and removed from pandas;
    # pd.Grouper(freq=...) is the drop-in replacement for grouping on
    # a DatetimeIndex.
    grouper = pd.Grouper(freq="14D")
    # Select the target column before summing so only it is aggregated
    # (summing everything first also breaks on non-numeric columns in
    # modern pandas).
    category_sum = (sales_info[sales_info.index > '2014-10-19']
                    .groupby([grouper, 'store_code', 'item_id'])[['qty_alipay_njhs']]
                    .sum())
    # Pivot item_id out of the index into columns.
    return category_sum.unstack().fillna(value=0)
def select_store_dat(index_code, info_dat):
    """Return the rows of one warehouse, dropping its store_code index level.

    index_code : int store code matched against the 'store_code' index level.
    info_dat   : DataFrame with a (date, store_code) MultiIndex, e.g. the
                 output of data_ETL_district().
    """
    selected = info_dat.query("store_code==%d" % index_code)
    # Level 1 is store_code; it is now constant, so drop it entirely.
    return selected.reset_index(level=1, drop=True)
def data_ETL(sales_info):
    """Aggregate per-item sales into 14-day buckets.

    Parameters
    ----------
    sales_info : DataFrame indexed by date, containing at least
        'item_id' and 'qty_alipay_njhs' columns (as produced by read_data()).

    Returns
    -------
    DataFrame with one row per 14-day period and one column per item_id;
    missing combinations are filled with 0.
    """
    # pd.TimeGrouper was deprecated and removed from pandas;
    # pd.Grouper(freq=...) is the drop-in replacement for grouping on
    # a DatetimeIndex.
    grouper = pd.Grouper(freq="14D")
    # Select the target column before summing so only it is aggregated.
    category_sum = (sales_info[sales_info.index > '2014-10-19']
                    .groupby([grouper, 'item_id'])[['qty_alipay_njhs']]
                    .sum())
    # Pivot item_id out of the index into columns.
    return category_sum.unstack().fillna(value=0)
#mplt.legend([""], loc=9,ncol=4)
In [ ]:
def predict_numsum(num_item, src_data):
    """Forecast the next 14-day sales total for one item column.

    Fits an ARMA(0,1) model on the log-differenced series, predicts the
    '2015-12-28' period, then undoes the differencing and the log.

    Parameters
    ----------
    num_item : int, positional column index into src_data.
    src_data : DataFrame from data_ETL(); columns are
        ('qty_alipay_njhs', item_id) tuples, index is 14-day periods.

    Returns
    -------
    (item_id, predicted_quantity) tuple.
    """
    # The series fed to 'log' must not contain zero, hence the +1 shift.
    df = np.log(src_data.iloc[:, num_item] + 1)
    # First difference removes the trend so the series is more stationary.
    df_diff1 = df - df.shift(1)
    df_diff2 = df_diff1.dropna()
    # A mostly-flat series (many zero diffs) makes ARMA fitting unstable:
    # fall back to the last observed value (row 30).
    diff2_count = df_diff2[df_diff2.values == 0].count()
    if diff2_count > 21:
        return src_data.columns[num_item][1], src_data.iloc[30, num_item]
    p = 0
    q = 1
    sparams = [0.1] * (p + q)
    try:
        arma_diff = sm.tsa.ARMA(df_diff2, (p, q)).fit(start_params=sparams,
                                                      transparams=True,
                                                      method='css',
                                                      trend='nc', disp=-1)
    except Exception as err:
        # BUG FIX: previously the code only printed here and then fell
        # through to use the undefined 'arma_diff' (NameError). Fall back
        # to the last observed value instead, mirroring the flat-series case.
        print(err)
        print("fit error happened num_item:%d" % num_item)
        return src_data.columns[num_item][1], src_data.iloc[30, num_item]
    predict_sums = arma_diff.predict('2015-12-28', '2015-12-28', dynamic=True)
    # Undo the differencing (add last known log level) then the log (+1 shift).
    res_pow = predict_sums['2015-12-28'] + df['2015-12-14']
    num_sum = math.exp(res_pow) - 1
    return src_data.columns[num_item][1], num_sum
In [ ]:
def predict_store_numsum(store_code, num_item, src_data):
    """Forecast the next 14-day sales total for one item in one warehouse.

    Same ARMA(0,1)-on-log-diff procedure as predict_numsum(); store_code
    is only used for error reporting.

    Parameters
    ----------
    store_code : int warehouse code (for log messages).
    num_item : int, positional column index into src_data.
    src_data : per-store frame from select_store_dat().

    Returns
    -------
    (item_id, predicted_quantity) tuple.
    """
    # The series fed to 'log' must not contain zero, hence the +1 shift.
    df = np.log(src_data.iloc[:, num_item] + 1)
    # First difference removes the trend so the series is more stationary.
    df_diff1 = df - df.shift(1)
    df_diff2 = df_diff1.dropna()
    # A mostly-flat series (many zero diffs) makes ARMA fitting unstable:
    # fall back to the last observed value (row 30).
    diff2_count = df_diff2[df_diff2.values == 0].count()
    if diff2_count > 21:
        return src_data.columns[num_item][1], src_data.iloc[30, num_item]
    p = 0
    q = 1
    sparams = [0.1] * (p + q)
    try:
        arma_diff = sm.tsa.ARMA(df_diff2, (p, q)).fit(start_params=sparams,
                                                      transparams=True,
                                                      method='css',
                                                      trend='nc', disp=-1)
    except Exception as err:
        # BUG FIX: previously the code only printed here and then fell
        # through to use the undefined 'arma_diff' (NameError). Fall back
        # to the last observed value instead, mirroring the flat-series case.
        print(err)
        print("fit error happened code:%d,num_item:,%d" % (store_code, num_item))
        return src_data.columns[num_item][1], src_data.iloc[30, num_item]
    predict_sums = arma_diff.predict('2015-12-28', '2015-12-28', dynamic=True)
    # Undo the differencing (add last known log level) then the log (+1 shift).
    res_pow = predict_sums['2015-12-28'] + df['2015-12-14']
    num_sum = math.exp(res_pow) - 1
    return src_data.columns[num_item][1], num_sum
In [ ]:
def predict_store_id(st_code, src_data):
    """Forecast every item for one warehouse.

    Returns a DataFrame with int32 'item_id' and 'target' columns
    (float forecasts are truncated by the int32 buffer, as before).
    """
    buf = np.arange(NSize * 2, dtype=np.int32).reshape(NSize, 2)
    for row in range(NSize):
        buf[row, 0], buf[row, 1] = predict_store_numsum(st_code, row, src_data)
    return pd.DataFrame(buf, columns=['item_id', 'target'])
def predict_all_id(src_data):
    """Forecast every item over all warehouses combined.

    Returns a DataFrame with int32 'item_id' and 'target' columns
    (float forecasts are truncated by the int32 buffer, as before).
    """
    buf = np.arange(NSize * 2, dtype=np.int32).reshape(NSize, 2)
    for row in range(NSize):
        buf[row, 0], buf[row, 1] = predict_numsum(row, src_data)
    return pd.DataFrame(buf, columns=['item_id', 'target'])
def predict_one(icount, src_data):
    """Forecast a single item (column index icount) and return it as a
    one-row DataFrame with int32 'item_id' and 'target' columns."""
    buf = np.arange(2, dtype=np.int32).reshape(1, 2)
    buf[0, 0], buf[0, 1] = predict_numsum(icount, src_data)
    return pd.DataFrame(buf, columns=['item_id', 'target'])
In [ ]:
def data_ETL4avg(sales_info):
    """Aggregate per-item sales into 14-day buckets, using only data after
    2014-12-13 (a shorter window than data_ETL, presumably for averaging).

    Returns a DataFrame with one row per 14-day period and one column per
    item_id; missing combinations are filled with 0.
    """
    # pd.TimeGrouper was deprecated and removed from pandas;
    # pd.Grouper(freq=...) is the drop-in replacement for grouping on
    # a DatetimeIndex.
    grouper = pd.Grouper(freq="14D")
    # Select the target column before summing so only it is aggregated.
    category_sum = (sales_info[sales_info.index > '2014-12-13']
                    .groupby([grouper, 'item_id'])[['qty_alipay_njhs']]
                    .sum())
    return category_sum.unstack().fillna(value=0)
In [ ]:
# Load the item-level feature CSV and aggregate it into 14-day sales buckets.
csv_data=read_data()
res_cate=data_ETL(csv_data)
In [ ]:
# Sales trend over time for every item id (legend suppressed: too many series).
total_trend=res_cate.T.reset_index(drop=True).T
total_trend.plot(title="",figsize=(15, 9),legend=None)
In [ ]:
# Disabled experiment (kept as a string literal): plot one item's history
# with its single-step forecast appended at the end.
'''
temp_plot=res_cate.iloc[:,122:123].stack().reset_index(drop=True)
print temp_plot
temp_res=predict_one(122,res_cate)
print temp_res
temp_plot=temp_plot.append([{'qty_alipay_njhs':temp_res['target'][0]}],ignore_index=True)
temp_plot.plot(title="",figsize=(9, 7),legend=None)
'''
In [ ]:
# Forecast every item, then clamp negative predictions to zero
# (quantities cannot be negative).
res_target=predict_all_id(res_cate)
res_target[res_target<0]=0
In [ ]:
# Compare the predicted quantity distribution with the actual quantities
# of the previous period (row 29 of the aggregated frame).
res_tg=res_target
temp_plot=res_cate.iloc[29:30,:].stack().reset_index(drop=True)
res_tg=pd.concat([res_tg, temp_plot], axis=1)
res_tg[['target','qty_alipay_njhs']].plot(title="",figsize=(20, 7))
In [ ]:
# Interactive plotly bar chart: predicted vs previous-period actual quantities.
plot([Bar(x=res_tg.index, y=res_tg.target),Bar(x=res_tg.index, y=res_tg.qty_alipay_njhs)])
In [ ]:
#Check whether the predicted sales follow an exponential decay / long-tail distribution.
#res_tg[['target','qty_alipay_njhs']].iloc[:,0:1].sort_values(by='target',ascending=False).reset_index(drop=True).plot(title="",figsize=(12, 7))
In [ ]:
# Tag the nationwide forecast with code "all" before per-store results are appended.
res_target['code']="all"
In [ ]:
# Load the per-store data and aggregate it into 14-day buckets per (store, item).
store_data=read_data_district()
store_dat=data_ETL_district(store_data)
In [ ]:
# Forecast each of the 5 warehouses (codes 1-5), clamp negative predictions
# to zero, tag each frame with its store code, and collect them after the
# nationwide ("all") forecast already held in res_target.
store_frames = []
for i in range(1, 6):
    temp_dat = select_store_dat(i, store_dat)
    store_target = predict_store_id(i, temp_dat)
    store_target[store_target < 0] = 0
    store_target['code'] = "%d" % i
    store_frames.append(store_target)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
# and produces the same stacked result.
res_target = pd.concat([res_target] + store_frames)
# Write the submission file: item_id, code, target — no header, no index.
res_target[['item_id', 'code', 'target']].to_csv("g:\\2tt9_arma01reform.csv", header=False, index=False)