In [ ]:
import pandas as pd
import numpy as np
import math as math

In [ ]:
from plotly.offline import plot
from plotly.graph_objs import *

In [ ]:
import statsmodels.api as sm

#from statsmodels.graphics.api import qqplot

#from scipy import stats


#from sklearn import linear_model
#from sklearn.cross_validation import train_test_split
#from sklearn.linear_model import LinearRegression

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

In [ ]:
# Number of item columns iterated over in the prediction loops.
# NOTE(review): presumably the count of distinct item_ids in the pivoted
# sales frame — confirm against the data.
NSize=963

In [ ]:
def read_data(path="G:\\TianChi\\logistic\\CAINIAO\\data\\item_feature2.csv"):
    """Read the item-level feature CSV and keep the analysis columns.

    Parameters
    ----------
    path : str or file-like, optional
        CSV source (default: the original absolute path). Parameterized so the
        function also works with relative paths or in-memory buffers.

    Returns
    -------
    DataFrame indexed by the parsed 'date' column, with columns
    ['item_id', 'cate_id', 'collect_uv', 'qty_alipay_njhs', 'num_alipay_njhs'].
    """
    # The raw file has no header row; supply the 31 column names explicitly.
    column_names = ['date', 'item_id', 'cate_id', 'cate_level_id', 'brand_id',
                    'supplier_id', 'pv_ipv', 'pv_uv', 'cart_ipv', 'cart_uv',
                    'collect_uv', 'num_gmv', 'amt_gmv', 'qty_gmv', 'unum_gmv',
                    'amt_alipay', 'num_alipay', 'qty_alipay', 'unum_alipay',
                    'ztc_pv_ipv', 'tbk_pv_ipv', 'ss_pv_ipv', 'jhs_pv_ipv',
                    'ztc_pv_uv', 'tbk_pv_uv', 'ss_pv_uv', 'jhs_pv_uv',
                    'num_alipay_njhs', 'amt_alipay_njhs', 'qty_alipay_njhs',
                    'unum_alipay_njhs']

    sales = pd.read_csv(path, names=column_names, header=None,
                        index_col=0, parse_dates=['date'])

    # Keep only the columns the downstream analysis uses.
    sales_info = sales[['item_id', 'cate_id', 'collect_uv',
                        'qty_alipay_njhs', 'num_alipay_njhs']]

    return sales_info



def read_data_district(path="G:\\TianChi\\logistic\\CAINIAO\\data\\item_store_feature2.csv"):
    """Read the store(district)-level feature CSV and keep the analysis columns.

    Parameters
    ----------
    path : str or file-like, optional
        CSV source (default: the original absolute path). Parameterized so the
        function also works with relative paths or in-memory buffers.

    Returns
    -------
    DataFrame indexed by the parsed 'date' column, with columns
    ['item_id', 'store_code', 'cate_id', 'collect_uv', 'qty_alipay_njhs',
     'num_alipay_njhs'].
    """
    # Same layout as the item-level file, plus a 'store_code' column (32 total).
    column_names = ['date', 'item_id', 'store_code', 'cate_id', 'cate_level_id',
                    'brand_id', 'supplier_id', 'pv_ipv', 'pv_uv', 'cart_ipv',
                    'cart_uv', 'collect_uv', 'num_gmv', 'amt_gmv', 'qty_gmv',
                    'unum_gmv', 'amt_alipay', 'num_alipay', 'qty_alipay',
                    'unum_alipay', 'ztc_pv_ipv', 'tbk_pv_ipv', 'ss_pv_ipv',
                    'jhs_pv_ipv', 'ztc_pv_uv', 'tbk_pv_uv', 'ss_pv_uv',
                    'jhs_pv_uv', 'num_alipay_njhs', 'amt_alipay_njhs',
                    'qty_alipay_njhs', 'unum_alipay_njhs']

    sales = pd.read_csv(path, names=column_names, header=None,
                        index_col=0, parse_dates=['date'])

    # Keep only the columns the downstream analysis uses.
    sales_info = sales[['item_id', 'store_code', 'cate_id', 'collect_uv',
                        'qty_alipay_njhs', 'num_alipay_njhs']]

    return sales_info


def data_ETL_district(sales_info):
    """Aggregate store-level sales into 14-day totals per (store, item).

    sales_info : DataFrame with a DatetimeIndex ('date') and at least the
    columns 'store_code', 'item_id' and 'qty_alipay_njhs'.

    Returns a frame with one row per (14-day period, store_code) and one
    ('qty_alipay_njhs', item_id) column per item; missing combinations are 0.
    """
    # pd.TimeGrouper was removed from pandas; pd.Grouper(freq=...) is the
    # long-standing equivalent and groups on the DatetimeIndex by default.
    grouper = pd.Grouper(freq="14D")
    # Restrict to dates after 2014-10-19 and select the target column before
    # summing, so only that column is aggregated.
    category_sum = (sales_info[sales_info.index > '2014-10-19']
                    .groupby([grouper, 'store_code', 'item_id'])[['qty_alipay_njhs']]
                    .sum())

    # item_id moves to the column level; (period, store, item) gaps become 0.
    res_data = category_sum.unstack().fillna(value=0)
    return res_data


def select_store_dat(index_code, info_dat):
    """Return the rows of `info_dat` whose 'store_code' index level equals
    `index_code`, with that (now constant) level dropped from the index."""
    # Filter on the store_code index level, then remove the second index level.
    store_rows = info_dat.query("store_code==%d" % index_code)
    return store_rows.reset_index(level=1, drop=True)


def data_ETL(sales_info):
    """Aggregate item-level sales into 14-day totals per item.

    sales_info : DataFrame with a DatetimeIndex ('date') and at least the
    columns 'item_id' and 'qty_alipay_njhs'.

    Returns a frame with one row per 14-day period and one
    ('qty_alipay_njhs', item_id) column per item; missing periods are 0.
    """
    # pd.TimeGrouper was removed from pandas; pd.Grouper(freq=...) is the
    # long-standing equivalent and groups on the DatetimeIndex by default.
    grouper = pd.Grouper(freq="14D")
    # Restrict to dates after 2014-10-19 and select the target column before
    # summing, so only that column is aggregated.
    category_sum = (sales_info[sales_info.index > '2014-10-19']
                    .groupby([grouper, 'item_id'])[['qty_alipay_njhs']]
                    .sum())

    # item_id moves to the column level; (period, item) pairs with no sales -> 0.
    res_data = category_sum.unstack().fillna(value=0)
    return res_data

In [ ]:
def predict_numsum(num_item, src_data):
    """Forecast the next 14-day sales total for one item column via ARMA.

    num_item : positional index of the item column in `src_data`.
    src_data : pivoted frame from data_ETL (columns are
               ('qty_alipay_njhs', item_id) tuples, rows are 14-day periods).

    Returns (item_id, forecast). Falls back to the value at row 30
    (presumably the most recent observed period — confirm against the data)
    when the series is mostly flat or the ARMA fit fails.
    """
    # 'series' in the 'log' arithmetic cannot be ZERO !!!
    # Transform the original series to be stationary: log(x+1), then difference.
    df = np.log(src_data.iloc[:, num_item] + 1)

    df_diff1 = df - df.shift(1)
    df_diff2 = df_diff1.dropna()

    diff2_count = df_diff2[df_diff2.values == 0].count()

    # Mostly-flat series: an ARMA fit is unstable/pointless, use the fallback.
    if diff2_count > 21:
        return src_data.columns[num_item][1], src_data.iloc[30, num_item]

    p = 0
    q = 1
    sparams = [0.1] * (p + q)

    try:
        arma_diff = sm.tsa.ARMA(df_diff2, (p, q)).fit(start_params=sparams,
                                                      transparams=True, method='css',
                                                      trend='nc', disp=-1)
    except Exception as err:
        # BUG FIX: the original fell through here with `arma_diff` undefined,
        # so the predict() call below raised NameError. Return the fallback
        # value instead, mirroring the flat-series branch above.
        print(err)
        print("fit error happened num_item:%d" % num_item)
        return src_data.columns[num_item][1], src_data.iloc[30, num_item]

    # One-step-ahead forecast of the differenced log series, then invert the
    # transform: add back the last log level and undo log(x+1).
    predict_sums = arma_diff.predict('2015-12-28', '2015-12-28', dynamic=True)
    res_pow = predict_sums['2015-12-28'] + df['2015-12-14']
    num_sum = math.exp(res_pow) - 1

    return src_data.columns[num_item][1], num_sum

In [ ]:
def predict_store_numsum(store_code, num_item, src_data):
    """Forecast the next 14-day sales total for one item in one store via ARMA.

    store_code : store identifier (used only in the error message here).
    num_item   : positional index of the item column in `src_data`.
    src_data   : pivoted frame from data_ETL_district, pre-filtered to one store.

    Returns (item_id, forecast). Falls back to the value at row 30
    (presumably the most recent observed period — confirm against the data)
    when the series is mostly flat or the ARMA fit fails.
    """
    # 'series' in the 'log' arithmetic cannot be ZERO !!!
    # Transform the original series to be stationary: log(x+1), then difference.
    df = np.log(src_data.iloc[:, num_item] + 1)

    df_diff1 = df - df.shift(1)
    df_diff2 = df_diff1.dropna()

    diff2_count = df_diff2[df_diff2.values == 0].count()

    # Mostly-flat series: an ARMA fit is unstable/pointless, use the fallback.
    if diff2_count > 21:
        return src_data.columns[num_item][1], src_data.iloc[30, num_item]

    p = 0
    q = 1
    sparams = [0.1] * (p + q)

    try:
        arma_diff = sm.tsa.ARMA(df_diff2, (p, q)).fit(start_params=sparams,
                                                      transparams=True, method='css',
                                                      trend='nc', disp=-1)
    except Exception as err:
        # BUG FIX: the original fell through here with `arma_diff` undefined,
        # so the predict() call below raised NameError. Return the fallback
        # value instead, mirroring the flat-series branch above.
        print(err)
        print("fit error happened code:%d,num_item:,%d" % (store_code, num_item))
        return src_data.columns[num_item][1], src_data.iloc[30, num_item]

    # One-step-ahead forecast of the differenced log series, then invert the
    # transform: add back the last log level and undo log(x+1).
    predict_sums = arma_diff.predict('2015-12-28', '2015-12-28', dynamic=True)

    res_pow = predict_sums['2015-12-28'] + df['2015-12-14']
    num_sum = math.exp(res_pow) - 1

    return src_data.columns[num_item][1], num_sum

In [ ]:
def predict_store_id(st_code, src_data):
    """Forecast every one of the NSize item columns for one store.

    Returns a DataFrame with columns ['item_id', 'target']; values are stored
    in an int32 array, so float forecasts are truncated to integers.
    """
    store_res = np.empty((NSize, 2), dtype=np.int32)
    for item_idx in range(NSize):
        item_id, target = predict_store_numsum(st_code, item_idx, src_data)
        store_res[item_idx, 0] = item_id
        store_res[item_idx, 1] = target

    return pd.DataFrame(store_res, columns=['item_id', 'target'])

def predict_all_id(src_data):
    """Forecast every one of the NSize item columns (all stores combined).

    Returns a DataFrame with columns ['item_id', 'target']; values are stored
    in an int32 array, so float forecasts are truncated to integers.
    """
    all_res = np.empty((NSize, 2), dtype=np.int32)
    for item_idx in range(NSize):
        item_id, target = predict_numsum(item_idx, src_data)
        all_res[item_idx, 0] = item_id
        all_res[item_idx, 1] = target

    return pd.DataFrame(all_res, columns=['item_id', 'target'])

def predict_one(icount, src_data):
    """Forecast a single item column and wrap the result as a one-row frame
    with columns ['item_id', 'target'] (int32, so floats are truncated)."""
    item_id, target = predict_numsum(icount, src_data)

    one_res = np.empty((1, 2), dtype=np.int32)
    one_res[0, 0] = item_id
    one_res[0, 1] = target

    return pd.DataFrame(one_res, columns=['item_id', 'target'])

In [ ]:
def data_ETL4avg(sales_info):
    """Aggregate item-level sales into 14-day totals per item, using a
    shorter window (dates after 2014-12-13) than data_ETL.

    Returns a frame with one row per 14-day period and one
    ('qty_alipay_njhs', item_id) column per item; missing periods are 0.
    """
    # pd.TimeGrouper was removed from pandas; pd.Grouper(freq=...) is the
    # long-standing equivalent and groups on the DatetimeIndex by default.
    grouper = pd.Grouper(freq="14D")
    # Select the target column before summing, so only it is aggregated.
    category_sum = (sales_info[sales_info.index > '2014-12-13']
                    .groupby([grouper, 'item_id'])[['qty_alipay_njhs']]
                    .sum())

    res_data = category_sum.unstack().fillna(value=0)

    return res_data

In [ ]:
# Load the item-level feature CSV and pivot it into 14-day sales sums per item.
csv_data=read_data()
res_cate=data_ETL(csv_data)

In [ ]:
# Sales trend over time for every item id (transpose round-trip drops the
# MultiIndex column labels so the plot uses positional columns).
total_trend=res_cate.T.reset_index(drop=True).T
total_trend.plot(title="",figsize=(15, 9),legend=None)

In [ ]:
# Dead exploratory cell kept quoted-out: forecasts item 122 and appends the
# prediction to its observed series for a quick visual sanity check.
'''
temp_plot=res_cate.iloc[:,122:123].stack().reset_index(drop=True)
print temp_plot

temp_res=predict_one(122,res_cate)
print temp_res

temp_plot=temp_plot.append([{'qty_alipay_njhs':temp_res['target'][0]}],ignore_index=True)

temp_plot.plot(title="",figsize=(9, 7),legend=None)
'''

In [ ]:
# Forecast every item, then clamp negative forecasts to zero.
res_target=predict_all_id(res_cate)

res_target[res_target<0]=0

In [ ]:
# Compare the distribution of predicted quantities against the previous
# period's actual quantities.
res_tg=res_target

# Row 29:30 of the pivoted frame = the previous period's actual sales.
temp_plot=res_cate.iloc[29:30,:].stack().reset_index(drop=True)
res_tg=pd.concat([res_tg, temp_plot], axis=1)

res_tg[['target','qty_alipay_njhs']].plot(title="",figsize=(20, 7))

In [ ]:
# Interactive plotly bar chart: predicted vs. previous-period actual per item.
plot([Bar(x=res_tg.index, y=res_tg.target),Bar(x=res_tg.index, y=res_tg.qty_alipay_njhs)])

In [ ]:
#Check whether the predicted sales follow an exponential decay / long-tail distribution
#res_tg[['target','qty_alipay_njhs']].iloc[:,0:1].sort_values(by='target',ascending=False).reset_index(drop=True).plot(title="",figsize=(12, 7))

In [ ]:
# Tag the nationwide forecasts with code "all" (store-level rows get "1".."5"
# in the loop below).
res_target['code']="all"

In [ ]:
# Load store(district)-level features and pivot to 14-day sums per (store, item).
store_data=read_data_district()
store_dat=data_ETL_district(store_data)

In [ ]:
# Forecast each of the five store codes and combine with the nationwide
# ("all") results, then export item_id/code/target as CSV.
frames = [res_target]
for i in range(1, 6):
    temp_dat = select_store_dat(i, store_dat)

    store_target = predict_store_id(i, temp_dat)

    # Clamp negative forecasts to zero.
    store_target[store_target < 0] = 0

    store_target['code'] = "%d" % i

    frames.append(store_target)

# DataFrame.append was removed in pandas 2.0; a single pd.concat is the
# equivalent (and avoids re-copying the frame on every iteration).
res_target = pd.concat(frames)

res_target[['item_id','code','target']].to_csv("g:\\2tt9_arma01reform.csv",header=False,index=False)