In [ ]:
import pandas as pd
import numpy as np
import math as math
In [ ]:
#from plotly.offline import plot
#from plotly.graph_objs import *
In [ ]:
#import statsmodels.api as sm
#from statsmodels.graphics.api import qqplot
#from scipy import stats
#from sklearn import linear_model
#from sklearn.cross_validation import train_test_split
#from sklearn.linear_model import LinearRegression
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
# Number of artist series to forecast in the prediction loop below.
NSize=50
In [ ]:
def read_user_data(path="G:\\TianChi\\music\\p2_mars_tianchi_user_actions.csv"):
    """Load the Tianchi user-action log.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the original hard-coded
        path so existing zero-argument callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_id`` (first CSV column), with columns
        ``song_id``, ``gmt_create``, ``action_type`` and ``Ds``.
    """
    # The raw file has no header row, hence header=None + explicit names.
    user_action = pd.read_csv(
        path,
        names=['user_id', 'song_id', 'gmt_create', 'action_type', 'Ds'],
        header=None,
        index_col=0,
    )
    return user_action
def read_singer_data(path="G:\\TianChi\\music\\p2_mars_tianchi_songs.csv"):
    """Load the Tianchi song/artist metadata table.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the original hard-coded
        path so existing zero-argument callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``song_id`` (first CSV column), with columns
        ``artist_id``, ``publish_time``, ``song_init_plays``,
        ``Language`` and ``Gender``.
    """
    # The raw file has no header row, hence header=None + explicit names.
    singer_data = pd.read_csv(
        path,
        names=['song_id', 'artist_id', 'publish_time',
               'song_init_plays', 'Language', 'Gender'],
        header=None,
        index_col=0,
    )
    return singer_data
In [ ]:
def data_ETL_user(data_info):
    """Pivot the user-action log into per-song daily play counts.

    Keeps only rows whose ``action_type`` equals 1 (play events), then
    builds a table indexed by ``song_id`` with one column per ``Ds`` day
    holding the number of plays (0 where no plays occurred).
    """
    plays = data_info[data_info['action_type'] == 1]
    # Count events per (day, song); gmt_create serves as an always-present
    # column for .count() to tally.
    daily_counts = plays.groupby(['Ds', 'song_id']).count()[['gmt_create']]
    # Widen to days x songs, zero-fill missing days, then flip so each
    # song becomes a row and collapse the helper column level.
    wide = daily_counts.unstack().fillna(value=0)
    return wide.T.reset_index().groupby(['song_id']).sum()
def data_ETL_singer(data_info):
    """Group song metadata by artist.

    Returns the raw ``DataFrameGroupBy`` object keyed on ``artist_id``
    so callers can choose their own aggregation (sum, count, ...).
    """
    grouped_by_artist = data_info.groupby(['artist_id'])
    return grouped_by_artist
In [ ]:
def predict_next(item_no, src_data):
    """Forecast 61 daily play counts for one artist column of ``src_data``.

    The history is log(1+x) transformed; the mean/std of its first
    differences parameterise 61 normal draws, which are centred on the
    series mean and mapped back with exp()-1. The history (183 days from
    2015-03-01) and the forecast (61 days from 2015-09-01) are returned
    together in long format.

    Parameters
    ----------
    item_no : int
        Positional column index into ``src_data``.
    src_data : pandas.DataFrame
        One column per artist; 183 daily rows starting 2015-03-01.

    Returns
    -------
    pandas.DataFrame
        Columns ``['artist_id', 'plays', 'Ds1']``, one row per
        (artist, day) over the 244 combined days.
    """
    df_src = src_data.iloc[:, item_no]
    # Bug fix: the column label was taken from the module-level global
    # df_res instead of the src_data parameter.
    col_name = src_data.columns[item_no]
    df = np.log(df_src + 1)
    diffs = (df - df.shift(1)).dropna()
    pred_mean = diffs.mean()
    pred_std = diffs.std()
    # Draw 61 daily log-increments and centre them on the series mean.
    pred_values = np.random.normal(pred_mean, pred_std, 61)
    res_pow = pred_values + df.mean()
    pred_df_values = np.floor(np.exp(res_pow) - 1)
    hist_df = pd.DataFrame(np.floor(df_src.values), columns=[col_name],
                           index=pd.date_range('20150301', periods=183))
    pred_df = pd.DataFrame(pred_df_values, columns=[col_name],
                           index=pd.date_range('20150901', periods=61))
    # DataFrame.append was removed in pandas 2.0; concat is the
    # drop-in equivalent here.
    pred_all_df = pd.concat([hist_df, pred_df])
    # Flatten to long format (one row per artist/day) and reorder the
    # columns so the date string comes last.
    pred_all_df.index = pred_all_df.index.strftime('%Y%m%d')
    predict_res = pred_all_df.stack().reset_index()
    predict_res.columns = ['Ds', 'artist_id', 'plays']
    predict_res['Ds1'] = predict_res['Ds']
    predict_res = predict_res.iloc[:, 1:4]
    return predict_res
In [ ]:
# Load the raw user-action log (one row per user/song event).
csv_dat=read_user_data()
In [ ]:
res_dat=data_ETL_user(csv_dat)
csv_singer=read_singer_data()
csv_singer=csv_singer[['artist_id','song_init_plays']]
temp_dat=csv_singer.join(res_dat,how='inner')
temp_dat=temp_dat.reset_index()
init_plays=temp_dat[['song_id','song_init_plays']]
temp_res=temp_dat.groupby(['artist_id']).sum()
temp_res=temp_res.iloc[:,1:].T
In [ ]:
# Re-index the artist play matrix on a real DatetimeIndex: 183 daily rows
# starting 2015-03-01 (presumably the competition's history window — confirm).
df_res = pd.DataFrame(temp_res.values, columns=temp_res.columns,index=pd.date_range('20150301',periods=183))
In [ ]:
#df_res.plot(figsize=(16, 7),legend=None)
In [ ]:
# Forecast every artist series and write the submission CSV.
# Consistency fix: use the NSize constant instead of a duplicated
# hard-coded 50; pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
pred_all = predict_next(0, df_res)
for artist_idx in range(1, NSize):
    predict_df = predict_next(artist_idx, df_res)
    pred_all = pd.concat([pred_all, predict_df])
pred_all.to_csv("g:\\music_predict_01.csv", header=False, index=False)
In [ ]:
def select_store_dat(index_code, info_dat):
    """Return the rows of ``info_dat`` whose store_code equals
    ``index_code``, with the second index level dropped."""
    # f"{...:d}" renders identically to the original "%d" formatting.
    subset = info_dat.query(f"store_code=={index_code:d}")
    return subset.reset_index(level=1, drop=True)
In [ ]:
#plot([Bar(x=res_tg.index, y=res_tg.target),Bar(x=res_tg.index, y=res_tg.qty_alipay_njhs)])
In [ ]:
#Verify whether the predicted sales follow exponential decay or a long-tail (power-law) distribution
#res_tg[['target','qty_alipay_njhs']].iloc[:,0:1].sort_values(by='target',ascending=False).reset_index(drop=True).plot(title="",figsize=(12, 7))