In [ ]:
import pandas as pd
import numpy as np
import math as math
In [ ]:
#from plotly.offline import plot
#from plotly.graph_objs import *
In [ ]:
#import statsmodels.api as sm
#from statsmodels.graphics.api import qqplot
#from scipy import stats
#from sklearn import linear_model
#from sklearn.cross_validation import train_test_split
#from sklearn.linear_model import LinearRegression
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
# Number of artist series to forecast in the prediction loop below.
NSize=50
In [ ]:
def read_user_data(path="G:\\TianChi\\music\\p2_mars_tianchi_user_actions.csv"):
    """Load the Tianchi user-action log.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the original hard-coded
        path so existing zero-argument callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_id`` (first CSV column), with columns
        ``song_id``, ``gmt_create``, ``action_type`` and ``Ds``.
    """
    # The raw file has no header row, hence header=None + explicit names.
    user_action = pd.read_csv(
        path,
        names=['user_id', 'song_id', 'gmt_create', 'action_type', 'Ds'],
        header=None,
        index_col=0,
    )
    return user_action
def read_singer_data(path="G:\\TianChi\\music\\p2_mars_tianchi_songs.csv"):
    """Load the Tianchi song/artist metadata table.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the original hard-coded
        path so existing zero-argument callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``song_id`` (first CSV column), with columns
        ``artist_id``, ``publish_time``, ``song_init_plays``,
        ``Language`` and ``Gender``.
    """
    # The raw file has no header row, hence header=None + explicit names.
    singer_data = pd.read_csv(
        path,
        names=['song_id', 'artist_id', 'publish_time',
               'song_init_plays', 'Language', 'Gender'],
        header=None,
        index_col=0,
    )
    return singer_data
In [ ]:
def data_ETL_user(data_info):
    """Pivot the user-action log into per-song daily play counts.

    Keeps only rows whose ``action_type`` equals 1 (play events), then
    builds a table indexed by ``song_id`` with one column per ``Ds`` day
    holding the number of plays (0 where no plays occurred).
    """
    plays = data_info[data_info['action_type'] == 1]
    # Count events per (day, song); gmt_create serves as an always-present
    # column for .count() to tally.
    daily_counts = plays.groupby(['Ds', 'song_id']).count()[['gmt_create']]
    # Widen to days x songs, zero-fill missing days, then flip so each
    # song becomes a row and collapse the helper column level.
    wide = daily_counts.unstack().fillna(value=0)
    return wide.T.reset_index().groupby(['song_id']).sum()
def data_ETL_singer(data_info):
    """Group song metadata by artist.

    Returns the raw ``DataFrameGroupBy`` object keyed on ``artist_id``
    so callers can choose their own aggregation (sum, count, ...).
    """
    grouped_by_artist = data_info.groupby(['artist_id'])
    return grouped_by_artist
In [ ]:
def predict_next(item_no, src_data):
    """Forecast 61 daily play counts for one artist column of ``src_data``.

    The history is log(1+x) transformed; the mean/std of its first
    differences parameterise 61 normal draws, which are centred on the
    series mean and mapped back with exp()-1. The history (183 days from
    2015-03-01) and the forecast (61 days from 2015-09-01) are returned
    together in long format.

    Parameters
    ----------
    item_no : int
        Positional column index into ``src_data``.
    src_data : pandas.DataFrame
        One column per artist; 183 daily rows starting 2015-03-01.

    Returns
    -------
    pandas.DataFrame
        Columns ``['artist_id', 'plays', 'Ds1']``, one row per
        (artist, day) over the 244 combined days.
    """
    df_src = src_data.iloc[:, item_no]
    # Bug fix: the column label was taken from the module-level global
    # df_res instead of the src_data parameter.
    col_name = src_data.columns[item_no]
    df = np.log(df_src + 1)
    diffs = (df - df.shift(1)).dropna()
    pred_mean = diffs.mean()
    pred_std = diffs.std()
    # Draw 61 daily log-increments and centre them on the series mean.
    pred_values = np.random.normal(pred_mean, pred_std, 61)
    res_pow = pred_values + df.mean()
    pred_df_values = np.floor(np.exp(res_pow) - 1)
    hist_df = pd.DataFrame(np.floor(df_src.values), columns=[col_name],
                           index=pd.date_range('20150301', periods=183))
    pred_df = pd.DataFrame(pred_df_values, columns=[col_name],
                           index=pd.date_range('20150901', periods=61))
    # DataFrame.append was removed in pandas 2.0; concat is the
    # drop-in equivalent here.
    pred_all_df = pd.concat([hist_df, pred_df])
    # Flatten to long format (one row per artist/day) and reorder the
    # columns so the date string comes last.
    pred_all_df.index = pred_all_df.index.strftime('%Y%m%d')
    predict_res = pred_all_df.stack().reset_index()
    predict_res.columns = ['Ds', 'artist_id', 'plays']
    predict_res['Ds1'] = predict_res['Ds']
    predict_res = predict_res.iloc[:, 1:4]
    return predict_res
In [ ]:
# Load the raw user-action log (one row per user/song event).
csv_dat=read_user_data()
In [ ]:
res_dat=data_ETL_user(csv_dat)
csv_singer=read_singer_data()
csv_singer=csv_singer[['artist_id','song_init_plays']]
temp_dat=csv_singer.join(res_dat,how='inner')
temp_dat=temp_dat.reset_index()
init_plays=temp_dat[['song_id','song_init_plays']]
temp_res=temp_dat.groupby(['artist_id']).sum()
temp_res=temp_res.iloc[:,1:].T
In [ ]:
# Re-index the artist play matrix on a real DatetimeIndex: 183 daily rows
# starting 2015-03-01 (presumably the competition's history window — confirm).
df_res = pd.DataFrame(temp_res.values, columns=temp_res.columns,index=pd.date_range('20150301',periods=183))
In [ ]:
#df_res.plot(figsize=(16, 7),legend=None)
In [ ]:
# Forecast every artist series and write the submission CSV.
# Consistency fix: use the NSize constant instead of a duplicated
# hard-coded 50; pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
pred_all = predict_next(0, df_res)
for artist_idx in range(1, NSize):
    predict_df = predict_next(artist_idx, df_res)
    pred_all = pd.concat([pred_all, predict_df])
pred_all.to_csv("g:\\music_predict_01.csv", header=False, index=False)
In [ ]:
def select_store_dat(index_code, info_dat):
    """Return the rows of ``info_dat`` whose store_code equals
    ``index_code``, with the second index level dropped."""
    # f"{...:d}" renders identically to the original "%d" formatting.
    subset = info_dat.query(f"store_code=={index_code:d}")
    return subset.reset_index(level=1, drop=True)
In [ ]:
#plot([Bar(x=res_tg.index, y=res_tg.target),Bar(x=res_tg.index, y=res_tg.qty_alipay_njhs)])
In [ ]:
#Verify whether the predicted sales follow exponential decay or a long-tail (power-law) distribution
#res_tg[['target','qty_alipay_njhs']].iloc[:,0:1].sort_values(by='target',ascending=False).reset_index(drop=True).plot(title="",figsize=(12, 7))