In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import coo_matrix,csr_matrix,csc_matrix, hstack
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn import linear_model
import gc
In [2]:
%pwd
Out[2]:
In [4]:
%ls
In [2]:
dtypes = {'Semana': 'int32',
          'Agencia_ID': 'int32',
          'Canal_ID': 'int32',
          'Ruta_SAK': 'int32',
          'Cliente_ID': 'int32',
          'Producto_ID': 'int32',
          'Venta_hoy': 'float32',
          'Venta_uni_hoy': 'int32',
          'Dev_uni_proxima': 'int32',
          'Dev_proxima': 'float32',
          'Demanda_uni_equil': 'int32'}
In [3]:
train_dataset = pd.read_csv('origin/train.csv',
                            usecols=['Semana','Agencia_ID','Canal_ID','Ruta_SAK',
                                     'Cliente_ID','Producto_ID','Demanda_uni_equil'],
                            dtype=dtypes)
train_dataset['log_demand'] = train_dataset['Demanda_uni_equil'].apply(np.log1p)
train_dataset.drop(['Demanda_uni_equil'],axis = 1,inplace = True)
train_dataset.head()
Out[3]:
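Demand is trained on in log1p space because the competition metric is RMSLE: minimizing RMSE on log1p(demand) is the same as minimizing RMSLE on demand. Predictions therefore need the inverse transform before submission. A minimal sketch of the round trip (not a cell from this notebook):
import numpy as np
y = np.array([0, 1, 4, 10])   # raw demand
y_log = np.log1p(y)           # what the models are trained on
y_back = np.expm1(y_log)      # inverse transform for submission
assert np.allclose(y, y_back)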
In [5]:
pivot_train = pd.read_pickle('pivot_train_with_nan.pickle')
pivot_train = pivot_train.rename(columns={3: 'Sem3', 4: 'Sem4',5: 'Sem5', 6: 'Sem6',7: 'Sem7', 8: 'Sem8',9: 'Sem9'})
pivot_train.head()
Out[5]:
In [6]:
pivot_test = pd.read_pickle('pivot_test.pickle')
pivot_test.rename(columns = {'Semana':'sem10_sem11'},inplace = True)
pivot_test.shape
Out[6]:
In [7]:
pivot_test.columns.values
Out[7]:
In [29]:
agencia_for_cliente_producto = (train_dataset[['Cliente_ID','Producto_ID','Agencia_ID']]
                                .groupby(['Cliente_ID','Producto_ID'])
                                .agg(lambda x: x.value_counts().index[0]).reset_index())
canal_for_cliente_producto = (train_dataset[['Cliente_ID','Producto_ID','Canal_ID']]
                              .groupby(['Cliente_ID','Producto_ID'])
                              .agg(lambda x: x.value_counts().index[0]).reset_index())
ruta_for_cliente_producto = (train_dataset[['Cliente_ID','Producto_ID','Ruta_SAK']]
                             .groupby(['Cliente_ID','Producto_ID'])
                             .agg(lambda x: x.value_counts().index[0]).reset_index())
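Each aggregation above picks the mode: `value_counts()` sorts by frequency, so `.index[0]` is the most common Agencia/Canal/Ruta through which a client buys a product. A toy illustration with made-up IDs:
import pandas as pd
toy = pd.DataFrame({'Cliente_ID': [1, 1, 1, 2],
                    'Producto_ID': [9, 9, 9, 9],
                    'Agencia_ID': [5, 5, 7, 8]})
mode_agencia = (toy.groupby(['Cliente_ID', 'Producto_ID'])['Agencia_ID']
                   .agg(lambda x: x.value_counts().index[0]).reset_index())
print(mode_agencia)   # Cliente 1 / Producto 9 -> Agencia 5 (appears twice)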
In [22]:
gc.collect()
Out[22]:
In [37]:
# these are pickles, so store them with a .pickle extension rather than .csv
agencia_for_cliente_producto.to_pickle('agencia_for_cliente_producto.pickle')
canal_for_cliente_producto.to_pickle('canal_for_cliente_producto.pickle')
ruta_for_cliente_producto.to_pickle('ruta_for_cliente_producto.pickle')
In [8]:
agencia_for_cliente_producto = pd.read_pickle('agencia_for_cliente_producto.pickle')
canal_for_cliente_producto = pd.read_pickle('canal_for_cliente_producto.pickle')
ruta_for_cliente_producto = pd.read_pickle('ruta_for_cliente_producto.pickle')
In [6]:
# train_dataset['log_demand'] = train_dataset['Demanda_uni_equil'].apply(np.log1p)
pivot_train = pd.pivot_table(data= train_dataset[['Cliente_ID','Producto_ID','log_demand','Semana']],
values='log_demand', index=['Cliente_ID','Producto_ID'],
columns=['Semana'], aggfunc=np.mean,fill_value = 0).reset_index()
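The pivot turns the long (Cliente, Producto, Semana) log-demand records into one row per client-product pair with one column per week; `fill_value = 0` gives the zero-filled variant, while omitting it keeps NaN for unobserved weeks (the `_with_nan` pickle used elsewhere). A toy version of the reshape:
import pandas as pd
import numpy as np
toy = pd.DataFrame({'Cliente_ID': [1, 1, 2], 'Producto_ID': [9, 9, 9],
                    'Semana': [3, 4, 3], 'log_demand': [0.7, 1.1, 0.3]})
wide = pd.pivot_table(toy, values='log_demand', index=['Cliente_ID', 'Producto_ID'],
                      columns='Semana', aggfunc=np.mean).reset_index()
print(wide)   # week columns 3 and 4; Cliente 2 has NaN for week 4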
In [5]:
pivot_train.head()
Out[5]:
In [9]:
pivot_train = pd.merge(left = pivot_train, right = agencia_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])
pivot_train = pd.merge(left = pivot_train, right = canal_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])
pivot_train = pd.merge(left = pivot_train, right = ruta_for_cliente_producto, how = 'inner', on = ['Cliente_ID','Producto_ID'])
In [11]:
pivot_train.to_pickle('pivot_train_with_zero.pickle')
In [4]:
pivot_train = pd.read_pickle('pivot_train_with_zero.pickle')
In [15]:
pivot_train.to_pickle('pivot_train_with_nan.pickle')
In [3]:
pivot_train = pd.read_pickle('pivot_train_with_nan.pickle')
In [15]:
pivot_train = pivot_train.rename(columns={3: 'Sem3', 4: 'Sem4',5: 'Sem5', 6: 'Sem6',7: 'Sem7', 8: 'Sem8',9: 'Sem9'})
In [16]:
pivot_train.head()
Out[16]:
In [8]:
pivot_train.columns.values
Out[8]:
In [7]:
test_dataset = pd.read_csv('origin/test.csv')
test_dataset.head()
Out[7]:
In [7]:
test_dataset[test_dataset['Semana'] == 10].shape
Out[7]:
In [8]:
test_dataset[test_dataset['Semana'] == 11].shape
Out[8]:
In [74]:
pivot_test = pd.merge(left=pivot_train, right = test_dataset[['id','Cliente_ID','Producto_ID','Semana']],
on =['Cliente_ID','Producto_ID'],how = 'inner' )
pivot_test.head()
Out[74]:
In [8]:
pivot_test_new = pd.merge(pivot_train[['Cliente_ID', 'Producto_ID', 'Sem3', 'Sem4', 'Sem5', 'Sem6', 'Sem7',
'Sem8', 'Sem9']],right = test_dataset, on = ['Cliente_ID','Producto_ID'],how = 'right')
In [9]:
pivot_test_new.head()
Out[9]:
In [10]:
pivot_test_new.to_pickle('pivot_test.pickle')
In [75]:
pivot_test.to_pickle('pivot_test.pickle')
In [4]:
pivot_test = pd.read_pickle('pivot_test.pickle')
pivot_test.head()
Out[4]:
In [52]:
train_dataset.head()
Out[52]:
In [70]:
import itertools
col_list = ['Agencia_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID']
all_combine = itertools.combinations(col_list,2)
In [71]:
list_2element_combine = [list(pair) for pair in all_combine]
In [75]:
col_1elm_2elm = col_list + list_2element_combine
col_1elm_2elm
Out[75]:
In [79]:
train_dataset_test = train_dataset[train_dataset['Semana'] < 8].copy()
In [11]:
def categorical_useful(train_dataset, pivot_train):
    # if is_train:
    #     train_dataset_test = train_dataset[train_dataset['Semana'] < 8].copy()
    # elif is_train == False:
    train_dataset_test = train_dataset.copy()
    # mean log demand per ID and per ID pair (target encodings)
    log_demand_by_agen = train_dataset_test[['Agencia_ID','log_demand']].groupby('Agencia_ID').mean().reset_index()
    log_demand_by_ruta = train_dataset_test[['Ruta_SAK','log_demand']].groupby('Ruta_SAK').mean().reset_index()
    log_demand_by_cliente = train_dataset_test[['Cliente_ID','log_demand']].groupby('Cliente_ID').mean().reset_index()
    log_demand_by_producto = train_dataset_test[['Producto_ID','log_demand']].groupby('Producto_ID').mean().reset_index()
    log_demand_by_agen_ruta = train_dataset_test[['Agencia_ID','Ruta_SAK','log_demand']].groupby(
        ['Agencia_ID','Ruta_SAK']).mean().reset_index()
    log_demand_by_agen_cliente = train_dataset_test[['Agencia_ID','Cliente_ID','log_demand']].groupby(
        ['Agencia_ID','Cliente_ID']).mean().reset_index()
    log_demand_by_agen_producto = train_dataset_test[['Agencia_ID','Producto_ID','log_demand']].groupby(
        ['Agencia_ID','Producto_ID']).mean().reset_index()
    log_demand_by_ruta_cliente = train_dataset_test[['Ruta_SAK','Cliente_ID','log_demand']].groupby(
        ['Ruta_SAK','Cliente_ID']).mean().reset_index()
    log_demand_by_ruta_producto = train_dataset_test[['Ruta_SAK','Producto_ID','log_demand']].groupby(
        ['Ruta_SAK','Producto_ID']).mean().reset_index()
    log_demand_by_cliente_producto = train_dataset_test[['Cliente_ID','Producto_ID','log_demand']].groupby(
        ['Cliente_ID','Producto_ID']).mean().reset_index()
    log_demand_by_cliente_producto_agen = train_dataset_test[['Cliente_ID','Producto_ID','Agencia_ID','log_demand']].groupby(
        ['Cliente_ID','Agencia_ID','Producto_ID']).mean().reset_index()
    log_sum_by_cliente = train_dataset_test[['Cliente_ID','log_demand']].groupby('Cliente_ID').sum().reset_index()
    # row counts per ID ("frequency" features); Semana is only a convenient column to count
    ruta_freq_semana = train_dataset[['Semana','Ruta_SAK']].groupby(['Ruta_SAK']).count().reset_index()
    clien_freq_semana = train_dataset[['Semana','Cliente_ID']].groupby(['Cliente_ID']).count().reset_index()
    agen_freq_semana = train_dataset[['Semana','Agencia_ID']].groupby(['Agencia_ID']).count().reset_index()
    prod_freq_semana = train_dataset[['Semana','Producto_ID']].groupby(['Producto_ID']).count().reset_index()
    pivot_train = pd.merge(left=pivot_train, right=ruta_freq_semana,
                           how='left', on=['Ruta_SAK']).rename(columns={'Semana': 'ruta_freq'})
    pivot_train = pd.merge(left=pivot_train, right=clien_freq_semana,
                           how='left', on=['Cliente_ID']).rename(columns={'Semana': 'clien_freq'})
    pivot_train = pd.merge(left=pivot_train, right=agen_freq_semana,
                           how='left', on=['Agencia_ID']).rename(columns={'Semana': 'agen_freq'})
    pivot_train = pd.merge(left=pivot_train, right=prod_freq_semana,
                           how='left', on=['Producto_ID']).rename(columns={'Semana': 'prod_freq'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_agen,
                           how='left', on=['Agencia_ID']).rename(columns={'log_demand': 'agen_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_ruta,
                           how='left', on=['Ruta_SAK']).rename(columns={'log_demand': 'ruta_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_cliente,
                           how='left', on=['Cliente_ID']).rename(columns={'log_demand': 'cliente_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_producto,
                           how='left', on=['Producto_ID']).rename(columns={'log_demand': 'producto_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_agen_ruta,
                           how='left', on=['Agencia_ID','Ruta_SAK']).rename(columns={'log_demand': 'agen_ruta_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_agen_cliente,
                           how='left', on=['Agencia_ID','Cliente_ID']).rename(columns={'log_demand': 'agen_cliente_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_agen_producto,
                           how='left', on=['Agencia_ID','Producto_ID']).rename(columns={'log_demand': 'agen_producto_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_ruta_cliente,
                           how='left', on=['Ruta_SAK','Cliente_ID']).rename(columns={'log_demand': 'ruta_cliente_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_ruta_producto,
                           how='left', on=['Ruta_SAK','Producto_ID']).rename(columns={'log_demand': 'ruta_producto_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_cliente_producto,
                           how='left', on=['Cliente_ID','Producto_ID']).rename(columns={'log_demand': 'cliente_producto_for_log_de'})
    pivot_train = pd.merge(left=pivot_train, right=log_sum_by_cliente,
                           how='left', on=['Cliente_ID']).rename(columns={'log_demand': 'cliente_for_log_sum'})
    pivot_train = pd.merge(left=pivot_train, right=log_demand_by_cliente_producto_agen,
                           how='left', on=['Cliente_ID','Producto_ID','Agencia_ID']).rename(
                               columns={'log_demand': 'cliente_producto_agen_for_log_sum'})
    pivot_train['corr'] = (pivot_train['producto_for_log_de'] * pivot_train['cliente_for_log_de']
                           / train_dataset_test['log_demand'].median())
    return pivot_train
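Everything in `categorical_useful` is mean target encoding plus frequency counts: the average log demand for each ID (or ID pair) and the number of training rows per ID are merged back onto the pivot as features. A toy version of one encoding (made-up values):
import pandas as pd
toy = pd.DataFrame({'Agencia_ID': [1, 1, 2], 'log_demand': [0.5, 1.5, 2.0]})
enc = (toy.groupby('Agencia_ID')['log_demand'].mean().reset_index()
          .rename(columns={'log_demand': 'agen_for_log_de'}))
print(enc)   # Agencia 1 -> 1.0, Agencia 2 -> 2.0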
In [12]:
def define_time_features(df, to_predict='t_plus_1', t_0=8):
    # lag features, named relative to the week being predicted (t_0)
    if to_predict == 't_plus_1':
        df['t_min_1'] = df['Sem' + str(t_0 - 1)]
    if to_predict == 't_plus_2' and 'Sem' + str(t_0 - 6) in df.columns:
        # guarded: e.g. the 3456 -> 8 frame has no Sem2, so t_min_6 is skipped there
        df['t_min_6'] = df['Sem' + str(t_0 - 6)]
    df['t_min_2'] = df['Sem' + str(t_0 - 2)]
    df['t_min_3'] = df['Sem' + str(t_0 - 3)]
    df['t_min_4'] = df['Sem' + str(t_0 - 4)]
    df['t_min_5'] = df['Sem' + str(t_0 - 5)]
    # pairwise differences between lags capture the recent trend
    if to_predict == 't_plus_1':
        df['t1_min_t2'] = df['t_min_1'] - df['t_min_2']
        df['t1_min_t3'] = df['t_min_1'] - df['t_min_3']
        df['t1_min_t4'] = df['t_min_1'] - df['t_min_4']
        df['t1_min_t5'] = df['t_min_1'] - df['t_min_5']
    if to_predict == 't_plus_2' and 't_min_6' in df.columns:
        df['t2_min_t6'] = df['t_min_2'] - df['t_min_6']
        df['t3_min_t6'] = df['t_min_3'] - df['t_min_6']
        df['t4_min_t6'] = df['t_min_4'] - df['t_min_6']
        df['t5_min_t6'] = df['t_min_5'] - df['t_min_6']
    df['t2_min_t3'] = df['t_min_2'] - df['t_min_3']
    df['t2_min_t4'] = df['t_min_2'] - df['t_min_4']
    df['t2_min_t5'] = df['t_min_2'] - df['t_min_5']
    df['t3_min_t4'] = df['t_min_3'] - df['t_min_4']
    df['t3_min_t5'] = df['t_min_3'] - df['t_min_5']
    df['t4_min_t5'] = df['t_min_4'] - df['t_min_5']
    return df
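The lag names are relative to the target week: for a one-step-ahead frame with t_0 = 9, `t_min_1` is Sem8, `t_min_2` is Sem7, and so on, and the differences encode the recent trend. A quick check on a toy row:
import pandas as pd
toy = pd.DataFrame({'Sem4': [1.0], 'Sem5': [2.0], 'Sem6': [2.5],
                    'Sem7': [3.0], 'Sem8': [3.5]})
out = define_time_features(toy.copy(), to_predict='t_plus_1', t_0=9)
print(out[['t_min_1', 't_min_2', 't1_min_t2']])   # 3.5, 3.0, 0.5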
In [13]:
def lin_regr(row, to_predict, t_0, semanas_numbers):
    # fit a straight line through one product's weekly means and extrapolate past t_0
    row = row.copy()
    row.index = semanas_numbers
    row = row.dropna()
    if len(row) > 2:                      # need at least 3 points for a stable fit
        X = np.ones(shape=(len(row), 2))
        X[:, 1] = row.index
        y = row.values
        regr = linear_model.LinearRegression()
        regr.fit(X, y)
        if to_predict == 't_plus_1':
            return regr.predict([[1, t_0 + 1]])[0]
        elif to_predict == 't_plus_2':
            return regr.predict([[1, t_0 + 2]])[0]
    return None
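`lin_regr` fits an ordinary least-squares line through one product's weekly means and extrapolates one or two weeks past t_0. A quick sanity check on a perfectly linear toy series:
import pandas as pd
row = pd.Series([1.0, 1.2, 1.4, 1.6, 1.8])   # weeks 4..8, slope 0.2
pred = lin_regr(row, to_predict='t_plus_1', t_0=8, semanas_numbers=[4, 5, 6, 7, 8])
print(pred)   # ~2.0, the week-9 extrapolation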
In [14]:
def lin_regr_features(pivot_df, to_predict, semanas_numbers, t_0):
    pivot_df = pivot_df.copy()
    semanas_names = ['Sem%i' % i for i in semanas_numbers]
    columns = semanas_names + ['Producto_ID']
    # average the weekly curves per product, then extrapolate each product's trend
    pivot_grouped = pivot_df[columns].groupby('Producto_ID').aggregate('mean')
    pivot_grouped['LR_prod'] = pivot_grouped[semanas_names].apply(
        lin_regr, axis=1, to_predict=to_predict, t_0=t_0, semanas_numbers=semanas_numbers)
    pivot_df = pd.merge(pivot_df, pivot_grouped[['LR_prod']], how='left',
                        left_on='Producto_ID', right_index=True)
    pivot_df['LR_prod_corr'] = pivot_df['LR_prod'] * pivot_df['cliente_for_log_sum'] / 100
    return pivot_df
In [15]:
cliente_tabla = pd.read_csv('origin/cliente_tabla.csv')
town_state = pd.read_csv('origin/town_state.csv')
In [16]:
town_state['town_id'] = town_state['Town'].str.split(expand=True)[0]   # first token of Town is the town id
In [17]:
def add_pro_info(dataset):
    train_basic_feature = dataset[['Cliente_ID','Producto_ID','Agencia_ID']].copy()
    train_basic_feature.drop_duplicates(inplace=True)
    cliente_per_town = pd.merge(train_basic_feature, cliente_tabla, on='Cliente_ID', how='inner')
    cliente_per_town = pd.merge(cliente_per_town, town_state[['Agencia_ID','town_id']],
                                on='Agencia_ID', how='inner')
    # number of client records per town (the count lands in the NombreCliente column)
    cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
    cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','town_id','Agencia_ID']],
                                            cliente_per_town_count, on='town_id', how='inner')
    cliente_per_town_count_final.drop_duplicates(inplace=True)
    dataset_final = pd.merge(dataset,
                             cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente','Agencia_ID']],
                             on=['Cliente_ID','Producto_ID','Agencia_ID'], how='left')
    return dataset_final
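`add_pro_info` attaches a market-size proxy: for every (Cliente_ID, Producto_ID, Agencia_ID) triple it counts how many client records fall in the agency's town. The core of the idea on made-up IDs:
import pandas as pd
towns = pd.DataFrame({'Agencia_ID': [1, 2], 'town_id': ['2008', '2008']})
clients = pd.DataFrame({'Cliente_ID': [10, 11, 12], 'Agencia_ID': [1, 1, 2]})
per_town = (clients.merge(towns, on='Agencia_ID')
                   .groupby('town_id')['Cliente_ID'].count()
                   .reset_index(name='n_clients'))
print(per_town)   # all three clients share town '2008' -> n_clients == 3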
In [18]:
pre_product = pd.read_csv('preprocessed_products.csv',index_col = 0)
pre_product['weight_per_piece'] = pd.to_numeric(pre_product['weight_per_piece'], errors='coerce')
pre_product['weight'] = pd.to_numeric(pre_product['weight'], errors='coerce')
pre_product['pieces'] = pd.to_numeric(pre_product['pieces'], errors='coerce')
In [19]:
def add_product(dataset):
    dataset = pd.merge(dataset, pre_product[['ID','weight','weight_per_piece','pieces']],
                       left_on='Producto_ID', right_on='ID', how='left')
    return dataset
In [ ]:
train_34567 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6,7]), :].copy()
train_pivot_34567_to_9 = pivot_train.loc[(pivot_train['Sem9'].notnull()),:].copy()
train_pivot_34567_to_9 = categorical_useful(train_34567,train_pivot_34567_to_9)
del train_34567
gc.collect()
train_pivot_34567_to_9 = define_time_features(train_pivot_34567_to_9, to_predict = 't_plus_2' , t_0 = 9)
train_pivot_34567_to_9 = lin_regr_features(train_pivot_34567_to_9, to_predict='t_plus_2',
                                           semanas_numbers=[3,4,5,6,7], t_0=7)   # t_0 = last observed week; target = t_0 + 2 = 9
train_pivot_34567_to_9['target'] = train_pivot_34567_to_9['Sem9']
train_pivot_34567_to_9.drop(['Sem8','Sem9'],axis =1,inplace = True)
#add cum_sum
train_pivot_cum_sum = train_pivot_34567_to_9[['Sem3','Sem4','Sem5','Sem6','Sem7']].cumsum(axis = 1)
train_pivot_34567_to_9.drop(['Sem3','Sem4','Sem5','Sem6','Sem7'],axis =1,inplace = True)
train_pivot_34567_to_9 = pd.concat([train_pivot_34567_to_9,train_pivot_cum_sum],axis =1)
train_pivot_34567_to_9 = train_pivot_34567_to_9.rename(columns={'Sem3': 't_m_6_cum',
'Sem4': 't_m_5_cum','Sem5': 't_m_4_cum',
'Sem6': 't_m_3_cum','Sem7': 't_m_2_cum'})
# add geo_info
train_pivot_34567_to_9 = add_pro_info(train_pivot_34567_to_9)
#add product info
train_pivot_34567_to_9 = add_product(train_pivot_34567_to_9)
train_pivot_34567_to_9.drop(['ID'],axis = 1,inplace = True)
gc.collect()
train_pivot_34567_to_9.head()
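The cumulative sums give the model the running total of past demand at each lag; renaming to `t_m_*_cum` keeps the shifted training frames (34567 -> 9 here, 45678 -> 9 in the next cell) column-compatible so they can be stacked. A toy view of the cumsum step:
import pandas as pd
toy = pd.DataFrame({'Sem3': [1.0], 'Sem4': [2.0], 'Sem5': [0.5]})
print(toy.cumsum(axis=1))   # 1.0, 3.0, 3.5 -> renamed to t_m_*_cum columns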
In [15]:
train_45678 = train_dataset.loc[train_dataset['Semana'].isin([4,5,6,7,8]), :].copy()
train_pivot_45678_to_9 = pivot_train.loc[(pivot_train['Sem9'].notnull()),:].copy()
train_pivot_45678_to_9 = categorical_useful(train_45678,train_pivot_45678_to_9)
del train_45678
gc.collect()
train_pivot_45678_to_9 = define_time_features(train_pivot_45678_to_9, to_predict = 't_plus_1' , t_0 = 9)
train_pivot_45678_to_9 = lin_regr_features(train_pivot_45678_to_9,to_predict ='t_plus_1',
semanas_numbers = [4,5,6,7,8],t_0 = 8)
train_pivot_45678_to_9['target'] = train_pivot_45678_to_9['Sem9']
train_pivot_45678_to_9.drop(['Sem3','Sem9'],axis =1,inplace = True)
#add cum_sum
train_pivot_cum_sum = train_pivot_45678_to_9[['Sem4','Sem5','Sem6','Sem7','Sem8']].cumsum(axis = 1)
train_pivot_45678_to_9.drop(['Sem4','Sem5','Sem6','Sem7','Sem8'],axis =1,inplace = True)
train_pivot_45678_to_9 = pd.concat([train_pivot_45678_to_9,train_pivot_cum_sum],axis =1)
train_pivot_45678_to_9 = train_pivot_45678_to_9.rename(columns={'Sem4': 't_m_5_cum',
'Sem5': 't_m_4_cum','Sem6': 't_m_3_cum', 'Sem7': 't_m_2_cum','Sem8': 't_m_1_cum'})
# add geo_info
train_pivot_45678_to_9 = add_pro_info(train_pivot_45678_to_9)
#add product info
train_pivot_45678_to_9 = add_product(train_pivot_45678_to_9)
train_pivot_45678_to_9.drop(['ID'],axis = 1,inplace = True)
gc.collect()
train_pivot_45678_to_9.head()
In [42]:
train_pivot_45678_to_9.columns.values
Out[42]:
In [47]:
train_pivot_45678_to_9.to_csv('train_pivot_45678_to_9.csv')
In [ ]:
train_pivot_45678_to_9 = pd.read_csv('train_pivot_45678_to_9.csv',index_col = 0)
In [47]:
train_pivot_45678_to_9.to_csv('train_pivot_45678_to_9_new.csv')
In [18]:
train_pivot_45678_to_9 = pd.read_csv('train_pivot_45678_to_9_new.csv',index_col = 0)
In [16]:
train_34567 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6,7]), :].copy()
train_pivot_34567_to_8 = pivot_train.loc[(pivot_train['Sem8'].notnull()),:].copy()
train_pivot_34567_to_8 = categorical_useful(train_34567,train_pivot_34567_to_8)
del train_34567
gc.collect()
train_pivot_34567_to_8 = define_time_features(train_pivot_34567_to_8, to_predict = 't_plus_1' , t_0 = 8)
train_pivot_34567_to_8 = lin_regr_features(train_pivot_34567_to_8,to_predict = 't_plus_1',
semanas_numbers = [3,4,5,6,7],t_0 = 7)
train_pivot_34567_to_8['target'] = train_pivot_34567_to_8['Sem8']
train_pivot_34567_to_8.drop(['Sem8','Sem9'],axis =1,inplace = True)
#add cum_sum
train_pivot_cum_sum = train_pivot_34567_to_8[['Sem3','Sem4','Sem5','Sem6','Sem7']].cumsum(axis = 1)
train_pivot_34567_to_8.drop(['Sem3','Sem4','Sem5','Sem6','Sem7'],axis =1,inplace = True)
train_pivot_34567_to_8 = pd.concat([train_pivot_34567_to_8,train_pivot_cum_sum],axis =1)
train_pivot_34567_to_8 = train_pivot_34567_to_8.rename(columns={'Sem3': 't_m_5_cum','Sem4': 't_m_4_cum',
'Sem5': 't_m_3_cum','Sem6': 't_m_2_cum',
'Sem7': 't_m_1_cum'})
# add product_info
train_pivot_34567_to_8 = add_pro_info(train_pivot_34567_to_8)
#add product
train_pivot_34567_to_8 = add_product(train_pivot_34567_to_8)
train_pivot_34567_to_8.drop(['ID'],axis = 1,inplace = True)
gc.collect()
train_pivot_34567_to_8.head()
Out[16]:
In [49]:
train_pivot_34567_to_8.columns.values
Out[49]:
In [50]:
train_pivot_34567_to_8.to_csv('train_pivot_34567_to_8.csv')
In [5]:
train_pivot_34567_to_8 = pd.read_csv('train_pivot_34567_to_8.csv',index_col = 0)
In [20]:
gc.collect()
Out[20]:
In [22]:
train_pivot_xgb_time1 = pd.concat([train_pivot_45678_to_9, train_pivot_34567_to_8],axis = 0,copy = False)
In [23]:
train_pivot_xgb_time1.columns.values
Out[23]:
In [24]:
train_pivot_xgb_time1.shape
Out[24]:
In [25]:
train_pivot_xgb_time1.to_csv('train_pivot_xgb_time1_44fea.csv')
In [54]:
train_pivot_xgb_time1.to_csv('train_pivot_xgb_time1.csv')
In [26]:
del train_pivot_xgb_time1
del train_pivot_45678_to_9
del train_pivot_34567_to_8
gc.collect()
Out[26]:
In [7]:
pivot_test.head()
Out[7]:
In [20]:
pivot_test_week10 = pivot_test.loc[pivot_test['sem10_sem11'] == 10].copy()
pivot_test_week10.reset_index(drop=True, inplace=True)
pivot_test_week10.head()
Out[20]:
In [21]:
pivot_test_week10.shape
Out[21]:
In [22]:
train_56789 = train_dataset.loc[train_dataset['Semana'].isin([5,6,7,8,9]), :].copy()
train_pivot_56789_to_10 = pivot_test_week10.copy()
train_pivot_56789_to_10 = categorical_useful(train_56789,train_pivot_56789_to_10)
del train_56789
gc.collect()
train_pivot_56789_to_10 = define_time_features(train_pivot_56789_to_10, to_predict = 't_plus_1' , t_0 = 10)
train_pivot_56789_to_10 = lin_regr_features(train_pivot_56789_to_10,to_predict ='t_plus_1' ,
semanas_numbers = [5,6,7,8,9],t_0 = 9)
train_pivot_56789_to_10.drop(['Sem3','Sem4'],axis =1,inplace = True)
#add cum_sum
train_pivot_cum_sum = train_pivot_56789_to_10[['Sem5','Sem6','Sem7','Sem8','Sem9']].cumsum(axis = 1)
train_pivot_56789_to_10.drop(['Sem5','Sem6','Sem7','Sem8','Sem9'],axis =1,inplace = True)
train_pivot_56789_to_10 = pd.concat([train_pivot_56789_to_10,train_pivot_cum_sum],axis =1)
train_pivot_56789_to_10 = train_pivot_56789_to_10.rename(columns={'Sem5': 't_m_5_cum',
'Sem6': 't_m_4_cum','Sem7': 't_m_3_cum',
'Sem8': 't_m_2_cum','Sem9': 't_m_1_cum'})
# add product_info
train_pivot_56789_to_10 = add_pro_info(train_pivot_56789_to_10)
#
train_pivot_56789_to_10 = add_product(train_pivot_56789_to_10)
train_pivot_56789_to_10.drop(['ID'],axis =1,inplace = True)
train_pivot_56789_to_10.head()
Out[22]:
In [23]:
train_pivot_56789_to_10.columns.values
Out[23]:
In [24]:
train_pivot_56789_to_10.to_pickle('train_pivot_56789_to_10_44fea.pickle')
In [27]:
train_3456 = train_dataset.loc[train_dataset['Semana'].isin([3,4,5,6]), :].copy()
train_pivot_3456_to_8 = pivot_train.loc[(pivot_train['Sem8'].notnull()),:].copy()
train_pivot_3456_to_8 = categorical_useful(train_3456,train_pivot_3456_to_8)
del train_3456
gc.collect()
train_pivot_3456_to_8 = define_time_features(train_pivot_3456_to_8, to_predict = 't_plus_2' , t_0 = 8)
# note: for lin_regr_features, t_0 is the last observed week (6); the target is week 8 = t_0 + 2
train_pivot_3456_to_8 = lin_regr_features(train_pivot_3456_to_8,to_predict = 't_plus_2', semanas_numbers = [3,4,5,6],t_0 = 6)
train_pivot_3456_to_8['target'] = train_pivot_3456_to_8['Sem8']
train_pivot_3456_to_8.drop(['Sem7','Sem8','Sem9'],axis =1,inplace = True)
#add cum_sum
train_pivot_cum_sum = train_pivot_3456_to_8[['Sem3','Sem4','Sem5','Sem6']].cumsum(axis = 1)
train_pivot_3456_to_8.drop(['Sem3','Sem4','Sem5','Sem6'],axis =1,inplace = True)
train_pivot_3456_to_8 = pd.concat([train_pivot_3456_to_8,train_pivot_cum_sum],axis =1)
train_pivot_3456_to_8 = train_pivot_3456_to_8.rename(columns={'Sem4': 't_m_4_cum',
'Sem5': 't_m_3_cum','Sem6': 't_m_2_cum', 'Sem3': 't_m_5_cum'})
# add product_info
train_pivot_3456_to_8 = add_pro_info(train_pivot_3456_to_8)
train_pivot_3456_to_8 = add_product(train_pivot_3456_to_8)
train_pivot_3456_to_8.drop(['ID'],axis =1,inplace = True)
train_pivot_3456_to_8.head()
Out[27]:
In [30]:
train_pivot_3456_to_8.columns.values
Out[30]:
In [58]:
train_pivot_3456_to_8.to_csv('train_pivot_3456_to_8.csv')
In [28]:
train_4567 = train_dataset.loc[train_dataset['Semana'].isin([4,5,6,7]), :].copy()
train_pivot_4567_to_9 = pivot_train.loc[(pivot_train['Sem9'].notnull()),:].copy()
train_pivot_4567_to_9 = categorical_useful(train_4567,train_pivot_4567_to_9)
del train_4567
gc.collect()
train_pivot_4567_to_9 = define_time_features(train_pivot_4567_to_9, to_predict = 't_plus_2' , t_0 = 9)
# note: for lin_regr_features, t_0 is the last observed week (7); the target is week 9 = t_0 + 2
train_pivot_4567_to_9 = lin_regr_features(train_pivot_4567_to_9,to_predict = 't_plus_2',
semanas_numbers = [4,5,6,7],t_0 = 7)
train_pivot_4567_to_9['target'] = train_pivot_4567_to_9['Sem9']
train_pivot_4567_to_9.drop(['Sem3','Sem8','Sem9'],axis =1,inplace = True)
#add cum_sum
train_pivot_cum_sum = train_pivot_4567_to_9[['Sem7','Sem4','Sem5','Sem6']].cumsum(axis = 1)
train_pivot_4567_to_9.drop(['Sem7','Sem4','Sem5','Sem6'],axis =1,inplace = True)
train_pivot_4567_to_9 = pd.concat([train_pivot_4567_to_9,train_pivot_cum_sum],axis =1)
train_pivot_4567_to_9 = train_pivot_4567_to_9.rename(columns={'Sem4': 't_m_5_cum',
'Sem5': 't_m_4_cum','Sem6': 't_m_3_cum', 'Sem7': 't_m_2_cum'})
# add product_info
train_pivot_4567_to_9 = add_pro_info(train_pivot_4567_to_9)
train_pivot_4567_to_9 = add_product(train_pivot_4567_to_9)
train_pivot_4567_to_9.drop(['ID'],axis =1,inplace = True)
train_pivot_4567_to_9.head()
Out[28]:
In [31]:
train_pivot_4567_to_9.columns.values
Out[31]:
In [16]:
train_pivot_4567_to_9.to_csv('train_pivot_4567_to_9.csv')
In [32]:
train_pivot_xgb_time2 = pd.concat([train_pivot_3456_to_8, train_pivot_4567_to_9],axis = 0,copy = False)
In [33]:
train_pivot_xgb_time2.columns.values
Out[33]:
In [34]:
train_pivot_xgb_time2.shape
Out[34]:
In [35]:
train_pivot_xgb_time2.to_csv('train_pivot_xgb_time2_38fea.csv')
In [66]:
train_pivot_xgb_time2 = pd.read_csv('train_pivot_xgb_time2.csv',index_col = 0)
train_pivot_xgb_time2.head()
Out[66]:
In [72]:
del train_pivot_3456_to_8
del train_pivot_4567_to_9
del train_pivot_xgb_time2
del train_pivot_34567_to_8
del train_pivot_45678_to_9
del train_pivot_xgb_time1
gc.collect()
Out[72]:
In [29]:
pivot_test_week11 = pivot_test_new.loc[pivot_test_new['Semana'] == 11].copy()
pivot_test_week11.reset_index(drop=True, inplace=True)
pivot_test_week11.head()
Out[29]:
In [83]:
pivot_test_week11.shape
Out[83]:
In [30]:
train_6789 = train_dataset.loc[train_dataset['Semana'].isin([6,7,8,9]), :].copy()
train_pivot_6789_to_11 = pivot_test_week11.copy()
train_pivot_6789_to_11 = categorical_useful(train_6789,train_pivot_6789_to_11)
del train_6789
gc.collect()
train_pivot_6789_to_11 = define_time_features(train_pivot_6789_to_11, to_predict = 't_plus_2' , t_0 = 11)
train_pivot_6789_to_11 = lin_regr_features(train_pivot_6789_to_11,to_predict ='t_plus_2' ,
semanas_numbers = [6,7,8,9],t_0 = 9)
train_pivot_6789_to_11.drop(['Sem3','Sem4','Sem5'],axis =1,inplace = True)
#add cum_sum
train_pivot_cum_sum = train_pivot_6789_to_11[['Sem6','Sem7','Sem8','Sem9']].cumsum(axis = 1)
train_pivot_6789_to_11.drop(['Sem6','Sem7','Sem8','Sem9'],axis =1,inplace = True)
train_pivot_6789_to_11 = pd.concat([train_pivot_6789_to_11,train_pivot_cum_sum],axis =1)
train_pivot_6789_to_11 = train_pivot_6789_to_11.rename(columns={'Sem6': 't_m_5_cum',
'Sem7': 't_m_4_cum', 'Sem8': 't_m_3_cum','Sem9': 't_m_2_cum'})
# add product_info
train_pivot_6789_to_11 = add_pro_info(train_pivot_6789_to_11)
train_pivot_6789_to_11 = add_product(train_pivot_6789_to_11)
train_pivot_6789_to_11.drop(['ID'],axis = 1,inplace = True)
train_pivot_6789_to_11.head()
Out[30]:
In [85]:
train_pivot_6789_to_11.shape
Out[85]:
In [31]:
train_pivot_6789_to_11.to_pickle('train_pivot_6789_to_11_new.pickle')
In [27]:
%time pivot_train_categorical_useful = categorical_useful(train_dataset, pivot_train)
In [34]:
pivot_train_categorical_useful.to_csv('pivot_train_categorical_useful_with_nan.csv')
In [2]:
pivot_train_categorical_useful_train = pd.read_csv('pivot_train_categorical_useful_with_nan.csv',index_col = 0)
pivot_train_categorical_useful_train.head()
Out[2]:
In [13]:
pivot_train_categorical_useful.head()
Out[13]:
In [19]:
pivot_train_categorical_useful_time = define_time_features(pivot_train_categorical_useful,
to_predict = 't_plus_1' , t_0 = 8)
In [20]:
pivot_train_categorical_useful_time.head()
Out[20]:
In [22]:
pivot_train_categorical_useful_time.columns
Out[22]:
In [33]:
# Linear regression features
pivot_train_categorical_useful_time_LR = lin_regr_features(pivot_train_categorical_useful_time, to_predict='t_plus_1',
                                                           semanas_numbers=[3,4,5,6,7], t_0=7)
pivot_train_categorical_useful_time_LR.head()
Out[33]:
In [34]:
pivot_train_categorical_useful_time_LR.columns
Out[34]:
In [42]:
pivot_train_categorical_useful_time_LR.to_csv('pivot_train_categorical_useful_time_LR.csv')
In [16]:
pivot_train_categorical_useful_time_LR = pd.read_csv('pivot_train_categorical_useful_time_LR.csv',index_col = 0)
In [17]:
pivot_train_categorical_useful_time_LR.head()
Out[17]:
In [4]:
# pivot_train_canal = pd.get_dummies(pivot_train_categorical_useful_train['Canal_ID'])
In [5]:
# pivot_train_categorical_useful_train = pivot_train_categorical_useful_train.join(pivot_train_canal)
# pivot_train_categorical_useful_train.head()
In [2]:
%ls
In [4]:
pre_product = pd.read_csv('preprocessed_products.csv',index_col = 0)
pre_product.head()
pre_product['weight_per_piece'] = pd.to_numeric(pre_product['weight_per_piece'], errors='coerce')
pre_product['weight'] = pd.to_numeric(pre_product['weight'], errors='coerce')
pre_product['pieces'] = pd.to_numeric(pre_product['pieces'], errors='coerce')
pivot_train_categorical_useful_time_LR_weight = pd.merge(pivot_train_categorical_useful_time_LR,
pre_product[['ID','weight','weight_per_piece']],
left_on = 'Producto_ID',right_on = 'ID',how = 'left')
pivot_train_categorical_useful_time_LR_weight.head()
Out[4]:
In [50]:
pivot_train_categorical_useful_time_LR_weight.to_csv('pivot_train_categorical_useful_time_LR_weight.csv')
In [2]:
pivot_train_categorical_useful_time_LR_weight = pd.read_csv('pivot_train_categorical_useful_time_LR_weight.csv',index_col = 0)
pivot_train_categorical_useful_time_LR_weight.head()
Out[2]:
In [22]:
%cd '/media/siyuan/0009E198000CD19B/bimbo/origin'
%ls
In [23]:
cliente_tabla = pd.read_csv('cliente_tabla.csv')
town_state = pd.read_csv('town_state.csv')
town_state['town_id'] = town_state['Town'].str.split(expand=True)[0]   # first token of Town is the town id
train_basic_feature = pivot_train_categorical_useful_time_LR_weight[['Cliente_ID','Producto_ID','Agencia_ID']]
cliente_per_town = pd.merge(train_basic_feature,cliente_tabla,on = 'Cliente_ID',how= 'inner' )
cliente_per_town = pd.merge(cliente_per_town,town_state[['Agencia_ID','town_id']],on = 'Agencia_ID',how= 'inner' )
cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
cliente_per_town_count['NombreCliente'] = cliente_per_town_count['NombreCliente']/float(100000)
cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','Agencia_ID','town_id']],
cliente_per_town_count,on = 'town_id',how = 'left')
pivot_train_categorical_useful_time_LR_weight_town = pd.merge(pivot_train_categorical_useful_time_LR_weight,
cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente']],
on = ['Cliente_ID','Producto_ID'],how = 'left')
In [24]:
cliente_tabla.head()
Out[24]:
In [25]:
town_state.head()
Out[25]:
In [26]:
town_state['town_id'] = town_state['Town'].str.split(expand=True)[0]   # first token of Town is the town id
In [27]:
town_state.head()
Out[27]:
In [13]:
pivot_train_categorical_useful_time_LR_weight.columns.values
Out[13]:
In [34]:
train_basic_feature = pivot_train_categorical_useful_time_LR_weight[['Cliente_ID','Producto_ID','Agencia_ID']]
In [35]:
cliente_per_town = pd.merge(train_basic_feature,cliente_tabla,on = 'Cliente_ID',how= 'inner' )
cliente_per_town = pd.merge(cliente_per_town,town_state[['Agencia_ID','town_id']],on = 'Agencia_ID',how= 'inner' )
In [36]:
cliente_per_town.head()
Out[36]:
In [37]:
cliente_per_town_count = cliente_per_town[['NombreCliente','town_id']].groupby('town_id').count().reset_index()
cliente_per_town_count['NombreCliente'] = cliente_per_town_count['NombreCliente']/float(100000)
In [38]:
cliente_per_town_count.head()
Out[38]:
In [39]:
cliente_per_town_count_final = pd.merge(cliente_per_town[['Cliente_ID','Producto_ID','Agencia_ID','town_id']],
cliente_per_town_count,on = 'town_id',how = 'left')
cliente_per_town_count_final.head()
Out[39]:
In [42]:
pivot_train_categorical_useful_time_LR_weight_town = pd.merge(pivot_train_categorical_useful_time_LR_weight,
cliente_per_town_count_final[['Cliente_ID','Producto_ID','NombreCliente']],
on = ['Cliente_ID','Producto_ID'],how = 'left')
pivot_train_categorical_useful_time_LR_weight_town.head()
Out[42]:
In [43]:
pivot_train_categorical_useful_time_LR_weight_town.columns.values
Out[43]:
In [7]:
train_pivot_xgb_time1.columns.values
Out[7]:
In [8]:
train_pivot_xgb_time1 = train_pivot_xgb_time1.drop(['Cliente_ID','Producto_ID','Agencia_ID',
'Ruta_SAK','Canal_ID'],axis = 1)
In [45]:
pivot_train_categorical_useful_train_time_no_nan = pivot_train_categorical_useful_train[pivot_train_categorical_useful_train['Sem8'].notnull()]
# pivot_train_categorical_useful_train_time_no_nan = pivot_train_categorical_useful_train[pivot_train_categorical_useful_train['Sem9'].notnull()]
In [46]:
pivot_train_categorical_useful_train_time_no_nan_sample = pivot_train_categorical_useful_train_time_no_nan.sample(1000000)
In [47]:
train_feature = pivot_train_categorical_useful_train_time_no_nan_sample.drop(['Sem8','Sem9'],axis = 1)
train_label = pivot_train_categorical_useful_train_time_no_nan_sample[['Sem8','Sem9']]
In [9]:
# separate training and validation data
# features: train_feature, labels: train_label
%time train_set, valid_set, train_labels, valid_labels = train_test_split(train_feature, train_label, test_size=0.10)
In [48]:
dtrain = xgb.DMatrix(train_feature, label=train_label['Sem8'], missing=np.nan)
In [49]:
param = {'booster': 'gbtree',
         'nthread': 7,
         'max_depth': 6,
         'eta': 0.2,
         'silent': 0,
         'subsample': 0.7,
         'objective': 'reg:linear',
         'eval_metric': 'rmse',
         'colsample_bytree': 0.7}
# param = {'eta':0.1, 'eval_metric':'rmse','nthread': 8}
# evallist = [(dvalid,'eval'), (dtrain,'train')]
In [ ]:
num_round = 1000
# plst = param.items()
# bst = xgb.train( plst, dtrain, num_round, evallist )
# verbose_eval replaces the old show_progress argument in current xgboost
cvresult = xgb.cv(param, dtrain, num_round, nfold=5, verbose_eval=True, show_stdv=False,
                  seed=0, early_stopping_rounds=10)
print(cvresult.tail())
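xgb.cv only reports cross-validated RMSE; a deployable booster still has to be trained. A minimal sketch, assuming the early-stopped length of cvresult is used as the round count (the submission step itself is not part of this notebook):
best_rounds = len(cvresult)              # rounds kept after early stopping
bst = xgb.train(param, dtrain, num_boost_round=best_rounds)
# predictions come back in log1p space; invert with np.expm1 for a submission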
In [51]:
# xgb.plot_importance(cvresult)