In [1]:
# Imports: dataframes/arrays, the three model libraries (tensorflow, xgboost,
# sklearn linear models), sparse-matrix utilities, and preprocessing helpers.
import pandas as pd
import numpy as np
import tensorflow as tf
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import coo_matrix,csr_matrix,csc_matrix, hstack
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn import linear_model
import gc
from sklearn import preprocessing

In [1]:
# List the working-directory contents (IPython line magic) to verify the
# expected pickles/CSVs/models are present before loading them below.
%ls


1_predata.ipynb                   preprocessed_products.csv
3_xgb.ipynb                       ruta_for_cliente_producto.csv
3_xgb_prediction.ipynb            stack_sub/
44fea_bst.model                   submission_10_new.csv
4_keras_nn.ipynb                  submission_11_new.csv
5_random_forest.ipynb             submission_44fea.csv
6_stack_model.ipynb               submission_nn.csv
agencia_for_cliente_producto.csv  submission_nn_xgb
canal_for_cliente_producto.csv    train_pivot_56789_to_10_44fea.pickle
model_nn_10_after_l2reg.h5        train_pivot_56789_to_10_new.pickle
model_nn_10.h5                    train_pivot_6789_to_11_new.pickle
model_nn_10_whole.h5              train_pivot_xgb_time1_44fea.csv
old_submission/                   train_pivot_xgb_time1.csv
origin/                           train_pivot_xgb_time2_38fea.csv
pivot_test.pickle                 train_pivot_xgb_time2.csv
pivot_train_with_nan.pickle

Read the training data for the level-2 (stacking) model


  • for xgb
  • for nn


In [7]:
# Load the level-1 training predictions: NN predictions from a pickle,
# XGBoost predictions from a headerless CSV, plus the target column from
# the week-10 training pivot.
stack_train_nn_10 = pd.read_pickle('stack_sub/stack_train_nn_10.pickle')
stack_train_xgb_10 = pd.read_csv('stack_sub/stack_train_xgb_10.csv', index_col=False, header=None)
train_label = pd.read_csv('train_pivot_xgb_time1.csv', usecols=['target'])

# Quick shape check: both prediction frames should have the same row count.
print(stack_train_nn_10.shape)
print(stack_train_xgb_10.shape)


(20768652, 1)
(20768652, 2)

In [10]:
# Normalize column names of the two level-1 prediction frames, then assemble
# the level-2 training frame: one column per base model plus the target.
# Columns are assigned one at a time to keep the order xgb, nn, target.
stack_train_xgb_10 = stack_train_xgb_10.rename(columns={1: 'xgb'})
stack_train_nn_10 = stack_train_nn_10.rename(columns={'predict': 'nn'})

stack_train = pd.DataFrame()
stack_train['xgb'] = stack_train_xgb_10['xgb']
stack_train['nn'] = stack_train_nn_10['nn']
stack_train['target'] = train_label['target']
stack_train.head()


Out[10]:
xgb nn
0 2.767780 3.470119
1 2.835551 2.667063
2 1.999626 1.882208
3 3.505517 3.318728
4 4.278482 4.153247

In [14]:
# Preview the assembled stacking frame (xgb, nn, target columns).
stack_train.head()


Out[14]:
xgb nn target
0 2.767780 3.470119 4.574711
1 2.835551 2.667063 2.639057
2 1.999626 1.882208 2.397895
3 3.505517 3.318728 3.784190
4 4.278482 4.153247 4.682131

Train the level-2 XGBoost model (cross-validated)



In [43]:
# Hyperparameters for the level-2 XGBoost model. Only two input features
# (xgb, nn), so a shallow tree and a fairly high learning rate are used.
param_10 = {'booster':'gbtree',
         'nthread': 7,  # CPU threads
         'max_depth':5, 
         'eta':0.4,  # learning rate
         'silent':1,  # suppress per-tree logging
         'subsample':0.7, 
         'objective':'reg:linear',  # squared-error regression
         'eval_metric':'rmse'}

In [36]:
# Split the stacking frame into label and features (the two base-model
# prediction columns), then wrap them in an XGBoost DMatrix.
train_label_10 = stack_train['target']
train_feature_10 = stack_train.drop(['target'],axis = 1)

# missing=np.nan: NaNs in the features are treated as missing values.
dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)

In [44]:
# 5-fold CV for the level-2 model. num_round is a generous cap; with
# early_stopping_rounds=5 training stops once test RMSE stalls (the log
# below shows it plateaus around round ~50 at test-rmse ≈ 0.4489).
num_round = 1500

cvresult = xgb.cv(param_10, dtrain_10, num_round, nfold=5,show_stdv=False,
                        seed = 42, early_stopping_rounds=5,verbose_eval = 1)
print(cvresult.tail())


[0]	train-rmse:0.89846	test-rmse:0.898465
[1]	train-rmse:0.647989	test-rmse:0.647995
[2]	train-rmse:0.529401	test-rmse:0.529408
[3]	train-rmse:0.479527	test-rmse:0.479536
[4]	train-rmse:0.460222	test-rmse:0.460231
[5]	train-rmse:0.45305	test-rmse:0.453061
[6]	train-rmse:0.450429	test-rmse:0.450443
[7]	train-rmse:0.449472	test-rmse:0.449488
[8]	train-rmse:0.449121	test-rmse:0.44914
[9]	train-rmse:0.448989	test-rmse:0.44901
[10]	train-rmse:0.448935	test-rmse:0.448957
[11]	train-rmse:0.44891	test-rmse:0.448935
[12]	train-rmse:0.448898	test-rmse:0.448925
[13]	train-rmse:0.44889	test-rmse:0.448919
[14]	train-rmse:0.448885	test-rmse:0.448916
[15]	train-rmse:0.448879	test-rmse:0.448911
[16]	train-rmse:0.448874	test-rmse:0.448908
[17]	train-rmse:0.44887	test-rmse:0.448904
[18]	train-rmse:0.448867	test-rmse:0.448902
[19]	train-rmse:0.448863	test-rmse:0.4489
[20]	train-rmse:0.448859	test-rmse:0.448896
[21]	train-rmse:0.448854	test-rmse:0.448893
[22]	train-rmse:0.448851	test-rmse:0.44889
[23]	train-rmse:0.448848	test-rmse:0.448889
[24]	train-rmse:0.448845	test-rmse:0.448887
[25]	train-rmse:0.448842	test-rmse:0.448885
[26]	train-rmse:0.44884	test-rmse:0.448885
[27]	train-rmse:0.448838	test-rmse:0.448883
[28]	train-rmse:0.448836	test-rmse:0.448881
[29]	train-rmse:0.448834	test-rmse:0.448881
[30]	train-rmse:0.448833	test-rmse:0.44888
[31]	train-rmse:0.448831	test-rmse:0.44888
[32]	train-rmse:0.448829	test-rmse:0.448879
[33]	train-rmse:0.448827	test-rmse:0.448878
[34]	train-rmse:0.448826	test-rmse:0.448878
[35]	train-rmse:0.448824	test-rmse:0.448877
[36]	train-rmse:0.448822	test-rmse:0.448877
[37]	train-rmse:0.448821	test-rmse:0.448877
[38]	train-rmse:0.448819	test-rmse:0.448876
[39]	train-rmse:0.448818	test-rmse:0.448874
[40]	train-rmse:0.448817	test-rmse:0.448874
[41]	train-rmse:0.448816	test-rmse:0.448874
[42]	train-rmse:0.448815	test-rmse:0.448874
[43]	train-rmse:0.448814	test-rmse:0.448873
[44]	train-rmse:0.448813	test-rmse:0.448874
[45]	train-rmse:0.448811	test-rmse:0.448873
[46]	train-rmse:0.44881	test-rmse:0.448873
[47]	train-rmse:0.448809	test-rmse:0.448873
[48]	train-rmse:0.448808	test-rmse:0.448874
[49]	train-rmse:0.448807	test-rmse:0.448873
[50]	train-rmse:0.448806	test-rmse:0.448873
[51]	train-rmse:0.448805	test-rmse:0.448873
[52]	train-rmse:0.448805	test-rmse:0.448873
[53]	train-rmse:0.448804	test-rmse:0.448873
    test-rmse-mean  test-rmse-std  train-rmse-mean  train-rmse-std
45        0.448873       0.000128         0.448811        0.000033
46        0.448873       0.000128         0.448810        0.000033
47        0.448873       0.000128         0.448809        0.000033
48        0.448874       0.000128         0.448808        0.000033
49        0.448873       0.000129         0.448807        0.000033

Read the base models' submission files (test-set predictions) for level-2 inference



In [31]:
# Load the two base-model submission files (test-set predictions) and bring
# the id index back as a regular column. NOTE: the *_train_* names are reused
# here for test-set frames, matching the rest of the notebook.
stack_train_nn_10 = pd.read_csv('stack_sub/submission_nn_2.csv', index_col=0).reset_index()
stack_train_xgb_10 = pd.read_csv('stack_sub/submission_xgb_2.csv', index_col=0).reset_index()

# Rename each prediction column after the model that produced it.
stack_train_xgb_10 = stack_train_xgb_10.rename(columns={'Demanda_uni_equil': 'xgb'})
stack_train_nn_10 = stack_train_nn_10.rename(columns={'Demanda_uni_equil': 'nn'})

# Both frames must line up row-for-row.
print(stack_train_nn_10.shape)
print(stack_train_xgb_10.shape)


(6999251, 2)
(6999251, 2)

In [32]:
# Merge the NN predictions into the xgb frame, then map both prediction
# columns through log1p so they are on the same scale the level-2 model was
# trained on (the submission files presumably store back-transformed demand
# — TODO confirm against the submission-writing cells).
stack_train_xgb_10['nn'] = stack_train_nn_10['nn']
# FIX: np.log1p applied to the whole Series is vectorized — numerically
# identical to .apply(np.log1p) but avoids a Python-level loop over ~7M rows.
stack_train_xgb_10['nn'] = np.log1p(stack_train_xgb_10['nn'])
stack_train_xgb_10['xgb'] = np.log1p(stack_train_xgb_10['xgb'])

In [33]:
# Preview the merged, log1p-transformed test frame (id, xgb, nn).
stack_train_xgb_10.head()


Out[33]:
id xgb nn
0 1569352 2.128232 1.740466
1 6667200 3.627004 3.629660
2 1592616 2.990720 3.000720
3 3909690 4.172848 4.207673
4 3659672 3.634951 3.577948

In [34]:
# Keep only the first 3,538,385 rows of the test frame. The constant was a
# magic number inline; presumably it is the size of one time-slice of the
# test set (e.g. the week-10 portion) — TODO confirm against the raw test file.
N_ROWS_TIME1 = 3538385
stack_train_xgb_10 = stack_train_xgb_10.iloc[:N_ROWS_TIME1]
print(stack_train_xgb_10.shape)
stack_train_xgb_10.head()


(3538385, 3)
Out[34]:
id xgb nn
0 1569352 2.128232 1.740466
1 6667200 3.627004 3.629660
2 1592616 2.990720 3.000720
3 3909690 4.172848 4.207673
4 3659672 3.634951 3.577948