In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import coo_matrix,csr_matrix,csc_matrix, hstack
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn import linear_model
import gc
from sklearn import preprocessing

In [2]:
%ls


1_xgboost.ipynb*         origin/
3_xgb_prediction.ipynb*  pivot_test.pickle*
4_keras_nn.ipynb*        pivot_train_with_nan.pickle*
5_random_forest.ipynb*   train_pivot_45678_to_9.csv*
matplotlib-siyuan/       train_pivot_56789_to_10_new.pickle*
model_stack/             train_pivot_6789_to_11_new.pickle*
old_ipython/             train_pivot_xgb_time1.csv*
old_submission/          train_pivot_xgb_time2.csv*

In [ ]:

begin training for week 11



In [18]:
predictors_target_11 = ['LR_prod', 'LR_prod_corr',
       'NombreCliente',
       'agen_cliente_for_log_de', 'agen_for_log_de',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_for_log_de', 'corr', 'pieces',
       'producto_for_log_de', 'ruta_cliente_for_log_de', 'ruta_for_log_de',
       'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
       't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
       't_min_5', 'target', 'weight', 'weight_per_piece']

In [19]:
predictors_11 = ['LR_prod', 'LR_prod_corr',
       'NombreCliente',
       'agen_cliente_for_log_de', 'agen_for_log_de',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_for_log_de', 'corr', 'pieces',
       'producto_for_log_de', 'ruta_cliente_for_log_de', 'ruta_for_log_de',
       'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
       't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
       't_min_5', 'weight', 'weight_per_piece']

In [10]:
f = lambda x : (x-x.mean())/x.std(ddof=0)
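
`f` is a column-wise z-score: subtract each column's mean and divide by its population standard deviation (`ddof=0`), so every normalized column ends up with mean ≈ 0 and std ≈ 1. A minimal sketch on a made-up toy frame (assumes `pd` from In [1] and the `f` defined just above):

In [ ]:
# toy check of the column-wise standardization applied by normalize_dataset below
toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
toy_norm = toy.apply(f, axis=0)
print(toy_norm.mean())          # ~0 for each column
print(toy_norm.std(ddof=0))     # ~1 for each column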

In [14]:
train_pivot_xgb_time2 = pd.read_csv('train_pivot_xgb_time2.csv',index_col = 0)

In [7]:
train_pivot_6789_to_11 = pd.read_pickle('train_pivot_6789_to_11_new.pickle')

In [8]:
train_pivot_xgb_time2.head()


Out[8]:
Agencia_ID Canal_ID Cliente_ID LR_prod LR_prod_corr NombreCliente Producto_ID Ruta_SAK agen_cliente_for_log_de agen_for_log_de ... t_m_3_cum t_m_4_cum t_m_5_cum t_min_2 t_min_3 t_min_4 t_min_5 target weight weight_per_piece
0 2061 2 26 2.001190 7.293554 18434 1182 7212 2.852285 3.491654 ... NaN NaN 3.688879 NaN NaN NaN 3.688879 0.000000 210.0 210.00
1 2061 2 26 1.839411 6.703932 18434 4767 7212 2.852285 3.491654 ... NaN NaN 3.761200 NaN NaN NaN 3.761200 3.761200 250.0 NaN
2 2061 2 26 1.911283 6.965878 18434 31393 7212 2.852285 3.491654 ... 8.650325 5.877736 3.044522 2.772589 2.772589 2.833213 3.044522 3.135494 640.0 NaN
3 2061 2 26 3.113374 11.347029 18434 34204 7212 2.852285 3.491654 ... 11.024839 7.218177 3.784190 3.555348 3.806662 3.433987 3.784190 3.828641 450.0 56.25
4 2061 2 26 2.031231 7.403043 18434 34206 7212 2.852285 3.491654 ... 12.963710 9.202510 4.795791 4.248495 3.761200 4.406719 4.795791 4.499810 340.0 42.50

5 rows × 38 columns


In [15]:
train_pivot_xgb_time2.columns.values


Out[15]:
array(['Agencia_ID', 'Canal_ID', 'Cliente_ID', 'LR_prod', 'LR_prod_corr',
       'NombreCliente', 'Producto_ID', 'Ruta_SAK',
       'agen_cliente_for_log_de', 'agen_for_log_de',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_for_log_de', 'corr', 'pieces',
       'producto_for_log_de', 'ruta_cliente_for_log_de', 'ruta_for_log_de',
       'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
       't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
       't_min_5', 'target', 'weight', 'weight_per_piece'], dtype=object)

In [4]:
def normalize_dataset(train_dataset,test_dataset):
    # z-score the train and test features together so both share the same column means/stds
    train_dataset_normalize = train_dataset[predictors_11].copy()
    train_dataset_normalize['label'] = 0    # mark train rows

    test_dataset_normalize = test_dataset[predictors_11].copy()
    test_dataset_normalize['label'] = 1     # mark test rows

    whole_dataset = pd.concat([train_dataset_normalize,test_dataset_normalize])
    whole_dataset_normalize = whole_dataset.apply(f,axis = 0)   # column-wise standardization ('label' is normalized too, but dropped below)

    # split back apart using the un-normalized label column
    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 0]
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 1]

    train_dataset_normalize.drop(['label'],axis = 1,inplace = True)
    test_dataset_normalize.drop(['label'],axis = 1,inplace = True)

    # keep the target on its original scale (not standardized)
    train_dataset_normalize['target'] = train_dataset['target'].copy()

    return train_dataset_normalize,test_dataset_normalize

In [21]:
train_dataset_normalize, test_dataset_normalize = normalize_dataset(train_pivot_xgb_time2,train_pivot_6789_to_11)


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
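
The SettingWithCopyWarning above is raised because `normalize_dataset` drops columns and assigns on slices taken out of `whole_dataset_normalize`. The result is still correct, but a variant that slices onto explicit copies avoids the warning. A hedged sketch (the name `normalize_dataset_copy` is made up; it assumes `pd`, `f`, and `predictors_11` from the cells above):

In [ ]:
def normalize_dataset_copy(train_dataset, test_dataset):
    # same logic as normalize_dataset, but every slice is detached with .copy()
    # before being mutated, so pandas never writes through a view
    train_part = train_dataset[predictors_11].copy()
    train_part['label'] = 0
    test_part = test_dataset[predictors_11].copy()
    test_part['label'] = 1

    whole = pd.concat([train_part, test_part])
    whole_norm = whole.apply(f, axis=0)            # column-wise z-score

    train_norm = whole_norm.loc[whole['label'] == 0].copy()
    test_norm = whole_norm.loc[whole['label'] == 1].copy()
    train_norm = train_norm.drop(['label'], axis=1)
    test_norm = test_norm.drop(['label'], axis=1)

    train_norm['target'] = train_dataset['target'].copy()
    return train_norm, test_norm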

In [22]:
train_dataset_normalize.head()


Out[22]:
Semana LR_prod LR_prod_corr NombreCliente agen_cliente_for_log_de agen_for_log_de agen_producto_for_log_de agen_ruta_for_log_de cliente_for_log_de cliente_for_log_sum cliente_producto_for_log_de ... t_m_3_cum t_m_4_cum t_m_5_cum t_min_2 t_min_3 t_min_4 t_min_5 weight weight_per_piece target
0 0.440004 0.007984 -1.198863 2.893915 7.040262 4.922515 3.707101 2.951726 0.023732 2.468511 ... NaN NaN 0.841755 NaN NaN NaN 2.148223 0.041948 0.552880 0.000000
1 0.136858 0.004843 -1.198863 2.893915 7.040262 4.561988 3.707101 2.951726 0.023732 2.558317 ... NaN NaN 0.888582 NaN NaN NaN 2.230611 0.180385 NaN 3.761200
2 0.271533 0.006239 -1.198863 2.893915 7.040262 3.000979 3.707101 2.951726 0.023732 1.433925 ... 1.028141 0.851082 0.424536 1.090852 1.086921 1.149949 1.414172 1.530144 NaN 3.135494
3 2.524041 0.029578 -1.198863 2.893915 7.040262 3.917930 3.707101 2.951726 0.023732 2.414081 ... 1.752516 1.424506 0.903467 1.968861 2.245187 1.825675 2.256800 0.872569 -0.290791 3.828641
4 0.496296 0.008568 -1.198863 2.893915 7.040262 5.719378 3.707101 2.951726 0.023732 3.231175 ... 2.343992 2.273380 1.558475 2.746354 2.194265 2.919764 3.409216 0.491868 -0.366242 4.499810

5 rows × 33 columns


In [26]:
train_pivot_xgb_time2_sample = train_dataset_normalize.sample(2000000)

train_feature_11 = train_pivot_xgb_time2_sample.drop(['target'],axis = 1)
train_label_11 = train_pivot_xgb_time2_sample[['target']]

dtrain_11 = xgb.DMatrix(train_feature_11,label = train_label_11,missing=np.nan)

In [27]:
num_round = 1000

# param_11 is defined in the In [24] cell below; it had already been executed when this cell ran
cvresult = xgb.cv(param_11, dtrain_11, num_round, nfold=5,verbose_eval = 1,show_stdv=False,
                        seed = 0, early_stopping_rounds=5)
print(cvresult.tail())


[0]	train-rmse:1.14019	test-rmse:1.14021
[1]	train-rmse:0.961458	test-rmse:0.961473
[2]	train-rmse:0.824843	test-rmse:0.824873
[3]	train-rmse:0.722328	test-rmse:0.722397
[4]	train-rmse:0.647489	test-rmse:0.647603
[5]	train-rmse:0.592862	test-rmse:0.593041
[6]	train-rmse:0.555021	test-rmse:0.555254
[7]	train-rmse:0.528662	test-rmse:0.528924
[8]	train-rmse:0.510662	test-rmse:0.510977
[9]	train-rmse:0.498551	test-rmse:0.498903
[10]	train-rmse:0.49003	test-rmse:0.490423
[11]	train-rmse:0.484163	test-rmse:0.484588
[12]	train-rmse:0.480003	test-rmse:0.480457
[13]	train-rmse:0.477039	test-rmse:0.47752
[14]	train-rmse:0.474897	test-rmse:0.475405
[15]	train-rmse:0.473305	test-rmse:0.473844
[16]	train-rmse:0.472061	test-rmse:0.472641
[17]	train-rmse:0.471142	test-rmse:0.471752
[18]	train-rmse:0.470369	test-rmse:0.471
[19]	train-rmse:0.469729	test-rmse:0.470375
[20]	train-rmse:0.469123	test-rmse:0.469783
[21]	train-rmse:0.468683	test-rmse:0.469369
[22]	train-rmse:0.468239	test-rmse:0.468944
[23]	train-rmse:0.467862	test-rmse:0.468596
[24]	train-rmse:0.467516	test-rmse:0.468284
[25]	train-rmse:0.467183	test-rmse:0.467968
[26]	train-rmse:0.46683	test-rmse:0.467642
[27]	train-rmse:0.466558	test-rmse:0.467384
[28]	train-rmse:0.466233	test-rmse:0.467087
[29]	train-rmse:0.465967	test-rmse:0.46684
[30]	train-rmse:0.46573	test-rmse:0.466622
[31]	train-rmse:0.465487	test-rmse:0.466401
[32]	train-rmse:0.465309	test-rmse:0.466247
[33]	train-rmse:0.465145	test-rmse:0.466103
[34]	train-rmse:0.464955	test-rmse:0.465931
[35]	train-rmse:0.464779	test-rmse:0.465778
[36]	train-rmse:0.464622	test-rmse:0.465647
[37]	train-rmse:0.46445	test-rmse:0.465496
[38]	train-rmse:0.464265	test-rmse:0.465335
[39]	train-rmse:0.464089	test-rmse:0.46519
[40]	train-rmse:0.463953	test-rmse:0.465087
[41]	train-rmse:0.463803	test-rmse:0.464965
[42]	train-rmse:0.463669	test-rmse:0.46485
[43]	train-rmse:0.463526	test-rmse:0.46473
[44]	train-rmse:0.463345	test-rmse:0.46458
[45]	train-rmse:0.46323	test-rmse:0.464488
[46]	train-rmse:0.463083	test-rmse:0.46436
[47]	train-rmse:0.462961	test-rmse:0.464254
[48]	train-rmse:0.462844	test-rmse:0.464164
[49]	train-rmse:0.462671	test-rmse:0.464006
[50]	train-rmse:0.462573	test-rmse:0.463937
[51]	train-rmse:0.462433	test-rmse:0.463811
[52]	train-rmse:0.462326	test-rmse:0.463717
[53]	train-rmse:0.462213	test-rmse:0.46363
[54]	train-rmse:0.462113	test-rmse:0.463553
[55]	train-rmse:0.462006	test-rmse:0.463481
[56]	train-rmse:0.461917	test-rmse:0.463415
[57]	train-rmse:0.461803	test-rmse:0.463323
[58]	train-rmse:0.461723	test-rmse:0.463264
[59]	train-rmse:0.461629	test-rmse:0.463187
[60]	train-rmse:0.461547	test-rmse:0.463137
[61]	train-rmse:0.461431	test-rmse:0.46305
[62]	train-rmse:0.461339	test-rmse:0.462979
[63]	train-rmse:0.461268	test-rmse:0.462928
[64]	train-rmse:0.461137	test-rmse:0.462813
[65]	train-rmse:0.461038	test-rmse:0.462732
[66]	train-rmse:0.460964	test-rmse:0.462681
[67]	train-rmse:0.460849	test-rmse:0.462587
[68]	train-rmse:0.460744	test-rmse:0.462503
[69]	train-rmse:0.460662	test-rmse:0.462446
[70]	train-rmse:0.460593	test-rmse:0.462395
[71]	train-rmse:0.460521	test-rmse:0.462347
[72]	train-rmse:0.460432	test-rmse:0.462281
[73]	train-rmse:0.460351	test-rmse:0.46221
[74]	train-rmse:0.46028	test-rmse:0.462161
[75]	train-rmse:0.460207	test-rmse:0.462113
[76]	train-rmse:0.460113	test-rmse:0.462038
[77]	train-rmse:0.460044	test-rmse:0.461992
[78]	train-rmse:0.459984	test-rmse:0.461954
[79]	train-rmse:0.459917	test-rmse:0.461911
[80]	train-rmse:0.45985	test-rmse:0.461865
[81]	train-rmse:0.459802	test-rmse:0.461841
[82]	train-rmse:0.459757	test-rmse:0.461811
[83]	train-rmse:0.4597	test-rmse:0.461779
[84]	train-rmse:0.459629	test-rmse:0.461723
[85]	train-rmse:0.459556	test-rmse:0.46167
[86]	train-rmse:0.45951	test-rmse:0.461643
[87]	train-rmse:0.459459	test-rmse:0.461609
[88]	train-rmse:0.459389	test-rmse:0.461553
[89]	train-rmse:0.459321	test-rmse:0.461508
[90]	train-rmse:0.45926	test-rmse:0.46146
[91]	train-rmse:0.459208	test-rmse:0.461435
[92]	train-rmse:0.459144	test-rmse:0.461394
[93]	train-rmse:0.459064	test-rmse:0.461334
[94]	train-rmse:0.458977	test-rmse:0.461258
[95]	train-rmse:0.458892	test-rmse:0.461193
[96]	train-rmse:0.45883	test-rmse:0.461158
[97]	train-rmse:0.458763	test-rmse:0.461109
[98]	train-rmse:0.458715	test-rmse:0.461084
[99]	train-rmse:0.458667	test-rmse:0.461051
[100]	train-rmse:0.45862	test-rmse:0.461025
[101]	train-rmse:0.458561	test-rmse:0.460985
[102]	train-rmse:0.458485	test-rmse:0.460929
[103]	train-rmse:0.458429	test-rmse:0.46089
[104]	train-rmse:0.458358	test-rmse:0.46085
[105]	train-rmse:0.458298	test-rmse:0.46081
[106]	train-rmse:0.458249	test-rmse:0.460781
[107]	train-rmse:0.458208	test-rmse:0.460761
[108]	train-rmse:0.458152	test-rmse:0.460726
[109]	train-rmse:0.458104	test-rmse:0.460691
[110]	train-rmse:0.458062	test-rmse:0.460668
[111]	train-rmse:0.458015	test-rmse:0.46064
[112]	train-rmse:0.45795	test-rmse:0.460591
[113]	train-rmse:0.457892	test-rmse:0.460552
[114]	train-rmse:0.457838	test-rmse:0.460513
[115]	train-rmse:0.457792	test-rmse:0.460485
[116]	train-rmse:0.457736	test-rmse:0.460451
[117]	train-rmse:0.457677	test-rmse:0.460414
[118]	train-rmse:0.457629	test-rmse:0.460385
[119]	train-rmse:0.457591	test-rmse:0.460364
[120]	train-rmse:0.457548	test-rmse:0.460338
[121]	train-rmse:0.457504	test-rmse:0.460321
[122]	train-rmse:0.457451	test-rmse:0.46029
[123]	train-rmse:0.4574	test-rmse:0.460258
[124]	train-rmse:0.457357	test-rmse:0.460242
[125]	train-rmse:0.457313	test-rmse:0.460215
[126]	train-rmse:0.457275	test-rmse:0.460195
[127]	train-rmse:0.457239	test-rmse:0.460181
[128]	train-rmse:0.457175	test-rmse:0.460133
[129]	train-rmse:0.457127	test-rmse:0.460101
[130]	train-rmse:0.457072	test-rmse:0.460069
[131]	train-rmse:0.45704	test-rmse:0.460059
[132]	train-rmse:0.456989	test-rmse:0.460023
[133]	train-rmse:0.456961	test-rmse:0.46001
[134]	train-rmse:0.456914	test-rmse:0.459983
[135]	train-rmse:0.456855	test-rmse:0.459941
[136]	train-rmse:0.4568	test-rmse:0.459908
[137]	train-rmse:0.456758	test-rmse:0.459893
[138]	train-rmse:0.456722	test-rmse:0.459872
[139]	train-rmse:0.456691	test-rmse:0.459851
[140]	train-rmse:0.456637	test-rmse:0.459813
[141]	train-rmse:0.456591	test-rmse:0.459779
[142]	train-rmse:0.456547	test-rmse:0.459753
[143]	train-rmse:0.456497	test-rmse:0.459717
[144]	train-rmse:0.456462	test-rmse:0.4597
[145]	train-rmse:0.456428	test-rmse:0.459689
[146]	train-rmse:0.45639	test-rmse:0.459668
[147]	train-rmse:0.456343	test-rmse:0.459636
[148]	train-rmse:0.456296	test-rmse:0.459606
[149]	train-rmse:0.456264	test-rmse:0.459592
[150]	train-rmse:0.456216	test-rmse:0.459563
[151]	train-rmse:0.45618	test-rmse:0.459541
[152]	train-rmse:0.456143	test-rmse:0.459521
[153]	train-rmse:0.456107	test-rmse:0.459505
[154]	train-rmse:0.45607	test-rmse:0.45948
[155]	train-rmse:0.456035	test-rmse:0.459458
[156]	train-rmse:0.455993	test-rmse:0.459439
[157]	train-rmse:0.455953	test-rmse:0.459421
[158]	train-rmse:0.455928	test-rmse:0.459423
[159]	train-rmse:0.455896	test-rmse:0.459411
[160]	train-rmse:0.455849	test-rmse:0.459382
[161]	train-rmse:0.455798	test-rmse:0.459353
[162]	train-rmse:0.455758	test-rmse:0.459332
[163]	train-rmse:0.45572	test-rmse:0.459312
[164]	train-rmse:0.455671	test-rmse:0.459274
[165]	train-rmse:0.455643	test-rmse:0.459267
[166]	train-rmse:0.455599	test-rmse:0.459238
[167]	train-rmse:0.455578	test-rmse:0.459229
[168]	train-rmse:0.455557	test-rmse:0.459222
[169]	train-rmse:0.455519	test-rmse:0.459205
[170]	train-rmse:0.455479	test-rmse:0.459182
[171]	train-rmse:0.455424	test-rmse:0.459142
[172]	train-rmse:0.455386	test-rmse:0.459117
[173]	train-rmse:0.455333	test-rmse:0.459082
[174]	train-rmse:0.455288	test-rmse:0.459061
[175]	train-rmse:0.455247	test-rmse:0.459037
[176]	train-rmse:0.455217	test-rmse:0.459022
[177]	train-rmse:0.455182	test-rmse:0.459006
[178]	train-rmse:0.45515	test-rmse:0.458995
[179]	train-rmse:0.455126	test-rmse:0.458987
[180]	train-rmse:0.455077	test-rmse:0.458954
[181]	train-rmse:0.455041	test-rmse:0.45894
[182]	train-rmse:0.455009	test-rmse:0.458925
[183]	train-rmse:0.454971	test-rmse:0.458911
[184]	train-rmse:0.454937	test-rmse:0.458897
[185]	train-rmse:0.454906	test-rmse:0.458882
[186]	train-rmse:0.454867	test-rmse:0.458865
[187]	train-rmse:0.454838	test-rmse:0.458856
[188]	train-rmse:0.454808	test-rmse:0.458839
[189]	train-rmse:0.454758	test-rmse:0.458799
[190]	train-rmse:0.454732	test-rmse:0.458785
[191]	train-rmse:0.454707	test-rmse:0.458781
[192]	train-rmse:0.454676	test-rmse:0.458763
[193]	train-rmse:0.454644	test-rmse:0.458749
[194]	train-rmse:0.454612	test-rmse:0.458727
[195]	train-rmse:0.454586	test-rmse:0.458717
[196]	train-rmse:0.454553	test-rmse:0.458698
[197]	train-rmse:0.454522	test-rmse:0.458686
[198]	train-rmse:0.454498	test-rmse:0.45868
[199]	train-rmse:0.454471	test-rmse:0.45867
[200]	train-rmse:0.454441	test-rmse:0.458658
[201]	train-rmse:0.454419	test-rmse:0.458648
[202]	train-rmse:0.454394	test-rmse:0.458632
[203]	train-rmse:0.454364	test-rmse:0.458618
[204]	train-rmse:0.454324	test-rmse:0.458591
[205]	train-rmse:0.454293	test-rmse:0.45858
[206]	train-rmse:0.454252	test-rmse:0.45856
[207]	train-rmse:0.454214	test-rmse:0.458538
[208]	train-rmse:0.454184	test-rmse:0.458523
[209]	train-rmse:0.454149	test-rmse:0.458506
[210]	train-rmse:0.45412	test-rmse:0.458498
[211]	train-rmse:0.454096	test-rmse:0.458492
[212]	train-rmse:0.454066	test-rmse:0.458483
[213]	train-rmse:0.454031	test-rmse:0.458469
[214]	train-rmse:0.454001	test-rmse:0.458454
[215]	train-rmse:0.453978	test-rmse:0.458447
[216]	train-rmse:0.453957	test-rmse:0.458442
[217]	train-rmse:0.45393	test-rmse:0.45844
[218]	train-rmse:0.453906	test-rmse:0.458437
[219]	train-rmse:0.453883	test-rmse:0.458433
[220]	train-rmse:0.453854	test-rmse:0.458424
[221]	train-rmse:0.453833	test-rmse:0.458417
[222]	train-rmse:0.453804	test-rmse:0.458404
[223]	train-rmse:0.453771	test-rmse:0.458387
[224]	train-rmse:0.453738	test-rmse:0.458379
[225]	train-rmse:0.453705	test-rmse:0.458364
[226]	train-rmse:0.453666	test-rmse:0.45834
[227]	train-rmse:0.453637	test-rmse:0.458324
[228]	train-rmse:0.453604	test-rmse:0.458308
[229]	train-rmse:0.453568	test-rmse:0.458286
[230]	train-rmse:0.453544	test-rmse:0.458278
[231]	train-rmse:0.453517	test-rmse:0.458261
[232]	train-rmse:0.453487	test-rmse:0.458256
[233]	train-rmse:0.453461	test-rmse:0.458247
[234]	train-rmse:0.453427	test-rmse:0.458229
[235]	train-rmse:0.453395	test-rmse:0.458214
[236]	train-rmse:0.453372	test-rmse:0.458208
[237]	train-rmse:0.453348	test-rmse:0.458201
[238]	train-rmse:0.453319	test-rmse:0.458192
[239]	train-rmse:0.453297	test-rmse:0.458187
[240]	train-rmse:0.453274	test-rmse:0.458178
[241]	train-rmse:0.453253	test-rmse:0.45817
[242]	train-rmse:0.45322	test-rmse:0.458155
[243]	train-rmse:0.453195	test-rmse:0.458147
[244]	train-rmse:0.453173	test-rmse:0.458142
[245]	train-rmse:0.453138	test-rmse:0.458128
[246]	train-rmse:0.453114	test-rmse:0.45812
[247]	train-rmse:0.453088	test-rmse:0.45811
[248]	train-rmse:0.453068	test-rmse:0.458109
[249]	train-rmse:0.453038	test-rmse:0.458096
[250]	train-rmse:0.45301	test-rmse:0.458085
[251]	train-rmse:0.452982	test-rmse:0.458079
[252]	train-rmse:0.452954	test-rmse:0.458069
[253]	train-rmse:0.452933	test-rmse:0.458065
[254]	train-rmse:0.452898	test-rmse:0.458045
[255]	train-rmse:0.452879	test-rmse:0.45804
[256]	train-rmse:0.452847	test-rmse:0.458028
[257]	train-rmse:0.452822	test-rmse:0.458019
[258]	train-rmse:0.452791	test-rmse:0.458005
[259]	train-rmse:0.452766	test-rmse:0.457988
[260]	train-rmse:0.452739	test-rmse:0.457979
[261]	train-rmse:0.452722	test-rmse:0.457977
[262]	train-rmse:0.452691	test-rmse:0.457961
[263]	train-rmse:0.452669	test-rmse:0.457954
[264]	train-rmse:0.452642	test-rmse:0.457943
[265]	train-rmse:0.452607	test-rmse:0.45792
[266]	train-rmse:0.452585	test-rmse:0.457913
[267]	train-rmse:0.452562	test-rmse:0.457908
[268]	train-rmse:0.452537	test-rmse:0.457899
[269]	train-rmse:0.452509	test-rmse:0.457888
[270]	train-rmse:0.452482	test-rmse:0.45788
[271]	train-rmse:0.452453	test-rmse:0.457868
[272]	train-rmse:0.452423	test-rmse:0.45786
[273]	train-rmse:0.452389	test-rmse:0.457846
[274]	train-rmse:0.45236	test-rmse:0.457834
[275]	train-rmse:0.452339	test-rmse:0.457828
[276]	train-rmse:0.452316	test-rmse:0.457822
[277]	train-rmse:0.452296	test-rmse:0.457817
[278]	train-rmse:0.452268	test-rmse:0.457805
[279]	train-rmse:0.452244	test-rmse:0.457801
[280]	train-rmse:0.452222	test-rmse:0.45779
[281]	train-rmse:0.452196	test-rmse:0.457782
[282]	train-rmse:0.452174	test-rmse:0.457773
[283]	train-rmse:0.452149	test-rmse:0.457759
[284]	train-rmse:0.452122	test-rmse:0.457752
[285]	train-rmse:0.452101	test-rmse:0.457744
[286]	train-rmse:0.452083	test-rmse:0.457738
[287]	train-rmse:0.452059	test-rmse:0.457734
[288]	train-rmse:0.452025	test-rmse:0.457713
[289]	train-rmse:0.451997	test-rmse:0.457701
[290]	train-rmse:0.451978	test-rmse:0.457696
[291]	train-rmse:0.451958	test-rmse:0.457692
[292]	train-rmse:0.45194	test-rmse:0.457688
[293]	train-rmse:0.451917	test-rmse:0.457677
[294]	train-rmse:0.451895	test-rmse:0.457669
[295]	train-rmse:0.451867	test-rmse:0.457656
[296]	train-rmse:0.451842	test-rmse:0.457648
[297]	train-rmse:0.451822	test-rmse:0.457639
[298]	train-rmse:0.451796	test-rmse:0.457633
[299]	train-rmse:0.45177	test-rmse:0.457627
[300]	train-rmse:0.451738	test-rmse:0.457606
[301]	train-rmse:0.451706	test-rmse:0.457592
[302]	train-rmse:0.451689	test-rmse:0.457587
[303]	train-rmse:0.451668	test-rmse:0.457585
[304]	train-rmse:0.451628	test-rmse:0.457558
[305]	train-rmse:0.451599	test-rmse:0.457544
[306]	train-rmse:0.451579	test-rmse:0.457537
[307]	train-rmse:0.451549	test-rmse:0.457524
[308]	train-rmse:0.451526	test-rmse:0.457518
[309]	train-rmse:0.451494	test-rmse:0.457504
[310]	train-rmse:0.451475	test-rmse:0.457502
[311]	train-rmse:0.451454	test-rmse:0.457501
[312]	train-rmse:0.451425	test-rmse:0.457489
[313]	train-rmse:0.451399	test-rmse:0.457478
[314]	train-rmse:0.451369	test-rmse:0.457469
[315]	train-rmse:0.451351	test-rmse:0.457465
[316]	train-rmse:0.45133	test-rmse:0.457461
[317]	train-rmse:0.451299	test-rmse:0.457448
[318]	train-rmse:0.451279	test-rmse:0.457441
[319]	train-rmse:0.45126	test-rmse:0.457432
[320]	train-rmse:0.451242	test-rmse:0.457431
[321]	train-rmse:0.451222	test-rmse:0.457429
[322]	train-rmse:0.451201	test-rmse:0.457426
[323]	train-rmse:0.451184	test-rmse:0.45742
[324]	train-rmse:0.451152	test-rmse:0.457406
[325]	train-rmse:0.451125	test-rmse:0.457398
[326]	train-rmse:0.451092	test-rmse:0.457376
[327]	train-rmse:0.451059	test-rmse:0.457362
[328]	train-rmse:0.45104	test-rmse:0.457359
[329]	train-rmse:0.451024	test-rmse:0.457355
[330]	train-rmse:0.451007	test-rmse:0.457355
[331]	train-rmse:0.450975	test-rmse:0.457341
[332]	train-rmse:0.450949	test-rmse:0.457331
[333]	train-rmse:0.450932	test-rmse:0.457331
[334]	train-rmse:0.450912	test-rmse:0.457331
[335]	train-rmse:0.450881	test-rmse:0.457317
[336]	train-rmse:0.450859	test-rmse:0.45732
[337]	train-rmse:0.450838	test-rmse:0.457311
[338]	train-rmse:0.450819	test-rmse:0.457301
[339]	train-rmse:0.450802	test-rmse:0.457296
[340]	train-rmse:0.45078	test-rmse:0.457294
[341]	train-rmse:0.450761	test-rmse:0.457294
[342]	train-rmse:0.450742	test-rmse:0.457292
[343]	train-rmse:0.450721	test-rmse:0.457293
[344]	train-rmse:0.450694	test-rmse:0.457288
[345]	train-rmse:0.450671	test-rmse:0.457282
[346]	train-rmse:0.450648	test-rmse:0.457273
[347]	train-rmse:0.450629	test-rmse:0.457266
[348]	train-rmse:0.450604	test-rmse:0.457255
[349]	train-rmse:0.450572	test-rmse:0.45724
[350]	train-rmse:0.450548	test-rmse:0.457233
[351]	train-rmse:0.450533	test-rmse:0.457233
[352]	train-rmse:0.450518	test-rmse:0.457229
[353]	train-rmse:0.450495	test-rmse:0.457227
[354]	train-rmse:0.450471	test-rmse:0.457216
[355]	train-rmse:0.450453	test-rmse:0.457215
[356]	train-rmse:0.450432	test-rmse:0.45721
[357]	train-rmse:0.450418	test-rmse:0.457208
[358]	train-rmse:0.450395	test-rmse:0.457203
[359]	train-rmse:0.45038	test-rmse:0.457203
[360]	train-rmse:0.45036	test-rmse:0.457198
[361]	train-rmse:0.450338	test-rmse:0.457192
[362]	train-rmse:0.450324	test-rmse:0.45719
[363]	train-rmse:0.450307	test-rmse:0.457186
[364]	train-rmse:0.450278	test-rmse:0.457174
[365]	train-rmse:0.450263	test-rmse:0.457174
[366]	train-rmse:0.450242	test-rmse:0.457167
[367]	train-rmse:0.450219	test-rmse:0.457159
[368]	train-rmse:0.450198	test-rmse:0.457153
[369]	train-rmse:0.450185	test-rmse:0.457154
[370]	train-rmse:0.450171	test-rmse:0.457156
[371]	train-rmse:0.450149	test-rmse:0.457148
[372]	train-rmse:0.450132	test-rmse:0.457147
[373]	train-rmse:0.450115	test-rmse:0.45714
[374]	train-rmse:0.450093	test-rmse:0.457139
[375]	train-rmse:0.45007	test-rmse:0.45713
[376]	train-rmse:0.450047	test-rmse:0.457126
[377]	train-rmse:0.450026	test-rmse:0.45712
[378]	train-rmse:0.450009	test-rmse:0.457117
[379]	train-rmse:0.449995	test-rmse:0.457116
[380]	train-rmse:0.449971	test-rmse:0.457112
[381]	train-rmse:0.449951	test-rmse:0.457108
[382]	train-rmse:0.449929	test-rmse:0.4571
[383]	train-rmse:0.449909	test-rmse:0.457098
[384]	train-rmse:0.44989	test-rmse:0.457091
[385]	train-rmse:0.449869	test-rmse:0.457089
[386]	train-rmse:0.449847	test-rmse:0.457085
[387]	train-rmse:0.449825	test-rmse:0.457078
[388]	train-rmse:0.449803	test-rmse:0.457071
[389]	train-rmse:0.449785	test-rmse:0.457065
[390]	train-rmse:0.449762	test-rmse:0.457064
[391]	train-rmse:0.44974	test-rmse:0.45706
[392]	train-rmse:0.449724	test-rmse:0.457057
[393]	train-rmse:0.449703	test-rmse:0.457052
[394]	train-rmse:0.449669	test-rmse:0.457036
[395]	train-rmse:0.449647	test-rmse:0.457033
[396]	train-rmse:0.44963	test-rmse:0.457029
[397]	train-rmse:0.44961	test-rmse:0.45702
[398]	train-rmse:0.449595	test-rmse:0.457021
[399]	train-rmse:0.44958	test-rmse:0.45702
[400]	train-rmse:0.44956	test-rmse:0.457016
[401]	train-rmse:0.449546	test-rmse:0.457014
[402]	train-rmse:0.449526	test-rmse:0.457008
[403]	train-rmse:0.449508	test-rmse:0.457004
[404]	train-rmse:0.449488	test-rmse:0.457006
[405]	train-rmse:0.449471	test-rmse:0.457006
[406]	train-rmse:0.449454	test-rmse:0.457002
[407]	train-rmse:0.449436	test-rmse:0.456998
[408]	train-rmse:0.449418	test-rmse:0.456996
[409]	train-rmse:0.449395	test-rmse:0.456988
[410]	train-rmse:0.449375	test-rmse:0.456983
[411]	train-rmse:0.449358	test-rmse:0.456984
[412]	train-rmse:0.449327	test-rmse:0.456967
[413]	train-rmse:0.449308	test-rmse:0.456963
[414]	train-rmse:0.449288	test-rmse:0.456956
[415]	train-rmse:0.449272	test-rmse:0.456953
[416]	train-rmse:0.44925	test-rmse:0.456944
[417]	train-rmse:0.449239	test-rmse:0.456944
[418]	train-rmse:0.449218	test-rmse:0.456938
[419]	train-rmse:0.449199	test-rmse:0.456932
[420]	train-rmse:0.449184	test-rmse:0.456927
[421]	train-rmse:0.449165	test-rmse:0.456921
[422]	train-rmse:0.44915	test-rmse:0.456918
[423]	train-rmse:0.449134	test-rmse:0.456916
[424]	train-rmse:0.449113	test-rmse:0.456904
[425]	train-rmse:0.44909	test-rmse:0.456897
[426]	train-rmse:0.449071	test-rmse:0.456891
[427]	train-rmse:0.449052	test-rmse:0.456888
[428]	train-rmse:0.449039	test-rmse:0.45689
[429]	train-rmse:0.449017	test-rmse:0.456883
[430]	train-rmse:0.449	test-rmse:0.456877
[431]	train-rmse:0.448982	test-rmse:0.456871
[432]	train-rmse:0.448959	test-rmse:0.456863
[433]	train-rmse:0.448943	test-rmse:0.456859
[434]	train-rmse:0.448927	test-rmse:0.456857
[435]	train-rmse:0.448908	test-rmse:0.456855
[436]	train-rmse:0.44889	test-rmse:0.456854
[437]	train-rmse:0.448871	test-rmse:0.45685
[438]	train-rmse:0.448851	test-rmse:0.456844
[439]	train-rmse:0.448827	test-rmse:0.456838
[440]	train-rmse:0.448804	test-rmse:0.456829
[441]	train-rmse:0.44879	test-rmse:0.456828
[442]	train-rmse:0.448771	test-rmse:0.456826
[443]	train-rmse:0.448749	test-rmse:0.456815
[444]	train-rmse:0.448731	test-rmse:0.456818
[445]	train-rmse:0.448716	test-rmse:0.456814
[446]	train-rmse:0.448701	test-rmse:0.456811
[447]	train-rmse:0.448682	test-rmse:0.456807
[448]	train-rmse:0.448666	test-rmse:0.456805
[449]	train-rmse:0.448649	test-rmse:0.456802
[450]	train-rmse:0.448632	test-rmse:0.456801
[451]	train-rmse:0.448611	test-rmse:0.456795
[452]	train-rmse:0.448593	test-rmse:0.456793
[453]	train-rmse:0.448578	test-rmse:0.456789
[454]	train-rmse:0.448562	test-rmse:0.456788
[455]	train-rmse:0.448545	test-rmse:0.456783
[456]	train-rmse:0.448524	test-rmse:0.456777
[457]	train-rmse:0.448512	test-rmse:0.456773
[458]	train-rmse:0.448495	test-rmse:0.456773
[459]	train-rmse:0.448478	test-rmse:0.45677
[460]	train-rmse:0.44845	test-rmse:0.456756
[461]	train-rmse:0.448433	test-rmse:0.456753
[462]	train-rmse:0.448415	test-rmse:0.456752
[463]	train-rmse:0.448397	test-rmse:0.456748
[464]	train-rmse:0.448383	test-rmse:0.456745
[465]	train-rmse:0.448363	test-rmse:0.45674
[466]	train-rmse:0.448339	test-rmse:0.456728
[467]	train-rmse:0.448317	test-rmse:0.456722
[468]	train-rmse:0.448299	test-rmse:0.456723
[469]	train-rmse:0.448281	test-rmse:0.456715
[470]	train-rmse:0.448263	test-rmse:0.456718
[471]	train-rmse:0.448242	test-rmse:0.456717
[472]	train-rmse:0.448222	test-rmse:0.456711
[473]	train-rmse:0.448204	test-rmse:0.456709
[474]	train-rmse:0.448185	test-rmse:0.456704
[475]	train-rmse:0.448164	test-rmse:0.456699
[476]	train-rmse:0.448147	test-rmse:0.456699
[477]	train-rmse:0.448128	test-rmse:0.456691
[478]	train-rmse:0.448102	test-rmse:0.456677
[479]	train-rmse:0.448086	test-rmse:0.456676
[480]	train-rmse:0.448074	test-rmse:0.456675
[481]	train-rmse:0.448059	test-rmse:0.456676
[482]	train-rmse:0.448042	test-rmse:0.456676
[483]	train-rmse:0.448024	test-rmse:0.45667
[484]	train-rmse:0.448	test-rmse:0.456662
[485]	train-rmse:0.44798	test-rmse:0.456656
[486]	train-rmse:0.447958	test-rmse:0.456655
[487]	train-rmse:0.447942	test-rmse:0.456655
[488]	train-rmse:0.44792	test-rmse:0.456646
[489]	train-rmse:0.447898	test-rmse:0.456638
[490]	train-rmse:0.447879	test-rmse:0.456633
[491]	train-rmse:0.44786	test-rmse:0.456633
[492]	train-rmse:0.447843	test-rmse:0.456626
[493]	train-rmse:0.447823	test-rmse:0.456621
[494]	train-rmse:0.447803	test-rmse:0.456618
[495]	train-rmse:0.447784	test-rmse:0.456614
[496]	train-rmse:0.447762	test-rmse:0.456607
[497]	train-rmse:0.447743	test-rmse:0.456604
[498]	train-rmse:0.447727	test-rmse:0.456602
[499]	train-rmse:0.447715	test-rmse:0.456601
[500]	train-rmse:0.4477	test-rmse:0.4566
[501]	train-rmse:0.447679	test-rmse:0.456594
[502]	train-rmse:0.447659	test-rmse:0.456587
[503]	train-rmse:0.447638	test-rmse:0.456583
[504]	train-rmse:0.447619	test-rmse:0.456581
[505]	train-rmse:0.447602	test-rmse:0.456581
[506]	train-rmse:0.447585	test-rmse:0.456579
[507]	train-rmse:0.447569	test-rmse:0.456579
[508]	train-rmse:0.447555	test-rmse:0.456575
[509]	train-rmse:0.447539	test-rmse:0.456571
[510]	train-rmse:0.447523	test-rmse:0.456566
[511]	train-rmse:0.447508	test-rmse:0.456564
[512]	train-rmse:0.447489	test-rmse:0.456559
[513]	train-rmse:0.447473	test-rmse:0.456559
[514]	train-rmse:0.447455	test-rmse:0.456555
[515]	train-rmse:0.447437	test-rmse:0.456552
[516]	train-rmse:0.447422	test-rmse:0.45655
[517]	train-rmse:0.447405	test-rmse:0.45655
[518]	train-rmse:0.447389	test-rmse:0.456549
[519]	train-rmse:0.447374	test-rmse:0.456546
[520]	train-rmse:0.447357	test-rmse:0.45654
[521]	train-rmse:0.447342	test-rmse:0.456538
[522]	train-rmse:0.447329	test-rmse:0.456535
[523]	train-rmse:0.44731	test-rmse:0.456529
[524]	train-rmse:0.447293	test-rmse:0.456528
[525]	train-rmse:0.447278	test-rmse:0.456527
[526]	train-rmse:0.447257	test-rmse:0.456523
[527]	train-rmse:0.447241	test-rmse:0.456522
[528]	train-rmse:0.447226	test-rmse:0.456519
[529]	train-rmse:0.447208	test-rmse:0.456513
[530]	train-rmse:0.447191	test-rmse:0.456511
[531]	train-rmse:0.447175	test-rmse:0.45651
[532]	train-rmse:0.447157	test-rmse:0.456505
[533]	train-rmse:0.447134	test-rmse:0.456498
[534]	train-rmse:0.447115	test-rmse:0.456495
[535]	train-rmse:0.447096	test-rmse:0.456494
[536]	train-rmse:0.447084	test-rmse:0.456493
[537]	train-rmse:0.447066	test-rmse:0.456491
[538]	train-rmse:0.447051	test-rmse:0.456491
[539]	train-rmse:0.447028	test-rmse:0.456484
[540]	train-rmse:0.447012	test-rmse:0.456482
[541]	train-rmse:0.446996	test-rmse:0.456481
[542]	train-rmse:0.446977	test-rmse:0.456479
[543]	train-rmse:0.44696	test-rmse:0.456476
[544]	train-rmse:0.446945	test-rmse:0.456477
[545]	train-rmse:0.446925	test-rmse:0.456472
[546]	train-rmse:0.446903	test-rmse:0.456464
[547]	train-rmse:0.446884	test-rmse:0.456464
[548]	train-rmse:0.446865	test-rmse:0.456464
[549]	train-rmse:0.44685	test-rmse:0.456462
[550]	train-rmse:0.446826	test-rmse:0.456447
[551]	train-rmse:0.446809	test-rmse:0.456444
[552]	train-rmse:0.44679	test-rmse:0.456443
[553]	train-rmse:0.446772	test-rmse:0.456443
[554]	train-rmse:0.446761	test-rmse:0.456443
[555]	train-rmse:0.446743	test-rmse:0.45644
[556]	train-rmse:0.446725	test-rmse:0.456437
[557]	train-rmse:0.446707	test-rmse:0.456438
[558]	train-rmse:0.44669	test-rmse:0.456435
[559]	train-rmse:0.446669	test-rmse:0.456423
[560]	train-rmse:0.446651	test-rmse:0.456424
[561]	train-rmse:0.446636	test-rmse:0.456423
[562]	train-rmse:0.446622	test-rmse:0.456422
[563]	train-rmse:0.446608	test-rmse:0.456422
[564]	train-rmse:0.446596	test-rmse:0.456422
[565]	train-rmse:0.44658	test-rmse:0.45642
[566]	train-rmse:0.446561	test-rmse:0.456417
[567]	train-rmse:0.446544	test-rmse:0.456418
[568]	train-rmse:0.446529	test-rmse:0.45642
[569]	train-rmse:0.446512	test-rmse:0.45642
[570]	train-rmse:0.446497	test-rmse:0.456418
     test-rmse-mean  test-rmse-std  train-rmse-mean  train-rmse-std
562        0.456422       0.000445         0.446622        0.000113
563        0.456422       0.000447         0.446608        0.000115
564        0.456422       0.000447         0.446596        0.000116
565        0.456420       0.000451         0.446580        0.000116
566        0.456417       0.000442         0.446561        0.000120
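
Early stopping halted the cross-validation after roughly 567 rounds, with test-rmse-mean flattening out around 0.4564; the `num_round = 566` used for the final week-11 models below is read off this table. A small hedged sketch of pulling the round count out of `cvresult` programmatically (assumes `cvresult` from the cell above):

In [ ]:
# rounds kept by early stopping, and the cv score at the last kept round
best_num_round = cvresult.shape[0]
print(best_num_round)
print(cvresult['test-rmse-mean'].iloc[-1])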

In [24]:
param_11 = {'booster':'gbtree',
         'nthread': 10,
         'max_depth':5, 
         'eta':0.2,
         'silent':1,
         'subsample':0.7, 
         'objective':'reg:linear',
         'eval_metric':'rmse',
         'colsample_bytree':0.7}

In [28]:
num_round = 566

dtest_11 = xgb.DMatrix(test_dataset_normalize[predictors_11], missing=np.nan)
submission_11 = train_pivot_6789_to_11[['id']].copy()
j = 0
for j in range(20):
    
    train_pivot_xgb_time2_sample = train_dataset_normalize[predictors_target_11].sample(2000000)
    train_feature_11 = train_pivot_xgb_time2_sample.drop(['target'],axis = 1)
    train_label_11 = train_pivot_xgb_time2_sample[['target']]

    dtrain_11 = xgb.DMatrix(train_feature_11,label = train_label_11,missing= np.nan)
    
    bst_11 = xgb.train(param_11, dtrain_11, num_round)
    print str(j) + 'training finished!'
    submission_11['predict_' + str(j)] = bst_11.predict(dtest_11)

print 'finished'


0training finished!
1training finished!
2training finished!
3training finished!
4training finished!
5training finished!
6training finished!
7training finished!
8training finished!
9training finished!
10training finished!
11training finished!
12training finished!
13training finished!
14training finished!
15training finished!
16training finished!
17training finished!
18training finished!
19training finished!
finished

In [12]:
# make prediction
dtest_11 = xgb.DMatrix(train_pivot_6789_to_11[predictors_11], missing=np.nan)
submission_11 = train_pivot_6789_to_11[['id']].copy()
submission_11['predict'] = bst_11.predict(dtest_11)
xgb.plot_importance(bst_11)

In [29]:
submission_11.to_csv('submission_11_new.csv')

In [25]:
submission_11 = pd.read_csv('submission_11_new.csv',index_col =0)

begin training for week 10



In [4]:
%ls


1_xgboost.ipynb*        pivot_train_with_nan.pickle*
3_prediction.ipynb*     submission_nn.csv*
4_keras_nn.ipynb*       train_pivot_45678_to_9.csv*
5_random_forest.ipynb*  train_pivot_56789_to_10_new.pickle*
old_backup/             train_pivot_6789_to_11_new.pickle*
old_ipython/            train_pivot_xgb_time1.csv*
origin/                 train_pivot_xgb_time2.csv*
pivot_test.pickle*

In [2]:
predictors_target_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces','target']

In [3]:
predictors_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces']

In [13]:
f = lambda x : (x-x.mean())/x.std(ddof=0)

In [14]:
def normalize_dataset_10(train_dataset,test_dataset):
    # same idea as normalize_dataset above, for the week-10 feature set;
    # .copy() keeps the 'label' assignment from writing into the caller's frames
    train_dataset_normalize = train_dataset[predictors_10].copy()
    train_dataset_normalize['label'] = 0

    test_dataset_normalize = test_dataset[predictors_10].copy()
    test_dataset_normalize['label'] = 1

    whole_dataset = pd.concat([train_dataset_normalize,test_dataset_normalize],copy = False)
    whole_dataset_normalize = whole_dataset.apply(f,axis = 0)   # column-wise z-score

    # split back apart using the un-normalized label column
    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 0]
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 1]

    train_dataset_normalize.drop(['label'],axis = 1,inplace = True)
    test_dataset_normalize.drop(['label'],axis = 1,inplace = True)

    train_dataset_normalize['target'] = train_dataset['target']

    return train_dataset_normalize,test_dataset_normalize

In [5]:
# NOTE: `dtypes` (a column-name -> numpy dtype mapping) is assumed to be defined from an earlier session; it is not shown in this notebook
train_pivot_xgb_time1 = pd.read_csv('train_pivot_xgb_time1.csv',
                                    usecols = predictors_target_10,dtype = dtypes)
train_pivot_xgb_time1.reset_index(drop = True,inplace = True)

In [5]:
train_pivot_56789_to_10 = pd.read_pickle('train_pivot_56789_to_10_new.pickle')
train_pivot_56789_to_10.reset_index(drop = True,inplace = True)

In [7]:
train_pivot_56789_to_10.columns.values


Out[7]:
array(['Cliente_ID', 'Producto_ID', 'id', 'Semana', 'Agencia_ID',
       'Canal_ID', 'Ruta_SAK', 'agen_for_log_de', 'ruta_for_log_de',
       'cliente_for_log_de', 'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)

In [6]:
train_pivot_xgb_time1.columns.values


Out[6]:
array(['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces'], dtype=object)

In [17]:
# train_dataset_10_normalize, test_dataset_10_normalize = normalize_dataset_10(train_pivot_xgb_time1,
#                                                                           train_pivot_56789_to_10)

In [ ]:
train_dataset_10_normalize.head()

In [8]:
param_10 = {'booster':'gbtree',
         'nthread': 7,
         'max_depth':5, 
         'eta':0.2,
         'silent':1,
         'subsample':0.7, 
         'objective':'reg:linear',
         'eval_metric':'rmse',
         'colsample_bytree':0.7}

In [6]:
# train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(1000000)

# train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'],axis = 1)
# train_label_10 = train_pivot_xgb_time1_sample[['target']]

# dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)

In [7]:
train_label_10 = train_pivot_xgb_time1['target']
train_feature_10 = train_pivot_xgb_time1.drop(['target'],axis = 1)



dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)

In [6]:
# num_round = 1500

# cvresult = xgb.cv(param_10, dtrain_10, num_round, nfold=5,show_stdv=False,
#                         seed = 0, early_stopping_rounds=5,show_progress = True)
# print(cvresult.tail())

model stacking for training data


  • for each bag, sample 2,000,000 rows (roughly 10% of the data) for training and predict the full training set, masking the sampled rows with NaN so only out-of-sample predictions are kept
  • repeat for 40 bags; the out-of-sample predictions are then averaged into a stacked feature

In [15]:
# train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(100000)
# print train_pivot_xgb_time1_sample.index
# len(train_pivot_xgb_time1_sample.loc[train_pivot_xgb_time1_sample.index.values].index.drop_duplicates())


Int64Index([17287656,  4346132,  8566861, 19992474, 15626037, 14513582,
             4768231,  4296957,  9765272,  9454972,
            ...
            14300533,  8217200, 20266266, 12168988, 17641083, 11403739,
             6318002, 16178418, 18167686, 12633318],
           dtype='int64', length=100000)
Out[15]:
100000

In [9]:
num_round = 400
i = 0
d_train_pivot_xgb_time1 = xgb.DMatrix(train_pivot_xgb_time1[predictors_10], missing=np.nan)
gc.collect()
submission_10 = pd.DataFrame()
for i in range(40):
    train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(2000000)

    train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'],axis = 1)
    train_label_10 = train_pivot_xgb_time1_sample['target']

    dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)
    gc.collect()
    
    bst = xgb.train(param_10, dtrain_10, num_round)
    print str(i) + 'training finished!'
    gc.collect()
    
    submission_10['predict_' + str(i)] = bst.predict(d_train_pivot_xgb_time1)
    # mask this bag's own training rows, so the column keeps only out-of-sample predictions
    submission_10.loc[train_pivot_xgb_time1_sample.index, 'predict_' + str(i)] = np.nan
    print str(i) + 'predicting finished!'
    gc.collect()

print 'finished'


0training finished!
0predicting finished!
1training finished!
1predicting finished!
2training finished!
2predicting finished!
3training finished!
3predicting finished!
4training finished!
4predicting finished!
5training finished!
5predicting finished!
6training finished!
6predicting finished!
7training finished!
7predicting finished!
8training finished!
8predicting finished!
9training finished!
9predicting finished!
10training finished!
10predicting finished!
11training finished!
11predicting finished!
12training finished!
12predicting finished!
13training finished!
13predicting finished!
14training finished!
14predicting finished!
15training finished!
15predicting finished!
16training finished!
16predicting finished!
17training finished!
17predicting finished!
18training finished!
18predicting finished!
19training finished!
19predicting finished!
20training finished!
20predicting finished!
21training finished!
21predicting finished!
22training finished!
22predicting finished!
23training finished!
23predicting finished!
24training finished!
24predicting finished!
25training finished!
25predicting finished!
26training finished!
26predicting finished!
27training finished!
27predicting finished!
28training finished!
28predicting finished!
29training finished!
29predicting finished!
30training finished!
30predicting finished!
31training finished!
31predicting finished!
32training finished!
32predicting finished!
33training finished!
33predicting finished!
34training finished!
34predicting finished!
35training finished!
35predicting finished!
36training finished!
36predicting finished!
37training finished!
37predicting finished!
38training finished!
38predicting finished!
39training finished!
39predicting finished!
finished
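
Since each bag masks its own 2,000,000 sampled rows, every row of the training set should receive roughly 40 × (1 − 0.1) ≈ 36 non-NaN bag predictions. A hedged sanity check of the out-of-fold coverage (assumes `submission_10` from the loop above):

In [ ]:
# count the out-of-sample bag predictions available per row (expect ~36 of 40 on average)
bag_cols = ['predict_' + str(i) for i in range(40)]
print(submission_10[bag_cols].notnull().sum(axis=1).describe())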

In [10]:
submission_10.head()


Out[10]:
predict_0 predict_1 predict_2 predict_3 predict_4 predict_5 predict_6 predict_7 predict_8 predict_9 ... predict_30 predict_31 predict_32 predict_33 predict_34 predict_35 predict_36 predict_37 predict_38 predict_39
0 2.560112 2.319709 2.596576 3.334877 2.947561 2.540267 2.928454 2.941043 2.376304 2.995800 ... NaN NaN 2.968065 2.672205 2.867753 2.462488 2.426948 2.630500 2.475198 2.569228
1 2.849533 2.726606 2.858416 2.941954 2.899292 2.894359 2.754009 NaN NaN 2.725377 ... 2.864190 2.890460 2.813638 2.788484 2.835742 2.866130 2.839168 2.906143 2.770211 2.806147
2 1.957559 2.057925 1.908801 2.022024 2.034622 1.946343 2.073445 1.921750 1.859954 NaN ... 2.056635 2.060785 2.062870 2.231127 1.843033 2.060290 1.876254 1.901076 1.958171 1.893032
3 3.498075 3.605519 NaN 3.481079 3.537821 3.634062 3.516993 3.518384 3.555868 3.529919 ... 3.496224 3.376245 3.590961 NaN NaN 3.428569 3.535894 3.513547 3.513322 3.568703
4 NaN 4.240009 4.264554 4.289259 4.356047 4.357327 4.233587 4.251253 4.242721 4.318971 ... 4.333437 4.368112 4.320409 4.339425 4.292994 4.228761 4.276141 NaN 4.339606 4.211953

5 rows × 40 columns


In [12]:
# row-wise mean over the 40 bag columns; NaNs (each bag's own training rows) are skipped by default
submission_10['predict'] = submission_10[['predict_' + str(i) for i in range(40)]].mean(axis=1)

In [14]:
submission_10.head()


Out[14]:
predict_0 predict_1 predict_2 predict_3 predict_4 predict_5 predict_6 predict_7 predict_8 predict_9 ... predict_31 predict_32 predict_33 predict_34 predict_35 predict_36 predict_37 predict_38 predict_39 predict
0 2.560112 2.319709 2.596576 3.334877 2.947561 2.540267 2.928454 2.941043 2.376304 2.995800 ... NaN 2.968065 2.672205 2.867753 2.462488 2.426948 2.630500 2.475198 2.569228 2.767780
1 2.849533 2.726606 2.858416 2.941954 2.899292 2.894359 2.754009 NaN NaN 2.725377 ... 2.890460 2.813638 2.788484 2.835742 2.866130 2.839168 2.906143 2.770211 2.806147 2.835551
2 1.957559 2.057925 1.908801 2.022024 2.034622 1.946343 2.073445 1.921750 1.859954 NaN ... 2.060785 2.062870 2.231127 1.843033 2.060290 1.876254 1.901076 1.958171 1.893032 1.999626
3 3.498075 3.605519 NaN 3.481079 3.537821 3.634062 3.516993 3.518384 3.555868 3.529919 ... 3.376245 3.590961 NaN NaN 3.428569 3.535894 3.513547 3.513322 3.568703 3.505517
4 NaN 4.240009 4.264554 4.289259 4.356047 4.357327 4.233587 4.251253 4.242721 4.318971 ... 4.368112 4.320409 4.339425 4.292994 4.228761 4.276141 NaN 4.339606 4.211953 4.278482

5 rows × 41 columns


In [16]:
submission_10_final = submission_10['predict']

In [17]:
submission_10_final.head()


Out[17]:
0    2.767780
1    2.835551
2    1.999626
3    3.505517
4    4.278482
Name: predict, dtype: float32

In [18]:
submission_10_final.to_csv('stack_train_xgb_10.csv')
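
Note that in the pandas version used here, `Series.to_csv` writes `index,value` pairs with no header row, so reading the stacked feature back needs the column name supplied explicitly. A hedged read-back sketch (the column name `stack_xgb_10` is made up):

In [ ]:
# reload the stacked out-of-fold feature written above (no header row in the file)
stack_train_xgb_10 = pd.read_csv('stack_train_xgb_10.csv', header=None,
                                 names=['row_id', 'stack_xgb_10'], index_col='row_id')
print(stack_train_xgb_10.head())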

end of model stacking



In [21]:
num_round = 392

dtest_10 = xgb.DMatrix(test_dataset_10_normalize[predictors_10], missing=np.nan)
submission_10 = train_pivot_56789_to_10[['id']].copy()
i = 0

for i in range(20):
    train_pivot_xgb_time1_sample = train_dataset_10_normalize[predictors_target_10].sample(2000000)

    train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'],axis = 1)
    train_label_10 = train_pivot_xgb_time1_sample[['target']]

    dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)
    
    bst = xgb.train(param_10, dtrain_10, num_round)
    print str(i) + 'training finished!'
    submission_10['predict_' + str(i)] = bst.predict(dtest_10)
    print str(i) + 'predicting finished!'


print 'finished'


0training finished!
0predicting finished!
1training finished!
1predicting finished!
2training finished!
2predicting finished!
3training finished!
3predicting finished!
4training finished!
4predicting finished!
5training finished!
5predicting finished!
6training finished!
6predicting finished!
7training finished!
7predicting finished!
8training finished!
8predicting finished!
9training finished!
9predicting finished!
10training finished!
10predicting finished!
11training finished!
11predicting finished!
12training finished!
12predicting finished!
13training finished!
13predicting finished!
14training finished!
14predicting finished!
15training finished!
15predicting finished!
16training finished!
16predicting finished!
17training finished!
17predicting finished!
18training finished!
18predicting finished!
19training finished!
19predicting finished!
finished

In [22]:
submission_10.to_csv('submission_10_new.csv')

In [26]:
# make prediction
xgb.plot_importance(bst)


Out[26]:
<matplotlib.axes.AxesSubplot at 0x7fc793a07dd0>

combine week 10 and week 11



In [12]:
submission_10 = pd.read_csv('submission_10.csv',index_col = 0)

In [22]:
submission_10.shape


Out[22]:
(3538385, 21)

In [23]:
submission_10.columns.values


Out[23]:
array(['id', 'predict_0', 'predict_1', 'predict_2', 'predict_3',
       'predict_4', 'predict_5', 'predict_6', 'predict_7', 'predict_8',
       'predict_9', 'predict_10', 'predict_11', 'predict_12', 'predict_13',
       'predict_14', 'predict_15', 'predict_16', 'predict_17',
       'predict_18', 'predict_19'], dtype=object)

In [26]:
submission_11.columns.values


Out[26]:
array(['id', 'predict_0', 'predict_1', 'predict_2', 'predict_3',
       'predict_4', 'predict_5', 'predict_6', 'predict_7', 'predict_8',
       'predict_9', 'predict_10', 'predict_11', 'predict_12', 'predict_13',
       'predict_14', 'predict_15', 'predict_16', 'predict_17',
       'predict_18', 'predict_19'], dtype=object)

In [27]:
submission = pd.concat([submission_10,submission_11],axis = 0)

In [28]:
submission.head()


Out[28]:
Semana id predict_0 predict_1 predict_2 predict_3 predict_4 predict_5 predict_6 predict_7 predict_8 ... predict_10 predict_11 predict_12 predict_13 predict_14 predict_15 predict_16 predict_17 predict_18 predict_19
0 1569352 1.570254 2.058795 2.034031 2.030173 2.360505 2.103804 2.306178 2.905277 2.163551 ... 2.337900 2.052084 1.988659 2.145079 1.940696 1.745424 1.975164 1.969749 2.098094 2.157057
1 6667200 3.562964 3.605132 3.617031 3.654797 3.647879 3.615349 3.648028 3.547087 3.555520 ... 3.617610 3.678215 3.664827 3.635223 3.621170 3.587815 3.619729 3.626218 3.718844 3.676515
2 1592616 3.095027 3.052559 2.981558 2.946887 2.818832 2.871925 2.925531 2.865346 2.925287 ... 3.030976 3.111521 3.083862 2.991664 3.072153 2.873937 2.984680 2.915040 3.102967 3.057709
3 3909690 4.199105 4.092796 4.173037 4.114638 4.228617 4.108154 4.186193 4.186264 4.168976 ... 4.217747 4.271296 4.210735 4.131866 4.077342 4.124441 4.189803 4.178765 4.237868 4.161020
4 3659672 3.582508 3.632398 3.602768 3.671225 3.653172 3.627649 3.626319 3.613807 3.573425 ... 3.626101 3.640444 3.641191 3.594605 3.645134 3.609037 3.617180 3.677435 3.683105 3.669554

5 rows × 21 columns


In [29]:
# average the 20 bagged predictions into one prediction per row
submission['predict'] = submission[['predict_' + str(i) for i in range(20)]].mean(axis=1)

In [30]:
submission.head()


Out[30]:
Semana id predict_0 predict_1 predict_2 predict_3 predict_4 predict_5 predict_6 predict_7 predict_8 ... predict_11 predict_12 predict_13 predict_14 predict_15 predict_16 predict_17 predict_18 predict_19 predict
0 1569352 1.570254 2.058795 2.034031 2.030173 2.360505 2.103804 2.306178 2.905277 2.163551 ... 2.052084 1.988659 2.145079 1.940696 1.745424 1.975164 1.969749 2.098094 2.157057 2.122850
1 6667200 3.562964 3.605132 3.617031 3.654797 3.647879 3.615349 3.648028 3.547087 3.555520 ... 3.678215 3.664827 3.635223 3.621170 3.587815 3.619729 3.626218 3.718844 3.676515 3.627489
2 1592616 3.095027 3.052559 2.981558 2.946887 2.818832 2.871925 2.925531 2.865346 2.925287 ... 3.111521 3.083862 2.991664 3.072153 2.873937 2.984680 2.915040 3.102967 3.057709 2.989878
3 3909690 4.199105 4.092796 4.173037 4.114638 4.228617 4.108154 4.186193 4.186264 4.168976 ... 4.271296 4.210735 4.131866 4.077342 4.124441 4.189803 4.178765 4.237868 4.161020 4.173270
4 3659672 3.582508 3.632398 3.602768 3.671225 3.653172 3.627649 3.626319 3.613807 3.573425 ... 3.640444 3.641191 3.594605 3.645134 3.609037 3.617180 3.677435 3.683105 3.669554 3.633988

5 rows × 22 columns


In [31]:
submission.rename(columns = {'predict':'Demanda_uni_equil'},inplace = True)

In [32]:
# predictions are on the log1p scale used for the target, so expm1 maps them back to demand units
submission['Demanda_uni_equil'] = submission['Demanda_uni_equil'].apply(np.expm1)
submission.head()


Out[32]:
Semana id predict_0 predict_1 predict_2 predict_3 predict_4 predict_5 predict_6 predict_7 predict_8 ... predict_11 predict_12 predict_13 predict_14 predict_15 predict_16 predict_17 predict_18 predict_19 Demanda_uni_equil
0 1569352 1.570254 2.058795 2.034031 2.030173 2.360505 2.103804 2.306178 2.905277 2.163551 ... 2.052084 1.988659 2.145079 1.940696 1.745424 1.975164 1.969749 2.098094 2.157057 7.354917
1 6667200 3.562964 3.605132 3.617031 3.654797 3.647879 3.615349 3.648028 3.547087 3.555520 ... 3.678215 3.664827 3.635223 3.621170 3.587815 3.619729 3.626218 3.718844 3.676515 36.618239
2 1592616 3.095027 3.052559 2.981558 2.946887 2.818832 2.871925 2.925531 2.865346 2.925287 ... 3.111521 3.083862 2.991664 3.072153 2.873937 2.984680 2.915040 3.102967 3.057709 18.883251
3 3909690 4.199105 4.092796 4.173037 4.114638 4.228617 4.108154 4.186193 4.186264 4.168976 ... 4.271296 4.210735 4.131866 4.077342 4.124441 4.189803 4.178765 4.237868 4.161020 63.927416
4 3659672 3.582508 3.632398 3.602768 3.671225 3.653172 3.627649 3.626319 3.613807 3.573425 ... 3.640444 3.641191 3.594605 3.645134 3.609037 3.617180 3.677435 3.683105 3.669554 36.863517

5 rows × 22 columns
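
`expm1` is the exact inverse of `log1p`, so a log-scale prediction of about 2.1229 maps back to exp(2.1229) − 1 ≈ 7.35, matching the first Demanda_uni_equil value above. A tiny round-trip check:

In [ ]:
# expm1 undoes log1p
print(np.expm1(2.122850))     # ~7.354917, first row above
print(np.log1p(7.354917))     # ~2.122850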


In [33]:
submission_final = submission[['id','Demanda_uni_equil']].copy()

In [34]:
submission_final['Demanda_uni_equil'] = submission_final['Demanda_uni_equil'].round(1)

In [35]:
submission_final.head()


Out[35]:
Semana id Demanda_uni_equil
0 1569352 7.4
1 6667200 36.6
2 1592616 18.9
3 3909690 63.9
4 3659672 36.9

In [36]:
submission_final.to_csv('submission_xgb_2.csv',index = False)

read_test



In [84]:
test_id = pd.read_csv('origin/test.csv',usecols = ['id'])

In [88]:
test_id['id'].dtype


Out[88]:
dtype('int64')

In [91]:
len(np.intersect1d(list(submission['id']), list(test_id['id'])))


Out[91]:
5634038

merge in the other result (predictions for test ids not covered above)



In [22]:
result_no_cli_pro_in_common = pd.read_csv('origin/results1.csv')
result_no_cli_pro_in_common.head()


Out[22]:
id Demanda_uni_equil
0 1288534 13.5
1 1127055 359.1
2 5423268 2.9
3 749471 312.4
4 6193378 4.4

In [23]:
result_no_cli_pro_in_common = result_no_cli_pro_in_common[~result_no_cli_pro_in_common['id'].isin(np.array(submission['id']))]
result_no_cli_pro_in_common.head()


Out[23]:
id Demanda_uni_equil
0 1288534 13.5
6 239863 4.4
7 2540577 3.6
8 3851172 3.4
11 6418979 3.3

In [24]:
result_no_cli_pro_in_common.shape


Out[24]:
(1365213, 2)

In [30]:
submission_final = pd.concat([submission[['id','Demanda_uni_equil']],result_no_cli_pro_in_common],axis = 0)

In [31]:
submission_final['Demanda_uni_equil'] = submission_final['Demanda_uni_equil'].round(1)

In [32]:
submission_final.shape


Out[32]:
(6999251, 2)
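
The two pieces partition the test set: 5,634,038 ids predicted by the xgboost models plus 1,365,213 ids taken from results1.csv give 6,999,251 rows, the full test size. A hedged sanity check (assumes `test_id` from the read_test section is still in memory):

In [ ]:
# the merged submission should cover every test id exactly once
print(submission_final.shape[0] == test_id.shape[0])
print(submission_final['id'].nunique() == test_id['id'].nunique())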

In [33]:
submission_final.to_csv('submission_xgb.csv',index = False)

In [ ]: