In [1]:
    
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split  # cross_validation was renamed to model_selection in scikit-learn 0.18+
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import coo_matrix,csr_matrix,csc_matrix, hstack
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn import linear_model
import gc
from sklearn import preprocessing
    
In [2]:
    
%ls
    
    
In [18]:
    
predictors_target_11 = ['LR_prod', 'LR_prod_corr',
       'NombreCliente',
       'agen_cliente_for_log_de', 'agen_for_log_de',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_for_log_de', 'corr', 'pieces',
       'producto_for_log_de', 'ruta_cliente_for_log_de', 'ruta_for_log_de',
       'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
       't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
       't_min_5', 'target', 'weight', 'weight_per_piece']
    
In [19]:
    
predictors_11 = ['LR_prod', 'LR_prod_corr',
       'NombreCliente',
       'agen_cliente_for_log_de', 'agen_for_log_de',
       'agen_producto_for_log_de', 'agen_ruta_for_log_de',
       'cliente_for_log_de', 'cliente_for_log_sum',
       'cliente_producto_for_log_de', 'corr', 'pieces',
       'producto_for_log_de', 'ruta_cliente_for_log_de', 'ruta_for_log_de',
       'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
       't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
       't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
       't_min_5', 'weight', 'weight_per_piece']
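    
In [ ]:
    
# Sanity-check sketch (not part of the original run): predictors_target_11
# should be predictors_11 plus the 'target' column and nothing else.
assert set(predictors_target_11) - set(predictors_11) == {'target'}
assert len(predictors_target_11) == len(predictors_11) + 1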
    
In [10]:
    
# z-score each column: subtract the mean, divide by the population std
f = lambda x : (x - x.mean()) / x.std(ddof=0)
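    
In [ ]:
    
# Toy illustration (added sketch, not from the original run): applying f
# column-wise leaves each column with mean 0 and population std 1.
demo = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
print(demo.apply(f, axis=0))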
    
In [14]:
    
train_pivot_xgb_time2 = pd.read_csv('train_pivot_xgb_time2.csv',index_col = 0)
    
In [7]:
    
train_pivot_6789_to_11 = pd.read_pickle('train_pivot_6789_to_11_new.pickle')
    
In [8]:
    
train_pivot_xgb_time2.head()
    
    Out[8]:
In [15]:
    
train_pivot_xgb_time2.columns.values
    
    Out[15]:
In [4]:
    
def normalize_dataset(train_dataset,test_dataset):
    # tag the rows, z-score train and test jointly, then split them back apart
    train_dataset_normalize = train_dataset[predictors_11].copy()
    train_dataset_normalize['label'] = 0

    test_dataset_normalize = test_dataset[predictors_11].copy()
    test_dataset_normalize['label'] = 1

    whole_dataset = pd.concat([train_dataset_normalize,test_dataset_normalize])
    whole_dataset_normalize = whole_dataset.apply(f,axis = 0)

    # the un-normalized label column provides the train/test masks
    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 0].copy()
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 1].copy()

    train_dataset_normalize.drop(['label'],axis = 1,inplace = True)
    test_dataset_normalize.drop(['label'],axis = 1,inplace = True)

    # re-attach the raw, unscaled target to the normalized training frame
    train_dataset_normalize['target'] = train_dataset['target'].copy()

    return train_dataset_normalize,test_dataset_normalize
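    
Design note on normalize_dataset: train and test are concatenated before the z-score so both are scaled with the same per-column mean and std; the temporary label column only serves to split the frames apart again, and the raw target is re-attached unscaled.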
    
In [21]:
    
train_dataset_normalize, test_dataset_normalize = normalize_dataset(train_pivot_xgb_time2,train_pivot_6789_to_11)
    
    
In [22]:
    
train_dataset_normalize.head()
    
    Out[22]:
In [26]:
    
train_pivot_xgb_time2_sample = train_dataset_normalize.sample(2000000)
train_feature_11 = train_pivot_xgb_time2_sample.drop(['target'],axis = 1)
train_label_11 = train_pivot_xgb_time2_sample[['target']]
dtrain_11 = xgb.DMatrix(train_feature_11,label = train_label_11,missing=np.nan)
    
In [27]:
    
num_round = 1000
cvresult = xgb.cv(param_11, dtrain_11, num_round, nfold=5,verbose_eval = 1,show_stdv=False,
                        seed = 0, early_stopping_rounds=5)
print(cvresult.tail())
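    
The fixed round counts used later (566 for week 11, 392/400 for week 10) presumably come from runs of this CV cell; with early stopping, the usable round count can be read off the truncated result:
    
In [ ]:
    
# Hedged sketch: with early_stopping_rounds, xgb.cv returns rows only up to the
# best iteration, so the row count doubles as a num_round estimate.
num_round = cvresult.shape[0]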
    
    
In [24]:
    
param_11 = {'booster':'gbtree',
         'nthread': 10,
         'max_depth':5, 
         'eta':0.2,
         'silent':1,
         'subsample':0.7, 
         'objective':'reg:linear',
         'eval_metric':'rmse',
         'colsample_bytree':0.7}
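    
A version note: newer XGBoost releases renamed the 'reg:linear' objective to 'reg:squarederror'. On a current install the equivalent setting (likewise for param_10 below) would be:
    
In [ ]:
    
# Only needed on newer XGBoost versions; the original run used 'reg:linear'.
param_11['objective'] = 'reg:squarederror'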
    
In [28]:
    
num_round = 566
dtest_11 = xgb.DMatrix(test_dataset_normalize[predictors_11], missing=np.nan)
submission_11 = train_pivot_6789_to_11[['id']].copy()
for j in range(20):

    # bagging: each model trains on a fresh 2M-row sample of the normalized training set
    train_pivot_xgb_time2_sample = train_dataset_normalize[predictors_target_11].sample(2000000)
    train_feature_11 = train_pivot_xgb_time2_sample.drop(['target'],axis = 1)
    train_label_11 = train_pivot_xgb_time2_sample[['target']]
    dtrain_11 = xgb.DMatrix(train_feature_11,label = train_label_11,missing = np.nan)

    bst_11 = xgb.train(param_11, dtrain_11, num_round)
    print(str(j) + ' training finished!')
    submission_11['predict_' + str(j)] = bst_11.predict(dtest_11)
print('finished')
    
    
In [12]:
    
# make prediction and plot feature importances
dtest_11 = xgb.DMatrix(train_pivot_6789_to_11[predictors_11], missing=np.nan)
submission_11 = train_pivot_6789_to_11[['id']].copy()
submission_11['predict'] = bst_11.predict(dtest_11)
xgb.plot_importance(bst_11)
    
In [29]:
    
submission_11.to_csv('submission_11_new.csv')
    
In [25]:
    
submission_11 = pd.read_csv('submission_11_new.csv',index_col =0)
    
In [4]:
    
%ls
    
    
In [2]:
    
predictors_target_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces','target']
    
In [3]:
    
predictors_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces']
    
In [13]:
    
# z-score each column (same normalization as above, re-defined for the week-10 run)
f = lambda x : (x - x.mean()) / x.std(ddof=0)
    
In [14]:
    
def normalize_dataset_10(train_dataset,test_dataset):
    # same joint train/test normalization as normalize_dataset, for the week-10 predictors;
    # .copy() avoids mutating the source frames via chained assignment
    train_dataset_normalize = train_dataset[predictors_10].copy()
    train_dataset_normalize['label'] = 0

    test_dataset_normalize = test_dataset[predictors_10].copy()
    test_dataset_normalize['label'] = 1

    whole_dataset = pd.concat([train_dataset_normalize,test_dataset_normalize],copy = False)
    whole_dataset_normalize = whole_dataset.apply(f,axis = 0)

    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 0].copy()
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 1].copy()

    train_dataset_normalize.drop(['label'],axis = 1,inplace = True)
    test_dataset_normalize.drop(['label'],axis = 1,inplace = True)

    train_dataset_normalize['target'] = train_dataset['target']

    return train_dataset_normalize,test_dataset_normalize
    
In [5]:
    
# 'dtypes' is presumably a column->dtype mapping defined elsewhere in the full notebook
train_pivot_xgb_time1 = pd.read_csv('train_pivot_xgb_time1.csv',
                                    usecols = predictors_target_10,dtype = dtypes)
train_pivot_xgb_time1.reset_index(drop = True,inplace = True)
    
In [5]:
    
train_pivot_56789_to_10 = pd.read_pickle('train_pivot_56789_to_10_new.pickle')
train_pivot_56789_to_10.reset_index(drop = True,inplace = True)
    
In [7]:
    
train_pivot_56789_to_10.columns.values
    
    Out[7]:
In [6]:
    
train_pivot_xgb_time1.columns.values
    
    Out[6]:
In [17]:
    
# the week-10 submission cells below (In [21]) depend on these normalized frames
train_dataset_10_normalize, test_dataset_10_normalize = normalize_dataset_10(train_pivot_xgb_time1,
                                                                             train_pivot_56789_to_10)
    
In [ ]:
    
train_dataset_10_normalize.head()
    
In [8]:
    
param_10 = {'booster':'gbtree',
         'nthread': 7,
         'max_depth':5, 
         'eta':0.2,
         'silent':1,
         'subsample':0.7, 
         'objective':'reg:linear',
         'eval_metric':'rmse',
         'colsample_bytree':0.7}
    
In [6]:
    
# train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(1000000)
# train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'],axis = 1)
# train_label_10 = train_pivot_xgb_time1_sample[['target']]
# dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)
    
In [7]:
    
train_label_10 = train_pivot_xgb_time1['target']
train_feature_10 = train_pivot_xgb_time1.drop(['target'],axis = 1)
dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)
    
In [6]:
    
# num_round = 1500
# cvresult = xgb.cv(param_10, dtrain_10, num_round, nfold=5,show_stdv=False,
#                         seed = 0, early_stopping_rounds=5,show_progress = True)
# print(cvresult.tail())
    
In [15]:
    
# train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(100000)
# print train_pivot_xgb_time1_sample.index
# len(train_pivot_xgb_time1_sample.loc[train_pivot_xgb_time1_sample.index.values].index.drop_duplicates())
    
    
    Out[15]:
In [9]:
    
num_round = 400
d_train_pivot_xgb_time1 = xgb.DMatrix(train_pivot_xgb_time1[predictors_10], missing=np.nan)
gc.collect()
submission_10 = pd.DataFrame()
for i in range(40):
    train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(2000000)
    train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'],axis = 1)
    train_label_10 = train_pivot_xgb_time1_sample['target']
    dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing = np.nan)
    gc.collect()

    bst = xgb.train(param_10, dtrain_10, num_round)
    print(str(i) + ' training finished!')
    gc.collect()

    # mask the in-bag rows so each column keeps only out-of-bag predictions
    submission_10['predict_' + str(i)] = bst.predict(d_train_pivot_xgb_time1)
    submission_10.loc[train_pivot_xgb_time1_sample.index, 'predict_' + str(i)] = np.nan
    print(str(i) + ' predicting finished!')
    gc.collect()
print('finished')
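    
What this loop builds is an out-of-fold stacking feature: each of the 40 models trains on a 2M-row sample, predicts on the full training set, and its own in-bag rows are masked to NaN, so the row-wise mean taken in In [12] below averages each row only over models that never saw it. That works because DataFrame.mean skips NaN by default:
    
In [ ]:
    
# Tiny check (illustrative, not from the original run): mean(axis=1) ignores NaN.
print(pd.DataFrame({'predict_0': [1.0], 'predict_1': [np.nan]}).mean(axis=1))  # -> 1.0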
    
    
In [10]:
    
submission_10.head()
    
    Out[10]:
In [12]:
    
submission_10['predict'] = submission_10[['predict_' + str(i) for i in range(40)]].mean(axis=1)
    
In [14]:
    
submission_10.head()
    
    Out[14]:
In [16]:
    
submission_10_final = submission_10['predict']
    
In [17]:
    
submission_10_final.head()
    
    Out[17]:
In [18]:
    
submission_10_final.to_csv('stack_train_xgb_10.csv')
    
In [21]:
    
num_round = 392
dtest_10 = xgb.DMatrix(test_dataset_10_normalize[predictors_10], missing=np.nan)
submission_10 = train_pivot_56789_to_10[['id']].copy()
for i in range(20):
    train_pivot_xgb_time1_sample = train_dataset_10_normalize[predictors_target_10].sample(2000000)
    train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'],axis = 1)
    train_label_10 = train_pivot_xgb_time1_sample[['target']]
    dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing = np.nan)

    bst = xgb.train(param_10, dtrain_10, num_round)
    print(str(i) + ' training finished!')
    submission_10['predict_' + str(i)] = bst.predict(dtest_10)
    print(str(i) + ' predicting finished!')
print('finished')
    
    
In [22]:
    
submission_10.to_csv('submission_10_new.csv')
    
In [26]:
    
# make prediction
xgb.plot_importance(bst)
    
    Out[26]:
In [12]:
    
submission_10 = pd.read_csv('submission_10.csv',index_col = 0)
    
In [22]:
    
submission_10.shape
    
    Out[22]:
In [23]:
    
submission_10.columns.values
    
    Out[23]:
In [26]:
    
submission_11.columns.values
    
    Out[26]:
In [27]:
    
submission = pd.concat([submission_10,submission_11],axis = 0)
    
In [28]:
    
submission.head()
    
    Out[28]:
In [29]:
    
submission['predict'] = submission[['predict_' + str(i) for i in range(20)]].mean(axis=1)
    
In [30]:
    
submission.head()
    
    Out[30]:
In [31]:
    
submission.rename(columns = {'predict':'Demanda_uni_equil'},inplace = True)
    
In [32]:
    
submission['Demanda_uni_equil'] = submission['Demanda_uni_equil'].apply(np.expm1)
submission.head()
    
    Out[32]:
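    
The np.expm1 above inverts the log1p scale the target was presumably trained on: modeling log(1 + demand) keeps the RMSE objective close to the competition's RMSLE metric, and expm1 maps predictions back to demand units.
    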
In [33]:
    
submission_final = submission[['id','Demanda_uni_equil']].copy()
    
In [34]:
    
submission_final['Demanda_uni_equil'] = submission_final['Demanda_uni_equil'].round(1)
    
In [35]:
    
submission_final.head()
    
    Out[35]:
In [36]:
    
submission_final.to_csv('submission_xgb_2.csv',index = False)
    
In [84]:
    
test_id = pd.read_csv('origin/test.csv',usecols = ['id'])
    
In [88]:
    
test_id['id'].dtype
    
    Out[88]:
In [91]:
    
len(np.intersect1d(list(submission['id']), list(test_id['id'])))
    
    Out[91]:
In [22]:
    
result_no_cli_pro_in_common = pd.read_csv('origin/results1.csv')
result_no_cli_pro_in_common.head()
    
    Out[22]:
In [23]:
    
# ~ negates the boolean mask: keep only the test ids not already covered by the xgb predictions
result_no_cli_pro_in_common = result_no_cli_pro_in_common[~result_no_cli_pro_in_common['id'].isin(np.array(submission['id']))]
result_no_cli_pro_in_common.head()
    
    Out[23]:
In [24]:
    
result_no_cli_pro_in_common.shape
    
    Out[24]:
In [30]:
    
submission_final = pd.concat([submission[['id','Demanda_uni_equil']],result_no_cli_pro_in_common],axis = 0)
    
In [31]:
    
submission_final['Demanda_uni_equil'] = submission_final['Demanda_uni_equil'].round(1)
    
In [32]:
    
submission_final.shape
    
    Out[32]:
In [33]:
    
submission_final.to_csv('submission_xgb.csv',index = False)
    