In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction import FeatureHasher
from scipy.sparse import coo_matrix,csr_matrix,csc_matrix, hstack
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn import linear_model
import gc
from sklearn import preprocessing
In [2]:
%ls
In [18]:
predictors_target_11 = ['LR_prod', 'LR_prod_corr',
'NombreCliente',
'agen_cliente_for_log_de', 'agen_for_log_de',
'agen_producto_for_log_de', 'agen_ruta_for_log_de',
'cliente_for_log_de', 'cliente_for_log_sum',
'cliente_producto_for_log_de', 'corr', 'pieces',
'producto_for_log_de', 'ruta_cliente_for_log_de', 'ruta_for_log_de',
'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
't_min_5', 'target', 'weight', 'weight_per_piece']
In [19]:
predictors_11 = ['LR_prod', 'LR_prod_corr',
'NombreCliente',
'agen_cliente_for_log_de', 'agen_for_log_de',
'agen_producto_for_log_de', 'agen_ruta_for_log_de',
'cliente_for_log_de', 'cliente_for_log_sum',
'cliente_producto_for_log_de', 'corr', 'pieces',
'producto_for_log_de', 'ruta_cliente_for_log_de', 'ruta_for_log_de',
'ruta_producto_for_log_de', 't2_min_t3', 't2_min_t4', 't2_min_t5',
't3_min_t4', 't3_min_t5', 't4_min_t5', 't_m_2_cum', 't_m_3_cum',
't_m_4_cum', 't_m_5_cum', 't_min_2', 't_min_3', 't_min_4',
't_min_5', 'weight', 'weight_per_piece']
In [10]:
# z-score normalization using the population standard deviation (ddof=0)
f = lambda x: (x - x.mean()) / x.std(ddof=0)
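A quick check of f on a toy Series (a sketch, not part of the original run): the result should have zero mean and unit population variance.
In [ ]:
s = pd.Series([1.0, 2.0, 3.0, 4.0])
s_norm = f(s)
print(s_norm.mean(), s_norm.std(ddof=0))  # approximately 0.0 and 1.0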
In [14]:
train_pivot_xgb_time2 = pd.read_csv('train_pivot_xgb_time2.csv',index_col = 0)
In [7]:
train_pivot_6789_to_11 = pd.read_pickle('train_pivot_6789_to_11_new.pickle')
In [8]:
train_pivot_xgb_time2.head()
Out[8]:
In [15]:
train_pivot_xgb_time2.columns.values
Out[15]:
In [4]:
def normalize_dataset(train_dataset, test_dataset):
    # Concatenate train and test so both are z-scored with the same statistics,
    # then split them back apart via a temporary 'label' column.
    train_dataset_normalize = train_dataset[predictors_11].copy()
    train_dataset_normalize['label'] = 0
    test_dataset_normalize = test_dataset[predictors_11].copy()
    test_dataset_normalize['label'] = 1
    whole_dataset = pd.concat([train_dataset_normalize, test_dataset_normalize])
    whole_dataset_normalize = whole_dataset.apply(f, axis=0)
    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 0]
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 1]
    train_dataset_normalize.drop(['label'], axis=1, inplace=True)
    test_dataset_normalize.drop(['label'], axis=1, inplace=True)
    # The target itself is not normalized; copy it back onto the train split.
    train_dataset_normalize['target'] = train_dataset['target'].copy()
    return train_dataset_normalize, test_dataset_normalize
In [21]:
train_dataset_normalize, test_dataset_normalize = normalize_dataset(train_pivot_xgb_time2,train_pivot_6789_to_11)
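Because train and test were concatenated before applying f, both splits share identical scaling statistics; a quick sanity check (a sketch, assuming the cells above have run):
In [ ]:
combined = pd.concat([train_dataset_normalize[predictors_11],
                      test_dataset_normalize[predictors_11]])
print(combined.mean().abs().max())  # ~0: each column was z-scored jointly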
In [22]:
train_dataset_normalize.head()
Out[22]:
In [26]:
train_pivot_xgb_time2_sample = train_dataset_normalize.sample(2000000)
train_feature_11 = train_pivot_xgb_time2_sample.drop(['target'],axis = 1)
train_label_11 = train_pivot_xgb_time2_sample[['target']]
dtrain_11 = xgb.DMatrix(train_feature_11,label = train_label_11,missing=np.nan)
In [24]:
param_11 = {'booster': 'gbtree',
            'nthread': 10,
            'max_depth': 5,
            'eta': 0.2,
            'silent': 1,
            'subsample': 0.7,
            'objective': 'reg:linear',
            'eval_metric': 'rmse',
            'colsample_bytree': 0.7}
In [27]:
num_round = 1000
cvresult = xgb.cv(param_11, dtrain_11, num_round, nfold=5, verbose_eval=1,
                  show_stdv=False, seed=0, early_stopping_rounds=5)
print(cvresult.tail())
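The fixed num_round = 566 below is presumably taken from the cross-validation above; a minimal sketch of how it could be read off (xgb.cv with early_stopping_rounds returns one row per kept boosting round):
In [ ]:
num_round = cvresult.shape[0]  # rounds kept by early stopping
print(num_round)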
In [28]:
num_round = 566
dtest_11 = xgb.DMatrix(test_dataset_normalize[predictors_11], missing=np.nan)
submission_11 = train_pivot_6789_to_11[['id']].copy()
# Bag 20 models, each trained on a fresh 2M-row sample of the training data.
for j in range(20):
    train_pivot_xgb_time2_sample = train_dataset_normalize[predictors_target_11].sample(2000000)
    train_feature_11 = train_pivot_xgb_time2_sample.drop(['target'], axis=1)
    train_label_11 = train_pivot_xgb_time2_sample[['target']]
    dtrain_11 = xgb.DMatrix(train_feature_11, label=train_label_11, missing=np.nan)
    bst_11 = xgb.train(param_11, dtrain_11, num_round)
    print(str(j) + ' training finished!')
    submission_11['predict_' + str(j)] = bst_11.predict(dtest_11)
print('finished')
In [12]:
# make predictions with the last bagged model and inspect feature importance
dtest_11 = xgb.DMatrix(train_pivot_6789_to_11[predictors_11], missing=np.nan)
submission_11 = train_pivot_6789_to_11[['id']].copy()
submission_11['predict'] = bst_11.predict(dtest_11)
xgb.plot_importance(bst_11)
In [29]:
submission_11.to_csv('submission_11_new.csv')
In [25]:
submission_11 = pd.read_csv('submission_11_new.csv',index_col =0)
In [4]:
%ls
In [2]:
predictors_target_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
'producto_for_log_de', 'agen_ruta_for_log_de',
'agen_cliente_for_log_de', 'agen_producto_for_log_de',
'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
'weight_per_piece', 'pieces','target']
In [3]:
predictors_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
'producto_for_log_de', 'agen_ruta_for_log_de',
'agen_cliente_for_log_de', 'agen_producto_for_log_de',
'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
'weight_per_piece', 'pieces']
In [13]:
# z-score normalization using the population standard deviation (ddof=0)
f = lambda x: (x - x.mean()) / x.std(ddof=0)
In [14]:
def normalize_dataset_10(train_dataset, test_dataset):
    # Same joint-normalization trick as normalize_dataset, for the week-10 features.
    # .copy() avoids SettingWithCopyWarning and mutating the source frames.
    train_dataset_normalize = train_dataset[predictors_10].copy()
    train_dataset_normalize['label'] = 0
    test_dataset_normalize = test_dataset[predictors_10].copy()
    test_dataset_normalize['label'] = 1
    whole_dataset = pd.concat([train_dataset_normalize, test_dataset_normalize], copy=False)
    whole_dataset_normalize = whole_dataset.apply(f, axis=0)
    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 0]
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 1]
    train_dataset_normalize.drop(['label'], axis=1, inplace=True)
    test_dataset_normalize.drop(['label'], axis=1, inplace=True)
    train_dataset_normalize['target'] = train_dataset['target']
    return train_dataset_normalize, test_dataset_normalize
In [5]:
# dtypes: per-column dtype mapping defined elsewhere (not shown in this notebook)
train_pivot_xgb_time1 = pd.read_csv('train_pivot_xgb_time1.csv',
                                    usecols=predictors_target_10, dtype=dtypes)
train_pivot_xgb_time1.reset_index(drop = True,inplace = True)
In [5]:
train_pivot_56789_to_10 = pd.read_pickle('train_pivot_56789_to_10_new.pickle')
train_pivot_56789_to_10.reset_index(drop = True,inplace = True)
In [7]:
train_pivot_56789_to_10.columns.values
Out[7]:
In [6]:
train_pivot_xgb_time1.columns.values
Out[6]:
In [17]:
# train_dataset_10_normalize, test_dataset_10_normalize = normalize_dataset_10(train_pivot_xgb_time1,
# train_pivot_56789_to_10)
In [ ]:
train_dataset_10_normalize.head()
In [8]:
param_10 = {'booster':'gbtree',
'nthread': 7,
'max_depth':5,
'eta':0.2,
'silent':1,
'subsample':0.7,
'objective':'reg:linear',
'eval_metric':'rmse',
'colsample_bytree':0.7}
In [6]:
# train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(1000000)
# train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'],axis = 1)
# train_label_10 = train_pivot_xgb_time1_sample[['target']]
# dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)
In [7]:
train_label_10 = train_pivot_xgb_time1['target']
train_feature_10 = train_pivot_xgb_time1.drop(['target'],axis = 1)
dtrain_10 = xgb.DMatrix(train_feature_10,label = train_label_10,missing= np.nan)
In [6]:
# num_round = 1500
# cvresult = xgb.cv(param_10, dtrain_10, num_round, nfold=5,show_stdv=False,
# seed = 0, early_stopping_rounds=5,show_progress = True)
# print(cvresult.tail())
In [15]:
# train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(100000)
# print train_pivot_xgb_time1_sample.index
# len(train_pivot_xgb_time1_sample.loc[train_pivot_xgb_time1_sample.index.values].index.drop_duplicates())
In [9]:
num_round = 400
d_train_pivot_xgb_time1 = xgb.DMatrix(train_pivot_xgb_time1[predictors_10], missing=np.nan)
gc.collect()
# Out-of-fold stacking: train 40 bagged models; for each one, predict the whole
# training set but blank out (NaN) the rows it was trained on, so the averaged
# prediction for every row comes only from models that never saw that row.
submission_10 = pd.DataFrame()
for i in range(40):
    train_pivot_xgb_time1_sample = train_pivot_xgb_time1[predictors_target_10].sample(2000000)
    train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'], axis=1)
    train_label_10 = train_pivot_xgb_time1_sample['target']
    dtrain_10 = xgb.DMatrix(train_feature_10, label=train_label_10, missing=np.nan)
    gc.collect()
    bst = xgb.train(param_10, dtrain_10, num_round)
    print(str(i) + ' training finished!')
    gc.collect()
    submission_10['predict_' + str(i)] = bst.predict(d_train_pivot_xgb_time1)
    # assign through .loc on the frame (not chained indexing) so it sticks
    submission_10.loc[train_pivot_xgb_time1_sample.index, 'predict_' + str(i)] = np.nan
    print(str(i) + ' predicting finished!')
    gc.collect()
print('finished')
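A toy illustration (a sketch, not part of the original run) of why the NaN masking works: the row-wise mean taken below skips NaN, so each row is averaged only over the models that never trained on it.
In [ ]:
demo = pd.DataFrame({'predict_0': [1.0, np.nan], 'predict_1': [np.nan, 3.0]})
print(demo.mean(axis=1))  # row 0 -> 1.0, row 1 -> 3.0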
In [10]:
submission_10.head()
Out[10]:
In [12]:
# row-wise mean skips NaN by default, so each row is averaged only over the
# models that did not train on it
submission_10['predict'] = submission_10[['predict_' + str(i) for i in range(40)]].mean(axis=1)
In [14]:
submission_10.head()
Out[14]:
In [16]:
submission_10_final = submission_10['predict']
In [17]:
submission_10_final.head()
Out[17]:
In [18]:
submission_10_final.to_csv('stack_train_xgb_10.csv')
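The file written above holds the out-of-fold training predictions, the usual input for a second-level stacking model. A hypothetical sketch of loading it back (the column names here are illustrative, not from the original run):
In [ ]:
stack_train_10 = pd.read_csv('stack_train_xgb_10.csv', header=None,
                             index_col=0, names=['row', 'oof_predict'])
stack_train_10.head()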
In [21]:
num_round = 392
dtest_10 = xgb.DMatrix(test_dataset_10_normalize[predictors_10], missing=np.nan)
submission_10 = train_pivot_56789_to_10[['id']].copy()
# Bag 20 models on fresh samples and predict the week-10 test set with each.
for i in range(20):
    train_pivot_xgb_time1_sample = train_dataset_10_normalize[predictors_target_10].sample(2000000)
    train_feature_10 = train_pivot_xgb_time1_sample.drop(['target'], axis=1)
    train_label_10 = train_pivot_xgb_time1_sample[['target']]
    dtrain_10 = xgb.DMatrix(train_feature_10, label=train_label_10, missing=np.nan)
    bst = xgb.train(param_10, dtrain_10, num_round)
    print(str(i) + ' training finished!')
    submission_10['predict_' + str(i)] = bst.predict(dtest_10)
    print(str(i) + ' predicting finished!')
print('finished')
In [22]:
submission_10.to_csv('submission_10_new.csv')
In [26]:
# inspect feature importance of the last trained model
xgb.plot_importance(bst)
Out[26]:
In [12]:
submission_10 = pd.read_csv('submission_10.csv',index_col = 0)
In [22]:
submission_10.shape
Out[22]:
In [23]:
submission_10.columns.values
Out[23]:
In [26]:
submission_11.columns.values
Out[26]:
In [27]:
submission = pd.concat([submission_10,submission_11],axis = 0)
In [28]:
submission.head()
Out[28]:
In [29]:
# average the 20 bagged predictions per row
submission['predict'] = submission[['predict_' + str(i) for i in range(20)]].mean(axis=1)
In [30]:
submission.head()
Out[30]:
In [31]:
submission.rename(columns = {'predict':'Demanda_uni_equil'},inplace = True)
In [32]:
# invert the log1p transform that was applied to the demand target
submission['Demanda_uni_equil'] = submission['Demanda_uni_equil'].apply(np.expm1)
submission.head()
Out[32]:
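Since expm1 is the exact inverse of log1p, this recovers demand on its original scale (the target was presumably modeled as log1p(demand)); a quick round-trip check (sketch):
In [ ]:
x = 7.0
print(np.expm1(np.log1p(x)))  # 7.0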
In [33]:
submission_final = submission[['id','Demanda_uni_equil']].copy()
In [34]:
submission_final['Demanda_uni_equil'] = submission_final['Demanda_uni_equil'].round(1)
In [35]:
submission_final.head()
Out[35]:
In [36]:
submission_final.to_csv('submission_xgb_2.csv',index = False)
In [84]:
test_id = pd.read_csv('origin/test.csv',usecols = ['id'])
In [88]:
test_id['id'].dtype
Out[88]:
In [91]:
# sanity check: count submission ids that also appear in the official test set
len(np.intersect1d(list(submission['id']), list(test_id['id'])))
Out[91]:
In [22]:
result_no_cli_pro_in_common = pd.read_csv('origin/results1.csv')
result_no_cli_pro_in_common.head()
Out[22]:
In [23]:
# keep only rows whose id is NOT covered by the model predictions (~ is boolean NOT)
result_no_cli_pro_in_common = result_no_cli_pro_in_common[~result_no_cli_pro_in_common['id'].isin(np.array(submission['id']))]
result_no_cli_pro_in_common.head()
Out[23]:
In [24]:
result_no_cli_pro_in_common.shape
Out[24]:
In [30]:
# append fallback predictions for ids with no client/product overlap in train
submission_final = pd.concat([submission[['id', 'Demanda_uni_equil']], result_no_cli_pro_in_common], axis=0)
In [31]:
submission_final['Demanda_uni_equil'] = submission_final['Demanda_uni_equil'].round(1)
In [32]:
submission_final.shape
Out[32]:
In [33]:
submission_final.to_csv('submission_xgb.csv',index = False)