In [1]:
import numpy as np
import pandas as pd
import theano
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from sklearn import cross_validation
from sklearn.preprocessing import StandardScaler
import time
import gc
In [2]:
predictors_target_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
'producto_for_log_de', 'agen_ruta_for_log_de',
'agen_cliente_for_log_de', 'agen_producto_for_log_de',
'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
'weight_per_piece', 'pieces', 'target']
In [3]:
predictors_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
'producto_for_log_de', 'agen_ruta_for_log_de',
'agen_cliente_for_log_de', 'agen_producto_for_log_de',
'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
'weight_per_piece', 'pieces']
In [4]:
dtypes = {'agen_for_log_de':'float32',
'ruta_for_log_de':'float32',
'cliente_for_log_de':'float32',
'producto_for_log_de':'float32',
'agen_ruta_for_log_de':'float32',
'agen_cliente_for_log_de':'float32',
'agen_producto_for_log_de':'float32',
'ruta_cliente_for_log_de':'float32',
'ruta_producto_for_log_de':"float32",
'cliente_producto_for_log_de':'float32',
'cliente_for_log_sum':'float32',
'corr':'float32',
't_min_1':'float32',
't_min_2':'float32',
't_min_3':'float32',
't_min_4':'float32',
't_min_5':'float32',
't1_min_t2':'float32',
't1_min_t3':'float32',
't1_min_t4':'float32',
't1_min_t5':'float32',
't2_min_t3':'float32',
't2_min_t4':'float32',
't2_min_t5':'float32',
't3_min_t4':'float32',
't3_min_t5':'float32',
't4_min_t5':'float32',
'LR_prod':'float32',
'LR_prod_corr':'float32',
'target':'float32',
't_m_5_cum':'float32',
't_m_4_cum':'float32',
't_m_3_cum':'float32',
't_m_2_cum':'float32',
't_m_1_cum':'float32',
'NombreCliente':'int32',
'weight':'float32',
'weight_per_piece':'float32',
'pieces':'float32'}
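Reading every numeric feature as float32 (rather than pandas' default float64) halves the memory footprint, which matters at this dataset's scale. A quick comparison along these lines (an illustrative sketch, to be run after train_pivot_xgb_time1 is loaded in In [7] below) shows the saving:
In [ ]:
# illustrative: compare the float32 frame against a float64 copy of it
mem_32 = train_pivot_xgb_time1.memory_usage(deep=True).sum()
mem_64 = train_pivot_xgb_time1.astype(np.float64).memory_usage(deep=True).sum()
print('float32: %d bytes, float64: %d bytes' % (mem_32, mem_64))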
In [ ]:
# column-wise z-score using the population standard deviation (ddof=0)
f = lambda x: (x - x.mean()) / x.std(ddof=0)
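This lambda z-scores a column with the population standard deviation (ddof=0), which matches what sklearn's StandardScaler (imported above) computes. A minimal equivalence check, on a toy column:
In [ ]:
# illustrative: the lambda agrees with StandardScaler on a toy Series
s = pd.Series([1.0, 2.0, 3.0, 4.0])
scaled = StandardScaler().fit_transform(s.values.reshape(-1, 1)).ravel()
print(np.allclose(f(s).values, scaled))  # True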
In [6]:
def normalize_dataset_10(train_dataset, test_dataset):
    # z-score train and test together so both share the same column statistics
    train_dataset_normalize = train_dataset[predictors_10].copy()
    train_dataset_normalize['label'] = 0
    test_dataset_normalize = test_dataset[predictors_10].copy()
    test_dataset_normalize['label'] = 1
    whole_dataset = pd.concat([train_dataset_normalize, test_dataset_normalize], copy=False)
    whole_dataset_normalize = whole_dataset.apply(f, axis=0)
    # split back apart using the untouched label column of the un-normalized frame
    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 0].copy()
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset['label'] == 1].copy()
    train_dataset_normalize.drop(['label'], axis=1, inplace=True)
    test_dataset_normalize.drop(['label'], axis=1, inplace=True)
    # the target is carried through un-normalized
    train_dataset_normalize['target'] = train_dataset['target']
    return train_dataset_normalize, test_dataset_normalize
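Train and test are deliberately normalized together, so both frames end up on the scale of the combined distribution and the network sees test features in the same units it was trained on. The same idea expressed with StandardScaler (a hypothetical alternative, not the code used here):
In [ ]:
# hypothetical alternative: one scaler fit on the stacked frames, applied to each separately
def normalize_with_scaler(train_df, test_df, cols):
    scaler = StandardScaler()
    scaler.fit(pd.concat([train_df[cols], test_df[cols]]))
    train_out = pd.DataFrame(scaler.transform(train_df[cols]), columns=cols)
    test_out = pd.DataFrame(scaler.transform(test_df[cols]), columns=cols)
    return train_out, test_out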
In [7]:
# training frame with the engineered features, read with the compact dtypes above
train_pivot_xgb_time1 = pd.read_csv('train_pivot_xgb_time1.csv',
                                    usecols=predictors_target_10, dtype=dtypes)
train_pivot_xgb_time1.reset_index(drop=True, inplace=True)
In [8]:
train_pivot_56789_to_10 = pd.read_pickle('train_pivot_56789_to_10_new.pickle')
train_pivot_56789_to_10.reset_index(drop = True,inplace = True)
In [9]:
train_dataset_10_normalize, test_dataset_10_normalize = normalize_dataset_10(train_pivot_xgb_time1,
train_pivot_56789_to_10)
In [23]:
# train_pivot_xgb_time1 = train_pivot_xgb_time1.sample(1000)
# train_pivot_xgb_time1.reset_index(drop = True,inplace = True)
In [13]:
train_dataset_10_normalize.fillna(-1,inplace = True)
In [14]:
# 5-fold split for generating out-of-fold predictions to use as a stacking feature
k_fold = cross_validation.KFold(n=train_dataset_10_normalize.shape[0], n_folds=5)
a = np.zeros(shape=(train_dataset_10_normalize.shape[0], 1))
stack_submission_nn_10 = pd.DataFrame(a, columns=['predict'])
In [15]:
for train_indices, test_indices in k_fold:
    # create the model
    model = Sequential()
    model.add(Dense(128, input_dim=38, init='normal', activation='relu', W_regularizer=l2(0.00000001)))
    # model.add(Dropout(0.3))
    model.add(Dense(64, init='normal', activation='relu'))
    # model.add(Dropout(0.3))
    model.add(Dense(32, init='normal', activation='relu'))
    # model.add(Dropout(0.3))
    model.add(Dense(8, init='normal', activation='relu'))
    # model.add(Dropout(0.3))
    model.add(Dense(1, init='normal', activation='linear'))
    # compile the model
    model.compile(loss='mean_squared_error', optimizer='adam')
    # fit on the four training folds
    model.fit(train_dataset_10_normalize.loc[train_indices, predictors_10].as_matrix(),
              train_dataset_10_normalize.loc[train_indices, 'target'].as_matrix(),
              nb_epoch=100, shuffle=True, batch_size=128, validation_split=0, verbose=2)
    print('model fit finished')
    # predict the held-out fold into the out-of-fold frame
    stack_submission_nn_10.loc[test_indices] = model.predict(
        train_dataset_10_normalize.loc[test_indices, predictors_10].as_matrix(),
        batch_size=128, verbose=2)
    print('model predict finished')
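Each fold trains a fresh network on four fifths of the data and predicts the held-out fifth, so stack_submission_nn_10 ends up holding only out-of-fold predictions, suitable as a stacking feature. The same five-layer architecture is built again for the full-data fit further down; a small factory function (hypothetical, shown only to make the duplication explicit) would keep the two definitions in sync:
In [ ]:
# hypothetical helper to avoid defining the architecture twice (Keras 1.x API, as used above)
def build_model(input_dim=38, l2_weight=1e-8):
    m = Sequential()
    m.add(Dense(128, input_dim=input_dim, init='normal', activation='relu', W_regularizer=l2(l2_weight)))
    m.add(Dense(64, init='normal', activation='relu'))
    m.add(Dense(32, init='normal', activation='relu'))
    m.add(Dense(8, init='normal', activation='relu'))
    m.add(Dense(1, init='normal', activation='linear'))
    m.compile(loss='mean_squared_error', optimizer='adam')
    return m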
In [16]:
stack_submission_nn_10.head()
Out[16]:
In [17]:
stack_submission_nn_10.to_pickle('stack_train_nn_10.pickle')
In [9]:
train_dataset_10_normalize.columns.values
Out[9]:
In [10]:
test_dataset_10_normalize.shape
Out[10]:
In [11]:
train_nn_time1 = train_dataset_10_normalize[predictors_10].copy()
label_nn_time1 = train_dataset_10_normalize['target'].copy()
In [12]:
train_nn_time1.fillna(-1,inplace = True)
# train_nn_time1.fillna(0,inplace = True)
In [13]:
train_nn_time1 = train_nn_time1.as_matrix()
label_nn_time1 = label_nn_time1.as_matrix()
In [18]:
test_dataset_10_normalize.fillna(-1,inplace = True)
In [19]:
test_nn_time1 = test_dataset_10_normalize.as_matrix()
In [14]:
from sklearn.utils import shuffle
train_nn_time1, label_nn_time1 = shuffle(train_nn_time1, label_nn_time1, random_state=42)
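Shuffling before the final fit matters because Keras's validation_split takes the last 20% of the arrays as-is; without the shuffle, the validation slice would be whatever rows happen to sit at the end of the frame. An explicit hold-out split would achieve the same thing (a sketch using the era's sklearn API, not the code used here):
In [ ]:
# illustrative: an explicit 80/20 split instead of relying on validation_split
from sklearn.cross_validation import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(train_nn_time1, label_nn_time1,
                                            test_size=0.2, random_state=42)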
In [99]:
# build the architecture (Keras 1.x API); on later runs it is replaced by the checkpoint below
model = Sequential()
model.add(Dense(128, input_dim=38, init='normal', activation='relu', W_regularizer=l2(0.00000001)))
# model.add(Dropout(0.3))
model.add(Dense(64, init='normal', activation='relu'))
# model.add(Dropout(0.3))
model.add(Dense(32, init='normal', activation='relu'))
# model.add(Dropout(0.3))
model.add(Dense(8, init='normal', activation='relu'))
# model.add(Dropout(0.3))
model.add(Dense(1, init='normal', activation='linear'))
# resume training from the saved checkpoint (this overwrites the freshly built model above)
model = load_model('model_nn_10_after_l2reg.h5')
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(train_nn_time1, label_nn_time1, nb_epoch=100, shuffle=True,
          batch_size=128, validation_split=0.2, verbose=2)
time.sleep(0.1)
In [100]:
model.save('model_nn_10_after_l2reg.h5')
In [75]:
submission_nn_10 = model.predict(test_nn_time1, batch_size=128, verbose=0)
In [76]:
submission_nn_10_whole = pd.DataFrame()
In [77]:
submission_nn_10_whole['id'] = train_pivot_56789_to_10['id'].copy()
submission_nn_10_whole['predict'] = pd.DataFrame(submission_nn_10)
In [78]:
submission_nn_10_whole.head()
Out[78]:
In [74]:
from keras.models import load_model
model = load_model('model_nn_10_after_l2reg.h5')
In [27]:
%ls
In [79]:
# week-11 XGB submission: average the 20 bagged prediction columns into one
submission_xgb_11 = pd.read_csv('submission_11_new.csv', index_col=0)
submission_xgb_11['predict'] = submission_xgb_11[['predict_' + str(i) for i in range(20)]].mean(axis=1)
submission_xgb_11.drop(['predict_' + str(i) for i in range(20)], axis=1, inplace=True)
submission_xgb_11.head()
Out[79]:
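Note that the 20 bagged predictions are averaged while still in log1p space, which behaves like a geometric mean of the demand forecasts rather than an arithmetic one. A small illustration:
In [ ]:
# illustrative: averaging in log1p space vs. averaging in demand units
p = np.array([np.log1p(2.0), np.log1p(8.0)])
print(np.expm1(p.mean()))   # ~4.20: geometric mean of (1 + demand), minus 1
print(np.expm1(p).mean())   # 5.0: plain arithmetic mean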
In [88]:
submission_xgb_11['predict'].describe()
Out[88]:
In [80]:
np.expm1(0.1)
Out[80]:
In [90]:
# stack the week-10 NN predictions on top of the week-11 XGB predictions
submission_nn = pd.concat([submission_nn_10_whole, submission_xgb_11], axis=0, copy=True)
In [91]:
# floor negative log-space predictions at a small positive value before the inverse transform
mask = submission_nn[submission_nn['predict'] < 0].index
submission_nn.loc[mask, 'predict'] = 0.001
submission_nn['predict'].describe()
Out[91]:
In [92]:
# invert the log1p target transform and rename to the submission column
submission_nn['predict'] = submission_nn['predict'].apply(np.expm1)
submission_nn.rename(columns={'predict': 'Demanda_uni_equil'}, inplace=True)
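The network was trained on log1p-transformed demand, so expm1 maps predictions back to units; the 0.001 floor applied above in log space becomes a demand of roughly 0.001 after the inverse transform, keeping the submission strictly positive. A quick round-trip check:
In [ ]:
# illustrative: expm1 exactly inverts the log1p target transform
x = np.array([0.0, 1.0, 7.0])
print(np.expm1(np.log1p(x)))   # [ 0.  1.  7.]
print(np.expm1(0.001))         # ~0.0010005, the effective floor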
In [84]:
submission_nn.head()
Out[84]:
In [93]:
submission_nn['Demanda_uni_equil'] = submission_nn['Demanda_uni_equil'].round(1)
In [94]:
submission_nn['Demanda_uni_equil'].describe()
Out[94]:
In [95]:
submission_nn.to_csv('submission_nn_2.csv',index = False)
In [66]:
# clip negative predictions to zero before inspecting the distribution
mask = submission_nn_10[submission_nn_10['predict'] < 0].index
submission_nn_10.loc[mask, 'predict'] = 0
submission_nn_10['predict'].describe()
In [84]:
submission_nn_10.sort_values('id', inplace=True)
submission_nn_10.reset_index(inplace=True, drop=True)
In [87]:
submission_nn_10.head()
Out[87]:
In [21]:
%ls
In [97]:
# week-10 XGB submission: average the 20 bagged prediction columns into one
submission_xgb_10 = pd.read_csv('submission_10_new.csv', index_col=0)
submission_xgb_10['predict'] = submission_xgb_10[['predict_' + str(i) for i in range(20)]].mean(axis=1)
submission_xgb_10.drop(['predict_' + str(i) for i in range(20)], axis=1, inplace=True)
In [98]:
submission_xgb_10.sort_values('id', inplace=True)
submission_xgb_10.reset_index(inplace=True, drop=True)
In [99]:
submission_xgb_10.head()
Out[99]:
In [102]:
# both frames are now sorted by id with fresh indexes, so this aligns row-for-row
submission_xgb_10['nn_predict'] = submission_nn_10['predict']
In [130]:
# blend in log1p space: 80% XGB, 20% NN
submission_xgb_10['combine_predict'] = submission_xgb_10['predict'] * 0.8 + submission_xgb_10['nn_predict'] * 0.2
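The 0.8/0.2 weights lean on the stronger XGB model while letting the NN nudge the forecast, and since both columns are still in log1p space this is again a geometric-style blend in demand units. A hypothetical parameterized version makes scanning other weights easy:
In [ ]:
# hypothetical helper for trying different blend weights
def blend(df, w_xgb, xgb_col='predict', nn_col='nn_predict'):
    return df[xgb_col] * w_xgb + df[nn_col] * (1.0 - w_xgb)

# e.g. submission_xgb_10['combine_predict'] = blend(submission_xgb_10, 0.8)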
In [131]:
submission_xgb_10.head()
Out[131]:
In [140]:
submission_xgb_10.drop(['combine_predict'],axis =1, inplace = True)
submission_xgb_10.rename(columns = {'predict': 'xgb_predict'},inplace = True)
submission_xgb_10.head()
Out[140]:
In [142]:
submission_xgb_10.to_pickle('submission_xgb_nn_10.pickle')
In [128]:
submission_xgb_10.shape
Out[128]:
In [132]:
submission_10 = submission_xgb_10[['id', 'combine_predict']].copy()
# invert the log1p transform back to demand units
submission_10['combine_predict'] = submission_10['combine_predict'].apply(np.expm1)
submission_10.rename(columns={'combine_predict': 'Demanda_uni_equil'}, inplace=True)
In [133]:
submission_10.head()
Out[133]:
In [134]:
submission_xgb_11 = pd.read_csv('submission_11_new.csv',index_col = 0)
In [135]:
# average the 20 bagged predictions, then invert the log1p transform
submission_xgb_11['Demanda_uni_equil'] = submission_xgb_11[['predict_' + str(i) for i in range(20)]].mean(axis=1)
submission_xgb_11.drop(['predict_' + str(i) for i in range(20)], axis=1, inplace=True)
submission_xgb_11['Demanda_uni_equil'] = submission_xgb_11['Demanda_uni_equil'].apply(np.expm1)
submission_xgb_11.head()
Out[135]:
In [136]:
submission_final = pd.concat([submission_10,submission_xgb_11],axis = 0)
submission_final.head()
Out[136]:
In [137]:
submission_final['Demanda_uni_equil'] = submission_final['Demanda_uni_equil'].round(1)
In [138]:
submission_final.to_csv('submission_nn_xgb.csv', index=False)