In [1]:
# candidate regressors for this notebook:
# sklearn.ensemble.RandomForestRegressor
# sklearn.ensemble.ExtraTreesRegressor
# sklearn.svm.SVR

In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import grid_search
from sklearn.externals import joblib
import gc

In [3]:
%ls


1_predata.ipynb                   preprocessed_products.csv
3_xgb.ipynb                       ruta_for_cliente_producto.csv
3_xgb_prediction.ipynb            stack_sub/
44fea_bst.model                   submission_10_new.csv
4_keras_nn.ipynb                  submission_11_new.csv
5_random_forest.ipynb             submission_44fea.csv
6_stack_model.ipynb               submission_nn.csv
agencia_for_cliente_producto.csv  submission_nn_xgb
canal_for_cliente_producto.csv    train_pivot_56789_to_10_44fea.pickle
model_nn_10_after_l2reg.h5        train_pivot_56789_to_10_new.pickle
model_nn_10.h5                    train_pivot_6789_to_11_new.pickle
model_nn_10_whole.h5              train_pivot_xgb_time1_44fea.csv
old_submission/                   train_pivot_xgb_time1.csv
origin/                           train_pivot_xgb_time2_38fea.csv
pivot_test.pickle                 train_pivot_xgb_time2.csv
pivot_train_with_nan.pickle

In [4]:
dtypes = {'agen_for_log_de':'float32',
        'ruta_for_log_de':'float32',
        'cliente_for_log_de':'float32',
        'producto_for_log_de':'float32',
        'agen_ruta_for_log_de':'float32',
        'agen_cliente_for_log_de':'float32',
        'agen_producto_for_log_de':'float32',
        'ruta_cliente_for_log_de':'float32',
        'ruta_producto_for_log_de':'float32',
        'cliente_producto_for_log_de':'float32',
        'cliente_for_log_sum':'float32',
        'corr':'float32',
        't_min_1':'float32',
        't_min_2':'float32',
        't_min_3':'float32',
        't_min_4':'float32',
        't_min_5':'float32',
        't1_min_t2':'float32',
        't1_min_t3':'float32',
        't1_min_t4':'float32',
        't1_min_t5':'float32',
        't2_min_t3':'float32',
        't2_min_t4':'float32',
        't2_min_t5':'float32',
        't3_min_t4':'float32',
        't3_min_t5':'float32',
        't4_min_t5':'float32',
        'LR_prod':'float32',
        'LR_prod_corr':'float32',
        'target':'float32',
        't_m_5_cum':'float32',
        't_m_4_cum' :'float32',
        't_m_3_cum':'float32',
        't_m_2_cum':'float32',
        't_m_1_cum':'float32',
        'NombreCliente':'int32',
        'weight':'float32',
        'weight_per_piece':'float32',
        'pieces':'float32'}

In [2]:
predictors_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces']

In [3]:
predictors_10_target = predictors_10 + ['target']

In [7]:
# column-wise z-score: subtract the column mean, divide by the population std
f = lambda x: (x - x.mean()) / x.std(ddof=0)
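
A quick sanity check of the z-score on toy data (illustrative only, not part of the pipeline):

In [ ]:
demo = pd.DataFrame({'a': [1., 2., 3.], 'b': [10., 20., 30.]})
print demo.apply(f, axis=0)   # each column comes out with mean 0 and std 1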

In [8]:
train_pivot_xgb_time1 = pd.read_csv('train_pivot_xgb_time1.csv',dtype=dtypes,usecols = predictors_10_target)
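
With every feature cast to float32, the frame should occupy roughly half the memory of the float64 default. A quick illustrative check:

In [ ]:
print '%.1f MB' % (train_pivot_xgb_time1.memory_usage().sum() / 1024.0 ** 2)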

In [9]:
train_pivot_xgb_time1.columns.values


Out[9]:
array(['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 'target', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces'], dtype=object)

In [10]:
train_pivot_56789_to_10 = pd.read_pickle('train_pivot_56789_to_10_new.pickle')

In [11]:
train_pivot_56789_to_10.columns.values


Out[11]:
array(['Cliente_ID', 'Producto_ID', 'id', 'Semana', 'Agencia_ID',
       'Canal_ID', 'Ruta_SAK', 'agen_for_log_de', 'ruta_for_log_de',
       'cliente_for_log_de', 'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum', 't_m_2_cum',
       't_m_1_cum', 'NombreCliente', 'weight', 'weight_per_piece', 'pieces'], dtype=object)

In [12]:
def normalize_dataset_10(train_dataset, test_dataset):
    # work on copies so the assignments below do not trigger SettingWithCopyWarning
    train_dataset_normalize = train_dataset[predictors_10].copy()
    train_dataset_normalize['label'] = 0

    test_dataset_normalize = test_dataset[predictors_10].copy()
    test_dataset_normalize['label'] = 1

    # z-score train and test together so both share the same per-column scale
    whole_dataset = pd.concat([train_dataset_normalize, test_dataset_normalize], copy=False)
    whole_dataset_normalize = whole_dataset[predictors_10].apply(f, axis=0)

    # split back using the untouched 0/1 label column
    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset.label == 0].copy()
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset.label == 1].copy()

    # carry the target through unnormalized (indices still align with train_dataset)
    train_dataset_normalize['target'] = train_dataset['target']

    return train_dataset_normalize, test_dataset_normalize
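
Normalizing train and test jointly guarantees both frames end up on the same per-column scale; the temporary label column only marks which rows came from which frame through the concat and never enters the normalization.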

In [13]:
train_dataset_10_normalize, test_dataset_10_normalize = normalize_dataset_10(train_pivot_xgb_time1,
                                                                          train_pivot_56789_to_10)



In [15]:
train_dataset_10_normalize.to_csv('train_dataset_10_normalize.csv')
test_dataset_10_normalize.to_csv('test_dataset_10_normalize.csv')

In [5]:
train_dataset_10_normalize = pd.read_csv('train_dataset_10_normalize.csv', index_col=0)

In [5]:
train_dataset_10_normalize.shape


Out[5]:
(20768652, 39)

Prepare the stack-model training data (10% sample per round, 40 bagging): each round fits a forest on a random sample of the training set and predicts all rows; the sampled rows are then masked to NaN so only out-of-sample predictions reach the stack model.



In [4]:
# from sklearn.externals import joblib

# for i in range(20):
#     train_dataset_10_normalize.fillna(-99,inplace = True)
#     train_dataset_10_normalize.reset_index(drop = True, inplace = True)

#     train_dataset_10_normalize_sample = train_dataset_10_normalize[predictors_10_target].sample(2000000)

#     train_label_10 = train_dataset_10_normalize_sample['target']
#     train_feature_10 = train_dataset_10_normalize_sample.drop(['target'],axis = 1)

#     gc.collect()

#     clf = RandomForestRegressor(n_estimators=1400,
#                                  n_jobs = 11,
#                                  max_depth = 22,
#                                  max_features = 'log2',
#                                  bootstrap = True)

#     clf.fit(train_feature_10,train_label_10)
#     print 'model already fitted'
    
#     # save the model to disk
#     filename = 'RF'+str(i)+'.model'
#     joblib.dump(clf, filename)
    
# print 'finished'

In [ ]:
# # load the bagged forests saved above and collect their predictions
# submission_10 = pd.DataFrame()
# for i in range(20):
#     clf = joblib.load('RF' + str(i) + '.model')
#     submission_10['predict_' + str(i)] = clf.predict(train_dataset_10_normalize[predictors_10])

In [ ]:
# requires train_dataset_10_normalize to be loaded first (see the read_csv cell above)
train_dataset_10_normalize.fillna(-99, inplace=True)
train_dataset_10_normalize.reset_index(drop=True, inplace=True)

gc.collect()
submission_10 = pd.DataFrame()

for i in range(20):
    train_dataset_10_normalize_sample = train_dataset_10_normalize[predictors_10_target].sample(20000)

    train_label_10 = train_dataset_10_normalize_sample['target']
    train_feature_10 = train_dataset_10_normalize_sample.drop(['target'], axis=1)

    gc.collect()

    clf = RandomForestRegressor(n_estimators=40,
                                n_jobs=1,
                                max_depth=6,
                                max_features='log2',
                                bootstrap=False,
                                verbose=1)

    clf.fit(train_feature_10, train_label_10)

    submission_10['predict_' + str(i)] = clf.predict(train_dataset_10_normalize[predictors_10])
    # mask this round's own training rows so only out-of-sample predictions survive
    submission_10.loc[train_dataset_10_normalize_sample.index, 'predict_' + str(i)] = np.nan
    print submission_10.head()
    print clf.score(train_dataset_10_normalize[predictors_10], train_dataset_10_normalize['target'])
    print str(i) + '__predicting finished!'
    gc.collect()

print 'finished'


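To turn the bagged columns into a single out-of-sample feature for the stack model, the NaN-masked entries can simply be skipped when averaging. A minimal sketch (the output filename is an assumption; the actual stacking lives in 6_stack_model.ipynb):

In [ ]:
# average across the 20 prediction columns, ignoring each round's masked rows
rf_oof = submission_10.mean(axis=1, skipna=True)
rf_oof.to_csv('rf_oof_feature.csv')   # assumed filename for the stack model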

In [18]:
# # grid search over the forest hyper-parameters
# # svr_rbf = SVR(kernel='rbf', C=1e3, epsilon=0.1, gamma=0.1)
# random_forest_regressor = RandomForestRegressor(n_jobs = 5,
#                                                 verbose = True,
#                                                 bootstrap = True)

# # use a full grid over the main forest parameters
# param_grid = {"n_estimators": [1400, 1600],
#               "max_depth": [12, 20, 25],
#               "max_features": ['log2', 'sqrt']}
# #               "min_samples_leaf": [5, 10]}
# #               "min_samples_split": [10, 15, 100],

# grid = grid_search.GridSearchCV(random_forest_regressor, param_grid=param_grid)
# grid_result = grid.fit(train_nn_time1, label_nn_time1)

# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# for params, mean_score, scores in grid_result.grid_scores_:
#     print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))
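
If this search were revived, running it on a modest random sample would keep it tractable on the 20M-row frame. A sketch using the deprecated sklearn.grid_search API imported above (the sample size and scoring choice are illustrative assumptions):

In [ ]:
# illustrative: grid-search on a 100k-row sample rather than the full frame
sample = train_dataset_10_normalize[predictors_10_target].sample(100000)
grid = grid_search.GridSearchCV(
    RandomForestRegressor(n_jobs=5, bootstrap=True),
    param_grid={'n_estimators': [1400, 1600],
                'max_depth': [12, 20, 25],
                'max_features': ['log2', 'sqrt']},
    scoring='mean_squared_error')
grid.fit(sample.drop(['target'], axis=1), sample['target'])
print grid.best_params_, grid.best_score_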

In [ ]:
# # evaluate model with standardized dataset
# # numpy.random.seed(seed)
seed = 42

# # parameter for svr:
# # C = penalty before 'square loss', the larger the C, the more bias and less variance


train_dataset_10_normalize.fillna(-99,inplace = True)
train_dataset_10_normalize.reset_index(drop = True, inplace = True)

# train_dataset_10_normalize_sample = train_dataset_10_normalize[predictors_10_target]

train_label_10 = train_dataset_10_normalize['target']
train_feature_10 = train_dataset_10_normalize.drop(['target'],axis = 1)
# # gamma means parameter before(in) gussian kernel, the larger the gamma, the larger the bias and less variance
clf = RandomForestRegressor(n_estimators=1400,
                             n_jobs = 11,
                             max_depth = 22,
                             max_features = 'log2',
                             bootstrap = True)



kfold = KFold(n=len(train_label_10), n_folds=5, random_state=seed)
results = cross_val_score(clf,train_feature_10, train_label_10,scoring='mean_squared_error' ,cv=kfold,verbose = 3)
print results
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))


[CV] no parameters to be set .........................................

In [ ]: