In [1]:
# Models considered for this stage:
# - sklearn.ensemble.RandomForestRegressor
# - sklearn.ensemble.ExtraTreesRegressor
# - sklearn.svm.SVR
In [1]:
import numpy as np
import pandas as pd
# sklearn.cross_validation and sklearn.grid_search were removed in sklearn 0.20;
# their contents now live in sklearn.model_selection. RandomForestRegressor should
# be imported from the public sklearn.ensemble package, not the private .forest module.
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import gc
In [3]:
%ls
In [4]:
# Column dtypes for memory-efficient CSV loading (float32 halves memory vs. the float64 default).
dtypes = {'agen_for_log_de':'float32',
'ruta_for_log_de':'float32',
'cliente_for_log_de':'float32',
'producto_for_log_de':'float32',
'agen_ruta_for_log_de':'float32',
'agen_cliente_for_log_de':'float32',
'agen_producto_for_log_de':'float32',
'ruta_cliente_for_log_de':'float32',
'ruta_producto_for_log_de':"float32",
'cliente_producto_for_log_de':'float32',
'cliente_for_log_sum':'float32',
'corr':'float32',
't_min_1':'float32',
't_min_2':'float32',
't_min_3':'float32',
't_min_4':'float32',
't_min_5':'float32',
't1_min_t2':'float32',
't1_min_t3':'float32',
't1_min_t4':'float32',
't1_min_t5':'float32',
't2_min_t3':'float32',
't2_min_t4':'float32',
't2_min_t5':'float32',
't3_min_t4':'float32',
't3_min_t5':'float32',
't4_min_t5':'float32',
'LR_prod':'float32',
'LR_prod_corr':'float32',
'target':'float32',
't_m_5_cum':'float32',
't_m_4_cum' :'float32',
't_m_3_cum':'float32',
't_m_2_cum':'float32',
't_m_1_cum':'float32',
'NombreCliente':'int32',
'weight':'float32',
'weight_per_piece':'float32',
'pieces':'float32'}
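A quick illustrative sketch (toy data, not from the original run) of why this dtype map matters: down-casting to float32 halves per-column memory relative to pandas' float64 default.
In [ ]:
# Hypothetical demo: compare memory of float64 (pandas default) vs. float32 columns.
demo = pd.DataFrame(np.random.rand(1000000, 2), columns=['a', 'b'])
print(demo.memory_usage(deep=True).sum())                    # ~16 MB for float64
print(demo.astype('float32').memory_usage(deep=True).sum())  # ~8 MB for float32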
In [2]:
# Feature columns fed to the regressors.
predictors_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
'producto_for_log_de', 'agen_ruta_for_log_de',
'agen_cliente_for_log_de', 'agen_producto_for_log_de',
'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
'weight_per_piece', 'pieces']
In [3]:
# The same feature list plus the target column, used when sampling training rows.
predictors_10_target = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
'producto_for_log_de', 'agen_ruta_for_log_de',
'agen_cliente_for_log_de', 'agen_producto_for_log_de',
'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
'weight_per_piece', 'pieces','target']
In [7]:
# Column-wise z-score: zero mean, unit (population, ddof=0) standard deviation.
f = lambda x: (x - x.mean()) / x.std(ddof=0)
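A minimal sketch (toy data, not from the original pipeline) of what `f` does when pushed through `DataFrame.apply`:
In [ ]:
# Hypothetical demo of the z-score lambda on a 3-row frame.
toy = pd.DataFrame({'x': [1.0, 2.0, 3.0]})
print(toy.apply(f, axis=0))  # -> [-1.2247, 0.0, 1.2247]: mean 0, population std 1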
In [8]:
train_pivot_xgb_time1 = pd.read_csv('train_pivot_xgb_time1.csv', dtype=dtypes, usecols=predictors_10_target)
In [9]:
train_pivot_xgb_time1.columns.values
In [10]:
train_pivot_56789_to_10 = pd.read_pickle('train_pivot_56789_to_10_new.pickle')
In [11]:
train_pivot_56789_to_10.columns.values
In [12]:
def normalize_dataset_10(train_dataset, test_dataset):
    # Z-score train and test jointly so both end up on the same scale.
    # A temporary 'label' column (0 = train, 1 = test) records row origin
    # so the combined frame can be split apart again afterwards.
    train_dataset_normalize = train_dataset[predictors_10].copy()
    train_dataset_normalize['label'] = 0
    test_dataset_normalize = test_dataset[predictors_10].copy()
    test_dataset_normalize['label'] = 1
    whole_dataset = pd.concat([train_dataset_normalize, test_dataset_normalize],
                              ignore_index=True, copy=False)
    # Normalize only the predictors; the 'label' marker must stay untouched.
    whole_dataset_normalize = whole_dataset[predictors_10].apply(f, axis=0)
    train_dataset_normalize = whole_dataset_normalize.loc[whole_dataset.label == 0].copy()
    test_dataset_normalize = whole_dataset_normalize.loc[whole_dataset.label == 1].copy()
    train_dataset_normalize['target'] = train_dataset['target'].values
    return train_dataset_normalize, test_dataset_normalize
In [13]:
train_dataset_10_normalize, test_dataset_10_normalize = normalize_dataset_10(
    train_pivot_xgb_time1, train_pivot_56789_to_10)
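A quick sanity check (illustrative, not from the original run): after joint z-scoring, every predictor column should have mean close to 0 and population std close to 1 across train and test combined.
In [ ]:
# Hypothetical check: per-column mean/std of the normalized predictors.
combined = pd.concat([train_dataset_10_normalize[predictors_10],
                      test_dataset_10_normalize[predictors_10]])
print(combined.mean().abs().max())   # expect ~0
print(combined.std(ddof=0).head())   # expect ~1 per column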
In [15]:
train_dataset_10_normalize.to_csv('train_dataset_10_normalize.csv')
test_dataset_10_normalize.to_csv('test_dataset_10_normalize.csv')
In [5]:
train_dataset_10_normalize = pd.read_csv('train_dataset_10_normalize.csv', index_col=0)
In [5]:
train_dataset_10_normalize.shape
In [4]:
# from sklearn.externals import joblib   # deprecated; use `import joblib` on modern sklearn
# for i in range(20):
#     train_dataset_10_normalize.fillna(-99, inplace=True)
#     train_dataset_10_normalize.reset_index(drop=True, inplace=True)
#     train_dataset_10_normalize_sample = train_dataset_10_normalize[predictors_10_target].sample(2000000)
#     train_label_10 = train_dataset_10_normalize_sample['target']
#     train_feature_10 = train_dataset_10_normalize_sample.drop(['target'], axis=1)
#     gc.collect()
#     clf = RandomForestRegressor(n_estimators=1400,
#                                 n_jobs=11,
#                                 max_depth=22,
#                                 max_features='log2',
#                                 bootstrap=True)
#     clf.fit(train_feature_10, train_label_10)
#     print('model already fitted')
#     # save the model to disk
#     filename = 'RF' + str(i) + '.model'
#     joblib.dump(clf, filename)
# print('finished')
In [ ]:
# submission_10 = pd.DataFrame()
# i = 0
# clf = joblib.load('RF' + str(i) + '.model')   # load a model saved by the loop above
# submission_10['predict_' + str(i)] = clf.predict(train_dataset_10_normalize[predictors_10])
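A sketch of the natural extension (assuming the 20 forests were persisted as 'RF0.model' through 'RF19.model' by the commented training loop above): load every saved model and collect one prediction column per model.
In [ ]:
import joblib  # standalone joblib; sklearn.externals.joblib is deprecated

# Hypothetical: gather predictions from all 20 persisted forests.
submission_10 = pd.DataFrame()
for i in range(20):
    clf = joblib.load('RF' + str(i) + '.model')
    submission_10['predict_' + str(i)] = clf.predict(train_dataset_10_normalize[predictors_10])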
In [1]:
train_dataset_10_normalize.fillna(-99, inplace=True)
train_dataset_10_normalize.reset_index(drop=True, inplace=True)
gc.collect()

# Bag 20 small forests, each fit on a fresh 20k-row sample. Each model's
# predictions on its own training rows are blanked to NaN so that a later
# row-wise average is (approximately) out-of-sample.
submission_10 = pd.DataFrame()
for i in range(20):
    train_dataset_10_normalize_sample = train_dataset_10_normalize[predictors_10_target].sample(20000)
    train_label_10 = train_dataset_10_normalize_sample['target']
    train_feature_10 = train_dataset_10_normalize_sample.drop(['target'], axis=1)
    gc.collect()
    clf = RandomForestRegressor(n_estimators=40,
                                n_jobs=1,
                                max_depth=6,
                                max_features='log2',
                                bootstrap=False,
                                verbose=1)
    clf.fit(train_feature_10, train_label_10)
    submission_10['predict_' + str(i)] = clf.predict(train_dataset_10_normalize[predictors_10])
    print(submission_10.head())
    # Mask this model's own training rows via frame-level .loc (chained
    # `df[col].loc[idx] = ...` indexing may silently assign to a copy).
    submission_10.loc[train_dataset_10_normalize_sample.index, 'predict_' + str(i)] = np.nan
    print(clf.score(train_dataset_10_normalize[predictors_10], train_dataset_10_normalize['target']))
    print(str(i) + '__predicting finished!')
    gc.collect()
print('finished')
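To turn the 20 masked columns into a single prediction, average row-wise: `mean(axis=1)` skips NaNs by default, so each row is averaged only over the models that did not train on it (a sketch, assuming this is the intended use of the NaN masking above):
In [ ]:
# Row-wise mean ignores the NaN'd in-sample entries (skipna=True is the default).
bagged_prediction_10 = submission_10.mean(axis=1)
print(bagged_prediction_10.head())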
In [18]:
# # grid search over random-forest hyperparameters
# # create model
# # svr_rbf = SVR(kernel='rbf', C=1e3, epsilon=0.1, gamma=0.1)
# random_forest_regressor = RandomForestRegressor(n_jobs=5,
#                                                 verbose=True,
#                                                 # max_depth=5,
#                                                 bootstrap=True)
# # use a full grid over all parameters
# param_grid = {"n_estimators": [1400, 1600],
#               "max_depth": [12, 20, 25],
#               "max_features": ['log2', 'sqrt']}
# # further candidates:
# # "min_samples_split": [10, 15, 100],
# # "min_samples_leaf": [5, 10],
# # and for the SVR alternative:
# # gamma = np.array([50, 100, 150])
# # degree = np.array([5, 10, 20])
# # GridSearchCV now lives in sklearn.model_selection (imported above),
# # and grid_scores_ was replaced by the cv_results_ dict in sklearn 0.18+.
# grid = GridSearchCV(random_forest_regressor, param_grid=param_grid)
# grid_result = grid.fit(train_nn_time1, label_nn_time1)
# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# for mean, std, params in zip(grid_result.cv_results_['mean_test_score'],
#                              grid_result.cv_results_['std_test_score'],
#                              grid_result.cv_results_['params']):
#     print("%f (%f) with: %r" % (mean, std, params))
In [ ]:
# Evaluate the full-size forest with 5-fold cross-validation on the normalized data.
seed = 42
# SVR parameter notes (for the alternative model listed at the top):
# C penalizes training errors: larger C -> less regularization, lower bias, higher variance.
# gamma sets the RBF (Gaussian) kernel width: larger gamma -> narrower kernels,
# lower bias, higher variance.
train_dataset_10_normalize.fillna(-99, inplace=True)
train_dataset_10_normalize.reset_index(drop=True, inplace=True)
train_label_10 = train_dataset_10_normalize['target']
train_feature_10 = train_dataset_10_normalize.drop(['target'], axis=1)
clf = RandomForestRegressor(n_estimators=1400,
                            n_jobs=11,
                            max_depth=22,
                            max_features='log2',
                            bootstrap=True)
# KFold's signature changed in sklearn 0.18 (n/n_folds -> n_splits; random_state
# only takes effect with shuffle=True), and the MSE scorer is now
# 'neg_mean_squared_error' (negated so that higher is better).
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
results = cross_val_score(clf, train_feature_10, train_label_10,
                          scoring='neg_mean_squared_error', cv=kfold, verbose=3)
print(results)
print("Standardized: %.2f (%.2f) MSE" % (-results.mean(), results.std()))