In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold
from sklearn import linear_model
from sklearn import grid_search
import gc

In [3]:
%ls


1_predata.ipynb                   preprocessed_products.csv
3_xgb.ipynb                       ruta_for_cliente_producto.csv
3_xgb_prediction.ipynb            stack_sub/
44fea_bst.model                   submission_10_new.csv
4_keras_nn.ipynb                  submission_11_new.csv
5_random_forest.ipynb             submission_44fea.csv
6_stack_model.ipynb               submission_nn.csv
7_SGD_regressor.ipynb             submission_nn_xgb
8_svm_linearSVR.ipynb             test_dataset_10_normalize.csv
agencia_for_cliente_producto.csv  train_dataset_10_normalize.csv
canal_for_cliente_producto.csv    train_dataset_10_normalize.pickle
model_nn_10_after_l2reg.h5        train_pivot_56789_to_10_44fea.pickle
model_nn_10.h5                    train_pivot_56789_to_10_new.pickle
model_nn_10_whole.h5              train_pivot_6789_to_11_new.pickle
old_submission/                   train_pivot_xgb_time1_44fea.csv
origin/                           train_pivot_xgb_time1.csv
pivot_test.pickle                 train_pivot_xgb_time2_38fea.csv
pivot_train_with_nan.pickle       train_pivot_xgb_time2.csv

In [2]:
predictors_10_target = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces','target']

In [3]:
predictors_10 = ['agen_for_log_de', 'ruta_for_log_de', 'cliente_for_log_de',
       'producto_for_log_de', 'agen_ruta_for_log_de',
       'agen_cliente_for_log_de', 'agen_producto_for_log_de',
       'ruta_cliente_for_log_de', 'ruta_producto_for_log_de',
       'cliente_producto_for_log_de', 'cliente_for_log_sum', 'corr',
       't_min_1', 't_min_2', 't_min_3', 't_min_4', 't_min_5', 't1_min_t2',
       't1_min_t3', 't1_min_t4', 't1_min_t5', 't2_min_t3', 't2_min_t4',
       't2_min_t5', 't3_min_t4', 't3_min_t5', 't4_min_t5', 'LR_prod',
       'LR_prod_corr', 't_m_5_cum', 't_m_4_cum', 't_m_3_cum',
       't_m_2_cum', 't_m_1_cum', 'NombreCliente', 'weight',
       'weight_per_piece', 'pieces']

In [4]:
train_dataset_10_normalize = pd.read_csv('train_dataset_10_normalize.csv',index_col = 0)

In [5]:
train_nn_time1 = train_dataset_10_normalize[predictors_10]
label_nn_time1 = train_dataset_10_normalize['target']

In [6]:
clf = linear_model.SGDRegressor(loss ='squared_loss',
                               penalty = 'l2',
                               alpha = 0.00001,
                               n_iter = 50)

In [7]:
train_nn_time1.fillna(-1,inplace = True)


/usr/local/lib/python2.7/dist-packages/pandas/core/frame.py:2762: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)

In [8]:
kfold = KFold(n=len(train_nn_time1), n_folds=5, random_state=42)
results = cross_val_score(clf,train_nn_time1, label_nn_time1,scoring='mean_squared_error' ,cv=kfold,verbose = 3)


[CV] no parameters to be set .........................................
[CV] ............... no parameters to be set, score=-0.228246 - 4.8min
[CV] no parameters to be set .........................................
[CV] ............... no parameters to be set, score=-0.234660 - 4.8min
[CV] no parameters to be set .........................................
[CV] ............... no parameters to be set, score=-0.239492 - 4.8min
[CV] no parameters to be set .........................................
[CV] ............... no parameters to be set, score=-0.229627 - 4.8min
[CV] no parameters to be set .........................................
[CV] ............... no parameters to be set, score=-0.238932 - 4.8min
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 23.9min finished

In [ ]: