In [1]:
!pip install lightgbm


Requirement already satisfied: lightgbm in /home/nbuser/anaconda3_420/lib/python3.5/site-packages
Requirement already satisfied: scipy in /home/nbuser/anaconda3_420/lib/python3.5/site-packages (from lightgbm)
Requirement already satisfied: numpy in /home/nbuser/anaconda3_420/lib/python3.5/site-packages (from lightgbm)
Requirement already satisfied: scikit-learn in /home/nbuser/anaconda3_420/lib/python3.5/site-packages (from lightgbm)

In [3]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-0.7.post3.tar.gz (450kB)
    100% |################################| 460kB 1.9MB/s eta 0:00:01
Requirement already satisfied: numpy in /home/nbuser/anaconda3_420/lib/python3.5/site-packages (from xgboost)
Requirement already satisfied: scipy in /home/nbuser/anaconda3_420/lib/python3.5/site-packages (from xgboost)
Building wheels for collected packages: xgboost
  Running setup.py bdist_wheel for xgboost ... done
  Stored in directory: /home/nbuser/.cache/pip/wheels/ca/b3/02/d44d5e12c5c1eecff4a822555bac96b182551cd5e13c4795f6
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.7.post3

In [7]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [5]:
import zipfile
archive = zipfile.ZipFile('test.csv.zip', 'r')
# note: parse_dates=True only tries to parse the index, so 'date' is still a string here
test = pd.read_csv(archive.open('test.csv'), sep=";", decimal=",", parse_dates=True)

In [6]:
archive = zipfile.ZipFile('train.csv.zip', 'r')
train = pd.read_csv(archive.open('train.csv'), sep=";", decimal=",",parse_dates=True)

In [8]:
import datetime
# dates arrive as 'YYYY-MM-DD' strings; rebuild them as datetime objects
test.date = test.date.str.split('-').apply(lambda x: datetime.datetime(int(x[0]), int(x[1]), int(x[2])))
train.date = train.date.str.split('-').apply(lambda x: datetime.datetime(int(x[0]), int(x[1]), int(x[2])))
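
The same conversion is a single vectorized call, assuming the column really holds ISO 'YYYY-MM-DD' strings:

test.date = pd.to_datetime(test.date, format='%Y-%m-%d')
train.date = pd.to_datetime(train.date, format='%Y-%m-%d')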

In [9]:
train['dayofweek'] = train.date.dt.dayofweek
test['dayofweek'] = test.date.dt.dayofweek
train['quarter'] = train.date.dt.quarter
test['quarter'] = test.date.dt.quarter
train['week'] = train.date.dt.week
test['week'] = test.date.dt.week
train['month'] = train.date.dt.month
test['month'] = test.date.dt.month
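
In pandas 1.1+ Series.dt.week is deprecated; the ISO week number comes from isocalendar() instead:

train['week'] = train.date.dt.isocalendar().week.astype(int)
test['week'] = test.date.dt.isocalendar().week.astype(int)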

In [10]:
## some more feature engineering: pull quantities (grams, multipack count, millilitres) out of the product name
train["qteG"] = train.article_nom.str.extract(r'(\d+)G', expand=True).fillna(0).astype(int)
test["qteG"] = test.article_nom.str.extract(r'(\d+)G', expand=True).fillna(0).astype(int)
train['qteX'] = train.article_nom.str.extract(r'X ?(\d)', expand=True).fillna(0).astype(int)
test['qteX'] = test.article_nom.str.extract(r'X ?(\d)', expand=True).fillna(0).astype(int)
train['qteMl'] = train.article_nom.str.extract(r'(\d+) ?Ml', expand=True).fillna(0).astype(int)
test['qteMl'] = test.article_nom.str.extract(r'(\d+) ?Ml', expand=True).fillna(0).astype(int)
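
To make the patterns concrete, here is what they capture on a few invented product names (the real article_nom values are not shown in this notebook):

s = pd.Series(['YAOURT NATURE X 4', 'CHIPS 150G', 'SODA 330 Ml'])
print(s.str.extract(r'(\d+)G', expand=True).fillna(0).astype(int)[0].tolist())    # [0, 150, 0]
print(s.str.extract(r'X ?(\d)', expand=True).fillna(0).astype(int)[0].tolist())   # [4, 0, 0]
print(s.str.extract(r'(\d+) ?Ml', expand=True).fillna(0).astype(int)[0].tolist()) # [0, 0, 330]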

In [11]:
ytrain = train.set_index('id').qte_article_vendue

In [12]:
cat_features = ['implant', 'article_nom']

In [13]:
from sklearn import preprocessing
label_encoders = {}
for cat in cat_features:
    label_encoders[cat] = preprocessing.LabelEncoder()

In [14]:
for cat, le in label_encoders.items():
    cat_str = cat + '_label'
    train[cat_str] = le.fit_transform(train[cat])
    # transform (not fit_transform) keeps train/test codes consistent,
    # but raises ValueError on labels absent from train
    test[cat_str] = le.transform(test[cat])
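
That works here because every implant and article_nom value in test also appears in train (the cell ran without error). A safer variant, sketched below, fits each encoder on the union of both sets:

for cat, le in label_encoders.items():
    le.fit(pd.concat([train[cat], test[cat]]))
    train[cat + '_label'] = le.transform(train[cat])
    test[cat + '_label'] = le.transform(test[cat])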

In [12]:
##aggregates
#data = pd.concat([train.set_index('id'),test.set_index('id')],axis=0)

In [1]:
#data.groupby(['article_nom','date','implant']).qte_article_vendue.rolling(2).mean().reset_index()

In [15]:
trainingset = train.set_index('id').select_dtypes(include=['float64','int64']).drop('qte_article_vendue', axis=1)
testset = test.set_index('id').select_dtypes(include=['float64','int64'])

In [16]:
# Feature Selection
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
regressor = ExtraTreesRegressor().fit(trainingset, ytrain)
#lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(trainingset, ytrain)
model = SelectFromModel(regressor, prefit=True)
X = model.transform(trainingset)
Xpredict = model.transform(testset)
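
model.transform returns bare numpy arrays. A small sketch (not part of the original run) keeps the surviving columns labeled for inspection; note that the models below are actually fit on the full trainingset, so X and Xpredict only serve to identify the selected features:

selected = trainingset.columns[model.get_support()]
X_df = pd.DataFrame(X, index=trainingset.index, columns=selected)
Xpredict_df = pd.DataFrame(Xpredict, index=testset.index, columns=selected)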

Modeling


In [17]:
trainingset.columns[model.get_support()]


Out[17]:
Index(['vente_j_7', 'vente_j_8_14', 'vente_cat5_j_7', 'vente_cat5_j_8_14',
       'vente_cat4_j_7', 'vente_cat4_j_8_14', 'dayofweek', 'qteG',
       'implant_label', 'article_nom_label'],
      dtype='object')

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trainingset, ytrain, test_size=0.05, random_state=42)
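
One caveat with a random 95/5 split on daily sales: rows from the same day land on both sides, which flatters the validation score. A stricter time-based holdout could look like this sketch (assuming every row carries its date; variables are suffixed 2 to avoid clobbering the split above):

cutoff = train.date.sort_values().iloc[int(len(train) * 0.95)]  # last ~5% of the timeline
mask = train.set_index('id').date <= cutoff
X_train2, X_test2 = trainingset[mask], trainingset[~mask]
y_train2, y_test2 = ytrain[mask], ytrain[~mask]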

In [30]:
print('Start training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=60,
                        learning_rate=0.1,
                        n_estimators=150, random_state=42)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='rmse',
        early_stopping_rounds=5)

print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# feature importances
print('Feature importances:', list(gbm.feature_importances_))


Start training...
[1]	valid_0's rmse: 0.824954
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's rmse: 0.794949
[3]	valid_0's rmse: 0.769341
[4]	valid_0's rmse: 0.748389
[5]	valid_0's rmse: 0.729549
... (rounds 6-126 elided; validation RMSE falls steadily to ~0.6005) ...
[127]	valid_0's rmse: 0.600321
[128]	valid_0's rmse: 0.600163
[129]	valid_0's rmse: 0.600242
[130]	valid_0's rmse: 0.600336
[131]	valid_0's rmse: 0.600287
[132]	valid_0's rmse: 0.600224
[133]	valid_0's rmse: 0.600217
Early stopping, best iteration is:
[128]	valid_0's rmse: 0.600163
Start predicting...
The rmse of prediction is: 0.600163386271
Feature importances: [207, 186, 89, 250, 116, 67, 186, 226, 127, 34, 178, 160, 91, 60, 144, 172, 75, 48, 193, 62, 19, 2, 25, 1, 29, 102, 26, 5, 13, 13, 19, 679, 1032, 364, 264, 266, 373, 299, 13, 451, 43, 304, 14, 65, 167, 293]
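
GridSearchCV is imported at the top but never used. For completeness, a minimal tuning sketch over the same LGBMRegressor might look like this (the grid is illustrative, not from the original run):

param_grid = {'num_leaves': [31, 60], 'learning_rate': [0.05, 0.1], 'n_estimators': [150, 300]}
search = GridSearchCV(lgb.LGBMRegressor(objective='regression', random_state=42),
                      param_grid, scoring='neg_mean_squared_error', cv=3)
search.fit(X_train, y_train)
print(search.best_params_, (-search.best_score_) ** 0.5)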

In [31]:
import numpy as np
for i in np.argsort(gbm.feature_importances_)[::-1][:10]:
    print(trainingset.columns[i])


vente_j_8_14
vente_j_7
week
vente_cat4_j_8_14
vente_cat5_j_7
qteG
dayofweek
article_nom_label
vente_cat4_j_7
vente_cat5_j_8_14

In [25]:
help(xgbReg.fit)


Help on method fit in module xgboost.sklearn:

fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None) method of xgboost.sklearn.XGBRegressor instance
    Fit the gradient boosting model
    
    Parameters
    ----------
    X : array_like
        Feature matrix
    y : array_like
        Labels
    sample_weight : array_like
        instance weights
    eval_set : list, optional
        A list of (X, y) tuple pairs to use as a validation set for
        early-stopping
    eval_metric : str, callable, optional
        If a str, should be a built-in evaluation metric to use. See
        doc/parameter.md. If callable, a custom evaluation metric. The call
        signature is func(y_predicted, y_true) where y_true will be a
        DMatrix object such that you may need to call the get_label
        method. It must return a str, value pair where the str is a name
        for the evaluation and value is the value of the evaluation
        function. This objective is always minimized.
    early_stopping_rounds : int
        Activates early stopping. Validation error needs to decrease at
        least every <early_stopping_rounds> round(s) to continue training.
        Requires at least one item in evals.  If there's more than one,
        will use the last. Returns the model from the last iteration
        (not the best one). If early stopping occurs, the model will
        have three additional fields: bst.best_score, bst.best_iteration
        and bst.best_ntree_limit.
        (Use bst.best_ntree_limit to get the correct value if num_parallel_tree
        and/or num_class appears in the parameters)
    verbose : bool
        If `verbose` and an evaluation set is used, writes the evaluation
        metric measured on the validation set to stderr.
    xgb_model : str
        file name of stored xgb model or 'Booster' instance Xgb model to be
        loaded before training (allows training continuation).


In [27]:
xgbReg = xgb.XGBRegressor(nthread=-1, min_child_weight=4, subsample=0.9, max_depth=5)
xgbReg.fit(X_train, y_train,
        eval_metric='rmse',
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=5)

print('Start predicting...')
# predict
y_pred = xgbReg.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)


[0]	validation_0-rmse:0.826859
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:0.798098
[2]	validation_0-rmse:0.772668
[3]	validation_0-rmse:0.751844
[4]	validation_0-rmse:0.734243
... (rounds 5-95 elided; validation RMSE falls steadily to ~0.615) ...
[96]	validation_0-rmse:0.614916
[97]	validation_0-rmse:0.614819
[98]	validation_0-rmse:0.614784
[99]	validation_0-rmse:0.614503
Start predicting...
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-27-cfdb83a89473> in <module>()
      7 print('Start predicting...')
      8 # predict
----> 9 y_pred = xgbReg.predict(X_test, num_iteration=gbm.best_iteration_)
     10 # eval
     11 print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

TypeError: predict() got an unexpected keyword argument 'num_iteration'
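
The mix-up: num_iteration is LightGBM's predict argument. xgboost's sklearn wrapper takes ntree_limit instead, as the help text above hints. A version-dependent sketch (assuming xgboost 0.7, where fit with early_stopping_rounds records best_ntree_limit):

y_pred2 = xgbReg.predict(X_test, ntree_limit=xgbReg.best_ntree_limit)

Here no early stop actually fired (all 100 default rounds ran and round 99 was best), so the plain predict in the next cell yields the same numbers.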

In [33]:
print('Start predicting...')
# predict
y_pred2 = xgbReg.predict(X_test)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred2) ** 0.5)

# feature importances
print('Feature importances:', list(xgbReg.feature_importances_))

import numpy as np
for i in np.argsort(xgbReg.feature_importances_)[::-1][:10]:
    print(trainingset.columns[i])


Start predicting...
The rmse of prediction is: 0.614503423079
Feature importances: [0.016853932, 0.015449438, 0.0066713481, 0.028441012, 0.014396068, 0.0056179776, 0.018258426, 0.024227528, 0.019662922, 0.0070224721, 0.016151685, 0.015098315, 0.0056179776, 0.0049157306, 0.013693821, 0.020365169, 0.0014044944, 0.0056179776, 0.0098314611, 0.022120787, 0.001755618, 0.0, 0.0052668541, 0.0010533708, 0.0045646066, 0.025632022, 0.0056179776, 0.0, 0.0, 0.0024578653, 0.0049157306, 0.11341292, 0.18820225, 0.04985955, 0.031601124, 0.023174157, 0.051966291, 0.047752809, 0.0038623596, 0.066011235, 0.0070224721, 0.024578651, 0.0003511236, 0.0080758426, 0.024929775, 0.036516853]
vente_j_8_14
vente_j_7
week
vente_cat4_j_8_14
vente_cat5_j_7
dayofweek
article_nom_label
vente_cat5_j_8_14
t_9h_rouen
retour_zone_1

In [34]:
print('The rmse of prediction is:', mean_squared_error(y_test, 0.5*(y_pred+y_pred2)) ** 0.5)


The rmse of prediction is: 0.603037683074
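
The 50/50 average is a sensible default; with the holdout at hand, the blend weight can also be scanned directly (a quick sketch using the predictions already in memory):

import numpy as np
weights = np.linspace(0, 1, 101)
rmses = [mean_squared_error(y_test, w * y_pred + (1 - w) * y_pred2) ** 0.5 for w in weights]
print(weights[int(np.argmin(rmses))], min(rmses))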

In [35]:
y_sub = gbm.predict(testset, num_iteration=gbm.best_iteration_)

In [36]:
y_sub2 = xgbReg.predict(testset)

In [37]:
pd.DataFrame(0.5*(y_sub + y_sub2), index=testset.index, columns=['quantite_vendue']).to_csv('sub.csv', sep=';', decimal=',')

In [ ]: