notebook.community

Edit and run



In [23]:

    
%matplotlib inline

import pandas as pd
import numpy as np
import time

from math import sqrt
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

from matplotlib import pyplot as plt



In [45]:

    
RANDOM_STATE = 6578439


def root_mean_square_percentage(labels, predictions):
    """ As defined by competition """
    if len(labels) != len(predictions):
        raise Exception("Labels and predictions must be of same length")
    # Filter pairs where label == 0
    labels, predictions = tuple(
        zip(*filter(lambda x: x[0] != 0, zip(labels, predictions)))
    )
    labels = numpy.array(labels, dtype=float)
    predictions = numpy.array(predictions, dtype=float)
    return sqrt(numpy.power((labels - predictions) / labels, 2.0).sum() / len(labels))


if __name__ == '__main__':
    print "Loading annotated dataset..."
    annotated_df = pandas.read_csv(
        'data/train.csv',
        dtype={
            'StateHoliday': object,
            'Sales': float,
            'Customers': float,
        },
        parse_dates=['Date']
    )
    
    print "Loading stores dataset..."
    stores_df = pandas.read_csv(
        'data/store.csv',
        dtype = {
            "Store": int,
            "DayOfWeek": int,       # {1, 2, 3, 4, 5, 6, 7}
            "Date": object,         # e.g. "2015-07-31"
            "Sales": int,
            "Customers": int,
            "Open": bool,
            "Promo": bool,
            "StateHoliday": str,    # {'0', 'a', 'b', 'c'}
            "SchoolHoliday": bool,
        }
    )
    
    print "Done loading datasets!"









    



Loading annotated dataset...
Loading stores dataset...
Done loading datasets!



In [46]:

    
stores_df.head()









    Out[46]:






  
    
      
      Store
      StoreType
      Assortment
      CompetitionDistance
      CompetitionOpenSinceMonth
      CompetitionOpenSinceYear
      Promo2
      Promo2SinceWeek
      Promo2SinceYear
      PromoInterval
    
  
  
    
      0
      1
      c
      a
      1270
      9
      2008
      0
      NaN
      NaN
      NaN
    
    
      1
      2
      a
      a
      570
      11
      2007
      1
      13
      2010
      Jan,Apr,Jul,Oct
    
    
      2
      3
      a
      a
      14130
      12
      2006
      1
      14
      2011
      Jan,Apr,Jul,Oct
    
    
      3
      4
      c
      c
      620
      9
      2009
      0
      NaN
      NaN
      NaN
    
    
      4
      5
      a
      a
      29910
      4
      2015
      0
      NaN
      NaN
      NaN



In [49]:

    
full_df = pandas.merge(annotated_df, stores_df)



In [52]:

    
annotated_df.head()









    Out[52]:






  
    
      
      Store
      DayOfWeek
      Date
      Sales
      Customers
      Open
      Promo
      StateHoliday
      SchoolHoliday
    
  
  
    
      0
      1
      5
      2015-07-31
      5263
      555
      1
      1
      0
      1
    
    
      1
      2
      5
      2015-07-31
      6064
      625
      1
      1
      0
      1
    
    
      2
      3
      5
      2015-07-31
      8314
      821
      1
      1
      0
      1
    
    
      3
      4
      5
      2015-07-31
      13995
      1498
      1
      1
      0
      1
    
    
      4
      5
      5
      2015-07-31
      4822
      559
      1
      1
      0
      1



In [51]:

    
full_df.head()









    Out[51]:






  
    
      
      Store
      DayOfWeek
      Date
      Sales
      Customers
      Open
      Promo
      StateHoliday
      SchoolHoliday
      StoreType
      Assortment
      CompetitionDistance
      CompetitionOpenSinceMonth
      CompetitionOpenSinceYear
      Promo2
      Promo2SinceWeek
      Promo2SinceYear
      PromoInterval
    
  
  
    
      0
      1
      5
      2015-07-31
      5263
      555
      1
      1
      0
      1
      c
      a
      1270
      9
      2008
      0
      NaN
      NaN
      NaN
    
    
      1
      1
      4
      2015-07-30
      5020
      546
      1
      1
      0
      1
      c
      a
      1270
      9
      2008
      0
      NaN
      NaN
      NaN
    
    
      2
      1
      3
      2015-07-29
      4782
      523
      1
      1
      0
      1
      c
      a
      1270
      9
      2008
      0
      NaN
      NaN
      NaN
    
    
      3
      1
      2
      2015-07-28
      5011
      560
      1
      1
      0
      1
      c
      a
      1270
      9
      2008
      0
      NaN
      NaN
      NaN
    
    
      4
      1
      1
      2015-07-27
      6102
      612
      1
      1
      0
      1
      c
      a
      1270
      9
      2008
      0
      NaN
      NaN
      NaN



In [55]:

    
print "Preparing annotated dataset for sklearn usage..."
annotated_df['StateHoliday'] = LabelEncoder().fit_transform(annotated_df['StateHoliday'])

print "Enriching annotated dataset with extra features..."
annotated_df['DayOfMonth'] = annotated_df['Date'].apply(lambda dt: dt.day)
annotated_df['Month'] = annotated_df['Date'].apply(lambda dt: dt.month)
annotated_df['Year'] = annotated_df['Date'].apply(lambda dt: dt.year)
annotated_df['UnixTimestamp'] = annotated_df['Date'].apply(lambda dt: time.mktime(dt.timetuple()))
annotated_df.drop('Date', axis=1, inplace=True)









    



Preparing annotated dataset for sklearn usage...
Enriching annotated dataset with extra features...



In [56]:

    
print "Splitting train and test sets..."
train_df, test_df = train_test_split(
    annotated_df,
    test_size=0.10,
    random_state=RANDOM_STATE
)

print "Training random forest..."
random_forest = RandomForestRegressor(
    n_jobs=-1,  # Auto selects number of cores
    random_state=RANDOM_STATE,
    max_features="log2",
    n_estimators=10,
).fit(
    X=train_df[train_df.columns.difference(['Sales'])],
    y=train_df['Sales'],
)
print "Feature importances:"
pairs = zip(train_df.columns.difference(['Sales']), random_forest.feature_importances_)
pairs.sort(key=lambda x: -x[1])
for column, importance in pairs:
    print " ", column, importance

print "Testing random forest..."
predictions = random_forest.predict(
    X=test_df[test_df.columns.difference(['Sales'])],
)
print "Root mean square percentage:"
print " ", root_mean_square_percentage(test_df['Sales'], predictions)









    



Splitting train and test sets...
Training random forest...






    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-56-2a762d68d245> in <module>()
     14 ).fit(
     15     X=train_df[train_df.columns.difference(['Sales'])],
---> 16     y=train_df['Sales'],
     17 )
     18 print "Feature importances:"

/Users/jimbijwaard/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in fit(self, X, y, sample_weight)
    193         """
    194         # Validate or convert input data
--> 195         X = check_array(X, dtype=DTYPE, accept_sparse="csc")
    196         if issparse(X):
    197             # Pre-sort indices to avoid that each individual tree of the

/Users/jimbijwaard/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features)
    342             else:
    343                 dtype = None
--> 344         array = np.array(array, dtype=dtype, order=order, copy=copy)
    345         # make sure we actually converted to numeric:
    346         if dtype_numeric and array.dtype.kind == "O":

ValueError: could not convert string to float: d



In [5]:









    Out[5]:





array([ 7815.3,  9163.5,  3195.7, ...,     0. ,     0. ,     0. ])



In [ ]:

    
predictions_df = pandas.DataFrame({'Sales':predictions})



In [ ]:

    
mask = predictions_df.Sales != 0
predictions_df[mask].Sales.hist(bins=100)
print predictions_df.median()
print predictions_df.mean()
print predictions_df.std()



In [43]:

    
mask = train_df.Sales != 0
train_df[mask].Sales.hist(bins=100)
print train_df.median()
print train_df.mean()
print train_df.std()









    



Store                   558
DayOfWeek                 4
Sales                  5744
Customers               609
Open                      1
Promo                     0
StateHoliday              0
SchoolHoliday             0
DayOfMonth               16
Month                     6
Year                   2014
UnixTimestamp    1396389600
dtype: float64
Store            5.582410e+02
DayOfWeek        3.998639e+00
Sales            5.773654e+03
Customers        6.331056e+02
Open             8.300852e-01
Promo            3.813715e-01
StateHoliday     4.524581e-02
SchoolHoliday    1.786992e-01
DayOfMonth       1.569863e+01
Month            5.847853e+00
Year             2.013832e+03
UnixTimestamp    1.397163e+09
dtype: float64
Store                 321.912126
DayOfWeek               1.997523
Sales                3849.586467
Customers             464.327282
Open                    0.375558
Promo                   0.485724
StateHoliday            0.284062
SchoolHoliday           0.383100
DayOfMonth              8.785539
Month                   3.326897
Year                    0.777177
UnixTimestamp    23707091.402800
dtype: float64



In [ ]:

	Store	StoreType	Assortment	CompetitionDistance	CompetitionOpenSinceMonth	CompetitionOpenSinceYear	Promo2	Promo2SinceWeek	Promo2SinceYear	PromoInterval
0	1	c	a	1270	9	2008	0	NaN	NaN	NaN
1	2	a	a	570	11	2007	1	13	2010	Jan,Apr,Jul,Oct
2	3	a	a	14130	12	2006	1	14	2011	Jan,Apr,Jul,Oct
3	4	c	c	620	9	2009	0	NaN	NaN	NaN
4	5	a	a	29910	4	2015	0	NaN	NaN	NaN

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	SchoolHoliday
0	1	5	2015-07-31	5263	555	1	1	1
1	2	5	2015-07-31	6064	625	1	1	1
2	3	5	2015-07-31	8314	821	1	1	1
3	4	5	2015-07-31	13995	1498	1	1	1
4	5	5	2015-07-31	4822	559	1	1	1