In [1]:
from jupyterthemes import get_themes
from jupyterthemes.stylefx import set_nb_theme
themes = get_themes()
set_nb_theme(themes[3])
Out[1]:
In [2]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'
import os
import time
import numba
import numpy as np
import pandas as pd
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,numba
In addition to the data provided by the competition, we will be using external datasets put together by participants in the Kaggle competition. We can download all of them here. We should then untar them into the directory that data_dir points to.
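For example, assuming everything was downloaded as a single archive named rossmann.tgz (the archive name here is just an assumption), it could be extracted with Python's tarfile module:
import tarfile

# extract the downloaded archive into the 'rossmann' directory that
# data_dir points to below (the archive name is hypothetical)
with tarfile.open('rossmann.tgz') as tar:
    tar.extractall('rossmann')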
In [3]:
data_dir = 'rossmann'
print('available files: ', os.listdir(data_dir))
file_names = ['train', 'store', 'store_states', 'state_names',
'googletrend', 'weather', 'test']
path_names = {file_name: os.path.join(data_dir, file_name + '.csv')
for file_name in file_names}
df_train = pd.read_csv(path_names['train'], low_memory=False)
df_test = pd.read_csv(path_names['test'], low_memory=False)
print('training data dimension: ', df_train.shape)
print('testing data dimension: ', df_test.shape)
df_train.head()
Out[3]:
We convert the state holidays to booleans, to make them more convenient for modeling.
In [4]:
df_train['StateHoliday'] = df_train['StateHoliday'] != '0'
df_test['StateHoliday'] = df_test['StateHoliday'] != '0'
For the weather and state names data, we perform a join on a state name field and create a single dataframe.
In [5]:
df_weather = pd.read_csv(path_names['weather'])
print('weather data dimension: ', df_weather.shape)
df_weather.head()
Out[5]:
In [6]:
df_state_names = pd.read_csv(path_names['state_names'])
print('state names data dimension: ', df_state_names.shape)
df_state_names.head()
Out[6]:
In [7]:
df_weather = df_weather.rename(columns={'file': 'StateName'})
df_weather = df_weather.merge(df_state_names, on="StateName", how='left')
df_weather.head()
Out[7]:
For the Google trend data, we extract the state and date information from the raw dataset and replace all instances of the state name 'NI' with 'HB,NI' to match the usage in the rest of the data.
In [8]:
df_googletrend = pd.read_csv(path_names['googletrend'])
print('google trend data dimension: ', df_googletrend.shape)
df_googletrend.head()
Out[8]:
In [9]:
df_googletrend['Date'] = df_googletrend['week'].str.split(' - ', expand=True)[0]
df_googletrend['State'] = df_googletrend['file'].str.split('_', expand=True)[2]
df_googletrend.loc[df_googletrend['State'] == 'NI', 'State'] = 'HB,NI'
df_googletrend.head()
Out[9]:
The following code chunk extracts particular date fields from a complete datetime for the purpose of constructing categoricals.
We should always consider this feature extraction step when working with date-time data. Without expanding our date-time into these additional fields, we can't capture any trend/cyclical behavior as a function of time at any of these granularities. We'll apply this to every table that has a date field.
In [10]:
DEFAULT_DT_ATTRIBUTES = [
'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
'Is_month_end', 'Is_month_start', 'Is_quarter_end',
'Is_quarter_start', 'Is_year_end', 'Is_year_start'
]
def add_datepart(df, colname, drop_original_col=False,
dt_attributes=DEFAULT_DT_ATTRIBUTES,
add_elapse_col=True):
"""
Extract various datetime components out of a date column. This modifies
the dataframe inplace.
References
----------
- https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components
"""
df[colname] = pd.to_datetime(df[colname], infer_datetime_format=True)
if dt_attributes:
for attr in dt_attributes:
df[attr] = getattr(df[colname].dt, attr.lower())
# representing the number of seconds elapsed from 1970-01-01 00:00:00
# https://stackoverflow.com/questions/15203623/convert-pandas-datetimeindex-to-unix-time
if add_elapse_col:
df['Elapsed'] = df[colname].astype(np.int64) // 10 ** 9
if drop_original_col:
df = df.drop(colname, axis=1)
return df
In [11]:
df_weather.head()
Out[11]:
In [12]:
df_weather = add_datepart(
df_weather, 'Date',
dt_attributes=None, add_elapse_col=False)
df_googletrend = add_datepart(
df_googletrend, 'Date', drop_original_col=True,
dt_attributes=['Year', 'Week'], add_elapse_col=False)
df_train = add_datepart(df_train, 'Date')
df_test = add_datepart(df_test, 'Date')
print('training data dimension: ', df_train.shape)
df_train.head()
Out[12]:
The Google trends data has a special category for the whole of Germany - we'll pull that out so we can use it explicitly.
In [13]:
df_trend_de = df_googletrend.loc[df_googletrend['file'] == 'Rossmann_DE',
['Year', 'Week', 'trend']]
df_trend_de.head()
Out[13]:
Now we can outer join all of our data into a single dataframe. Recall that in outer joins, every time a value in the joining field on the left table does not have a corresponding value on the right table, the corresponding row in the new table has Null values for all right table fields. One way to check that all records are consistent and complete is to check for Null values post-join, as we do here.
Aside: Why not just do an inner join? If we are assuming that all records are complete and match on the field we desire, an inner join will do the same thing as an outer join. However, in the event we are not sure, an outer join followed by a null-check will catch it. (Comparing the number of rows before and after an inner join is an equivalent approach.)
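As a minimal sketch of that equivalent check (not used in the rest of this notebook), using the train and store tables that are merged below:
# sketch of the inner-join sanity check: if no rows are dropped,
# every record in df_train found a matching store record
n_before = len(df_train)
n_after = len(df_train.merge(df_store, on='Store', how='inner'))
assert n_before == n_after, 'some rows in train have no matching store record'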
During the merging process, we'll print out the first few rows of the dataframe and the column names so we can keep track of how the dataframe evolves as we join with each new data source.
In [14]:
df_store = pd.read_csv(path_names['store'])
print('store data dimension: ', df_store.shape)
df_store.head()
Out[14]:
In [15]:
df_store_states = pd.read_csv(path_names['store_states'])
print('store states data dimension: ', df_store_states.shape)
df_store_states.head()
Out[15]:
In [16]:
df_store = df_store.merge(df_store_states, on='Store', how='left')
print('null count: ', len(df_store[df_store['State'].isnull()]))
df_store.head()
Out[16]:
In [17]:
df_joined_train = df_train.merge(df_store, on='Store', how='left')
df_joined_test = df_test.merge(df_store, on='Store', how='left')
null_count_train = len(df_joined_train[df_joined_train['StoreType'].isnull()])
null_count_test = len(df_joined_test[df_joined_test['StoreType'].isnull()])
print('null count: ', null_count_train, null_count_test)
print('dimension: ', df_joined_train.shape)
df_joined_train.head()
Out[17]:
In [18]:
df_joined_train.columns
Out[18]:
In [19]:
df_joined_train = df_joined_train.merge(df_weather, on=['State', 'Date'], how='left')
df_joined_test = df_joined_test.merge(df_weather, on=['State', 'Date'], how='left')
null_count_train = len(df_joined_train[df_joined_train['Mean_TemperatureC'].isnull()])
null_count_test = len(df_joined_test[df_joined_test['Mean_TemperatureC'].isnull()])
print('null count: ', null_count_train, null_count_test)
print('dimension: ', df_joined_train.shape)
df_joined_train.head()
Out[19]:
In [20]:
df_joined_train.columns
Out[20]:
In [21]:
df_joined_train = df_joined_train.merge(df_googletrend,
on=['State', 'Year', 'Week'],
how='left')
df_joined_test = df_joined_test.merge(df_googletrend,
on=['State', 'Year', 'Week'],
how='left')
null_count_train = len(df_joined_train[df_joined_train['trend'].isnull()])
null_count_test = len(df_joined_test[df_joined_test['trend'].isnull()])
print('null count: ', null_count_train, null_count_test)
print('dimension: ', df_joined_train.shape)
df_joined_train.head()
Out[21]:
In [22]:
df_joined_train.columns
Out[22]:
In [23]:
df_joined_train = df_joined_train.merge(df_trend_de,
on=['Year', 'Week'],
suffixes=('', '_DE'),
how='left')
df_joined_test = df_joined_test.merge(df_trend_de,
on=['Year', 'Week'],
suffixes=('', '_DE'),
how='left')
null_count_train = len(df_joined_train[df_joined_train['trend_DE'].isnull()])
null_count_test = len(df_joined_test[df_joined_test['trend_DE'].isnull()])
print('null count: ', null_count_train, null_count_test)
print('dimension: ', df_joined_train.shape)
df_joined_train.head()
Out[23]:
In [24]:
df_joined_train.columns
Out[24]:
After merging all the various data sources to create our master dataframe, we'll perform some additional feature engineering steps, including filling in missing values for the competition/promotion start date fields and deriving how long the competition and the Promo2 promotion have been running.
In [25]:
for df in (df_joined_train, df_joined_test):
df['CompetitionOpenSinceYear'] = (df['CompetitionOpenSinceYear']
.fillna(1900)
.astype(np.int32))
df['CompetitionOpenSinceMonth'] = (df['CompetitionOpenSinceMonth']
.fillna(1)
.astype(np.int32))
df['Promo2SinceYear'] = df['Promo2SinceYear'].fillna(1900).astype(np.int32)
df['Promo2SinceWeek'] = df['Promo2SinceWeek'].fillna(1).astype(np.int32)
In [26]:
for df in (df_joined_train, df_joined_test):
df['CompetitionOpenSince'] = pd.to_datetime(dict(
year=df['CompetitionOpenSinceYear'],
month=df['CompetitionOpenSinceMonth'],
day=15
))
df['CompetitionDaysOpen'] = df['Date'].subtract(df['CompetitionOpenSince']).dt.days
For the CompetitionMonthsOpen field, we cap the value at 2 years (24 months) to limit the number of unique categories.
In [27]:
for df in (df_joined_train, df_joined_test):
df['CompetitionMonthsOpen'] = df['CompetitionDaysOpen'] // 30
df.loc[df['CompetitionMonthsOpen'] > 24, 'CompetitionMonthsOpen'] = 24
df.loc[df['CompetitionMonthsOpen'] < -24, 'CompetitionMonthsOpen'] = -24
df_joined_train['CompetitionMonthsOpen'].unique()
Out[27]:
We repeat the same process for Promo2.
In [28]:
from isoweek import Week
for df in (df_joined_train, df_joined_test):
df['Promo2Since'] = pd.to_datetime(df.apply(lambda x: Week(
x.Promo2SinceYear, x.Promo2SinceWeek).monday(), axis=1))
df['Promo2Days'] = df['Date'].subtract(df['Promo2Since']).dt.days
In [29]:
for df in (df_joined_train, df_joined_test):
df['Promo2Weeks'] = df['Promo2Days'] // 7
df.loc[df['Promo2Weeks'] < 0, 'Promo2Weeks'] = 0
df.loc[df['Promo2Weeks'] > 25, 'Promo2Weeks'] = 25
df_joined_train['Promo2Weeks'].unique()
Out[29]:
In [30]:
df_joined_train.columns
Out[30]:
It is common when working with time series data to extract features that capture relationships across rows instead of between columns, e.g. time since the last event, time until the next event.
Here, we would like to compute features such as days since the last promotion and days until the next promotion. The same process can be repeated for state/school holidays.
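As a toy illustration of the "days since last event" idea (not part of the actual pipeline, which uses the numba-accelerated function below), a pure pandas version for a single store might look like:
# toy example: days since the last promotion for a single store
toy = pd.DataFrame({
    'Date': pd.date_range('2015-01-01', periods=6, freq='D'),
    'Promo': [1, 0, 0, 1, 0, 0]
})
# keep the date only on promo days, then forward fill to get the last promo date
last_promo_date = toy['Date'].where(toy['Promo'] == 1).ffill()
toy['DaysSincePromo'] = (toy['Date'] - last_promo_date).dt.days
toy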
In [31]:
columns = ['Date', 'Store', 'Promo', 'StateHoliday', 'SchoolHoliday']
df = df_joined_train[columns].append(df_joined_test[columns])
df['DateUnixSeconds'] = df['Date'].astype(np.int64) // 10 ** 9
df.head()
Out[31]:
In [32]:
@numba.njit
def compute_duration(store_arr, date_unix_seconds_arr, field_arr):
"""
For each store, track the days since/before the occurrence of a field.
The store and date are assumed to be already sorted.
Parameters
----------
store_arr : 1d ndarray[int]
date_unix_seconds_arr : 1d ndarray[int]
The date should be represented in unix timestamp (seconds).
field_arr : 1d ndarray[bool]/ndarray[int]
The field that we're interested in. If int, it should take value
of 1/0 indicating whether the field/event occurred or not.
Returns
-------
result : list[int]
Days since/before the occurrence of a field.
"""
result = []
last_store = 0
zipped = zip(store_arr, date_unix_seconds_arr, field_arr)
for store, date_unix_seconds, field in zipped:
if store != last_store:
last_store = store
last_date = date_unix_seconds
if field:
last_date = date_unix_seconds
diff_day = (date_unix_seconds - last_date) // 86400
result.append(diff_day)
return result
In [33]:
df = df.sort_values(['Store', 'Date'])
start = time.time()
for col in ('SchoolHoliday', 'StateHoliday', 'Promo'):
result = compute_duration(df['Store'].values,
df['DateUnixSeconds'].values,
df[col].values)
df['After' + col] = result
end = time.time()
print('elapsed: ', end - start)
df.head(10)
Out[33]:
If we look at the values in the AfterStateHoliday column, we can see that the StateHoliday column in the first row is True, so the corresponding AfterStateHoliday is 0, indicating that day is itself a state holiday. After encountering a state holiday, the AfterStateHoliday column starts incrementing until it sees the next StateHoliday, which resets the counter.
Note that for Promo, the column starts out with a 0, yet AfterPromo starts accumulating until it sees the next Promo. Here, we're not exactly sure when the last promo before 2013-01-01 was, since we don't have the data for it. Nonetheless, we'll still start incrementing the counter. Another approach is to fill these leading values with 0.
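A minimal sketch of that alternative (not applied here) would be to zero out the "After" counter for rows that come before a store's first observed event, since we don't actually know when the previous event happened:
# sketch only, assuming df is still sorted by Store and Date ascending
def zero_before_first_event(df, col):
    # cumulative count of events within each store; 0 means no event seen yet
    events_seen = df[col].astype(int).groupby(df['Store']).cumsum()
    df.loc[events_seen == 0, 'After' + col] = 0
    return df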
In [34]:
df = df.sort_values(['Store', 'Date'], ascending=[True, False])
start = time.time()
for col in ('SchoolHoliday', 'StateHoliday', 'Promo'):
result = compute_duration(df['Store'].values,
df['DateUnixSeconds'].values,
df[col].values)
df['Before' + col] = result
end = time.time()
print('elapsed: ', end - start)
df.head(10)
Out[34]:
After creating these new features, we join them back to the original dataframe.
In [35]:
df = df.drop(['Promo', 'StateHoliday', 'SchoolHoliday', 'DateUnixSeconds'], axis=1)
df_joined_train = df_joined_train.merge(df, on=['Date', 'Store'], how='inner')
df_joined_test = df_joined_test.merge(df, on=['Date', 'Store'], how='inner')
print('dimension: ', df_joined_train.shape)
df_joined_train.head()
Out[35]:
In [36]:
df_joined_train.columns
Out[36]:
We save the cleaned data so we won't have to repeat this data preparation step.
In [37]:
output_dir = 'cleaned_data'
if not os.path.isdir(output_dir):
os.makedirs(output_dir, exist_ok=True)
engine = 'pyarrow'
output_path_train = os.path.join(output_dir, 'train_clean.parquet')
output_path_test = os.path.join(output_dir, 'test_clean.parquet')
df_joined_train.to_parquet(output_path_train, engine=engine)
df_joined_test.to_parquet(output_path_test, engine=engine)
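Later on, the cleaned data can be loaded back with pd.read_parquet, e.g.:
# reading the cleaned data back (same paths and engine as above)
df_joined_train = pd.read_parquet(output_path_train, engine=engine)
df_joined_test = pd.read_parquet(output_path_test, engine=engine)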