In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as w
import rossmann_helper_functions as rosshellfun #:)
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)
w.filterwarnings("ignore", category=DeprecationWarning)
w.filterwarnings("ignore", category=FutureWarning)
train.csv
- 3.5 years of sales data
- number of customers
- promotion activities
- store opening hours
- state / school holiday details

store.csv
- store type code
- assortment type code
- long-term promotions
- nearest competition
- locality unknown

test.csv
- same fields as train.csv
- no sales or customer data
In [2]:
types = {
'CompetitionOpenSinceYear': np.dtype(int),
'CompetitionOpenSinceMonth': np.dtype(int),
'StateHoliday': np.dtype(str),
'Promo2SinceWeek': np.dtype(int),
'SchoolHoliday': np.dtype(int),
'PromoInterval': np.dtype(str)
}
train = pd.read_csv("../train.csv", parse_dates=[2], dtype=types)
test = pd.read_csv("../test.csv", parse_dates=[3], dtype=types)
store = pd.read_csv("../store.csv")
train['Day'] = train.Date.dt.day
train['Month'] = train.Date.dt.month
train['Year'] = train.Date.dt.year
train.sort_values(['Store','Year','Month','Day'], inplace = True)
In [3]:
train.head(5)
Out[3]:
In [4]:
test.head(5)
Out[4]:
In [5]:
fig, (axis1) = plt.subplots(1, 1, figsize=(15,4))
train_plot = train[train["Open"] == 1]
sns.boxplot(x='DayOfWeek', y='Sales', data=train_plot.sort_values('DayOfWeek'), palette="husl", ax=axis1)
plt.show()
Out[5]:
In [6]:
fig, (axis1) = plt.subplots(1, 1, figsize=(15,4))
train_plot = train[train["Open"] == 1]
sns.boxplot(x='DayOfWeek',y='Sales', data=train_plot.sort('DayOfWeek'), palette="husl", ax=axis1)
plt.ylim(ymax=12500, ymin=3000)
Out[6]:
In [7]:
fig, (axis1) = plt.subplots(1, 1, figsize=(15,7))
sns.boxplot(x='Day', y='Sales', data=train_plot.sort_values('Day'), palette="husl", ax=axis1)
Out[7]:
In [8]:
for fest in ['a', 'b', 'c']:
    print('Max. consecutive day(s) for holiday type "%s" is %i'
          % (fest, rosshellfun.longest_streak(train['StateHoliday'].values, fest)))
In [9]:
print('Max. consecutive day(s) where a store was marked as closed is %i!'
% rosshellfun.longest_streak(train['Open'].values, 0))
print('Max. consecutive day(s) where a store was marked with zero sales is %i!'
% rosshellfun.longest_streak(train['Sales'].values, 0))
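The helper module rossmann_helper_functions is not reproduced in this notebook. A minimal sketch of what longest_streak(values, target) could look like, assuming it returns the length of the longest run of consecutive entries equal to target (the actual helper may differ):

def longest_streak_sketch(values, target):
    # length of the longest run of consecutive entries equal to `target`
    best = current = 0
    for v in values:
        current = current + 1 if v == target else 0
        best = max(best, current)
    return best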
In [10]:
plt.plot(train.loc[train['Store'] == 708, 'Sales'], 'b.')
plt.ylim(200, 20000)
Out[10]:
In [11]:
rosshellfun.addTimeToClose(train)
rosshellfun.addTimeToClose(test)
plt.xlabel('Days after/before longer closed period')
Out[11]:
In [12]:
print("There are %i missing Open/Close values in the TRAINING set."
% sum(np.isnan(train.loc[:,'Open'])))
print("There are %i missing Open/Close values in the TEST set."
% sum(np.isnan(test.loc[:,'Open'])))
train = train[train["Open"] != 0]
train = train[train["Sales"] > 0]
test.fillna(1, inplace=True)
In [13]:
train = pd.merge(train, store, on='Store')
test = pd.merge(test, store, on='Store')
In [14]:
# Load external data
ext_curr = rosshellfun.ext_load_currency_data(
"../ExternalData/ext-currfx-eur-usd.csv") # EurUsdRate
ext_search = rosshellfun.ext_load_search_data(
"../ExternalData/ext-rossmann-google-search-intensities-2013-2015.csv") # SearchRossmanGermany, SearchRossmanWorld
ext_weather = rosshellfun.ext_load_weather_data(
"../ExternalData/ext-weather-events-germany.csv") # WeatherBad
# Join the existing data sets with the external data
train2 = train \
.merge(ext_weather, on="Date") \
.merge(ext_search, on="Date") \
.merge(ext_curr, on="Date")
test2 = test \
.merge(ext_weather, on="Date") \
.merge(ext_search, on="Date") \
.merge(ext_curr, on="Date")
print("Shape of vectors before/after joined external data:\nTrain",
train.shape, "=", train2.shape, "\nTest", test.shape, "=", test2.shape)
train = train2
test = test2
In [15]:
train = rosshellfun.build_features(train)
test = rosshellfun.build_features(test)
In [16]:
train.head(5)
Out[16]:
In [17]:
test.head(5)
Out[17]:
In [18]:
table = pd.pivot_table(train.loc[:,['Date','Sales','Store']],index = ['Date'], columns = ['Store'], values = ['Sales'] )
store_type_labels = rosshellfun.cluster_all_stores(table, cut_thresh = 0.05, method = 'ward')
# Ward's method merges clusters so as to minimize within-cluster variance, rather than using raw pairwise distance
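cluster_all_stores lives in the helper module and is not shown here. The sketch below illustrates the general idea using scipy's hierarchical clustering; the function name, the fillna step and the threshold handling are illustrative assumptions, not the exact helper:

from scipy.cluster.hierarchy import linkage, fcluster

def cluster_stores_sketch(sales_pivot, cut_thresh=0.05, method='ward'):
    # rows = stores, columns = dates; Ward linkage groups stores with similar sales profiles
    X = sales_pivot.T.fillna(0).values
    Z = linkage(X, method=method)
    # cut the dendrogram at the given threshold to obtain one cluster label per store
    return fcluster(Z, t=cut_thresh, criterion='distance')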
In [19]:
# add store_type_labels to training and test
train = pd.merge(train, pd.DataFrame([store_type_labels,table.columns.get_level_values(1)],
columns = table.columns.get_level_values(1),
index = ['StoreCluster','Store'] ).T, on='Store')
test = pd.merge(test, pd.DataFrame([store_type_labels,table.columns.get_level_values(1)],
columns = table.columns.get_level_values(1),
index = ['StoreCluster','Store'] ).T, on='Store')
In [20]:
train.loc[:,['Store','StoreCluster']].drop_duplicates().sort_values('Store').head(10)
Out[20]:
In [21]:
test.loc[:,['Store','StoreCluster']].drop_duplicates().sort_values('Store').head(10)
Out[21]:
In [22]:
# create_dummy_columns() one-hot encodes each listed column; the dummy columns are named <value>_<column name>
train = rosshellfun.create_dummy_columns(train, ['StoreCluster','DayOfWeek','DaysLeft','DaysAfter'])
test = rosshellfun.create_dummy_columns(test, ['StoreCluster','DayOfWeek','DaysLeft','DaysAfter'])
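create_dummy_columns is also part of the helper module; a minimal pandas equivalent of the behaviour described in the comment above (illustrative, the actual helper may differ in details):

def create_dummy_columns_sketch(df, columns):
    # one-hot encode each listed column; dummy columns are named <value>_<column name>
    for col in columns:
        dummies = pd.get_dummies(df[col].astype(str))
        dummies.columns = ['%s_%s' % (val, col) for val in dummies.columns]
        df = pd.concat([df, dummies], axis=1)
    return df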
In [23]:
train.head(5)
Out[23]:
In [24]:
# the test set is not sorted by Id at this point, so restore that order
test = test.sort_values('Id')
test.head(5)
Out[24]:
In [25]:
# set up the training features
# we have to remove the original categorical and string columns,
# AND the Customers column, which is not available in the test set
# (Gert also predicted Customers and used that prediction to further improve his Sales predictions)
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = train.select_dtypes(include=numerics).columns.values.tolist()
features.remove('Customers')
features.remove('Store') # categorical but is numeric
features.remove('Sales') # Of course let's remove the outcome
print(features)
In [26]:
from sklearn.model_selection import train_test_split
# create test set
X = train.loc[:,features]
y = train.loc[:,'Sales']
train_base_x, train_validation_x, train_base_y, train_validation_y = train_test_split( X, y, test_size = 0.2 )
# Note that Gert did not choose the hold-out set randomly;
# he used consecutive days instead, which better mimics the future test period (see the sketch below)
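A time-based hold-out in the spirit of Gert's approach can be built by splitting on Date instead of sampling rows at random. A sketch, assuming the Date column is still present in train at this point; the 42-day cut-off is an illustrative choice, not the one used in the competition:

cutoff = train['Date'].max() - pd.Timedelta(days=42)   # hold out roughly the last six weeks
time_mask = train['Date'] <= cutoff
train_base_x, train_base_y = X[time_mask], y[time_mask]
train_validation_x, train_validation_y = X[~time_mask], y[~time_mask]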
Gradient boosting is currently a go-to method for a wide range of machine learning problems.
In practice, a gradient boosted model is a large ensemble of decision trees, each new tree fitted to the residual errors of the trees before it, and it offers strong regularization options.
The practical difference between scikit-learn's GBM and XGBoost is small; as far as we can see, XGBoost mainly gives you a few more options for logistic-regression-type problems (e.g. max delta step).
The objective function is the sum of a loss term and a regularization term ($\Omega$):
$$Objective(\Theta) = Loss(\Theta) + \Omega(\Theta)$$
where $\Theta$ is the set of parameters to be learned. The loss function we used was the Huber loss:
$$\begin{align} Loss(\Theta) = \left\{ \begin{array}{cl} \frac{1}{2} \left[y-\hat{y}\right]^2 & \text{for } |y-\hat{y}| \le \alpha, \\ \alpha \left(|y-\hat{y}|-\alpha/2\right) & \text{for } |y-\hat{y}| > \alpha \end{array}\right. \end{align}$$
where $\alpha$ was set to $Q(.56)$, the 0.56 quantile of the residuals (roughly the median). This loss applies squared loss to small residuals and absolute loss to larger ones, which can work better than plain squared loss when outliers would otherwise pull the fit away from the optimum.
Huber, P. J. (1964). Robust estimation of a location parameter. The Annals of Mathematical Statistics, 35(1), 73-101.
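For illustration, the loss above can be written directly with numpy (a sketch; alpha is the transition point between the quadratic and linear regimes):

def huber_loss(y, y_hat, alpha):
    # quadratic for small residuals, linear beyond alpha (see the formula above)
    resid = np.abs(y - y_hat)
    return np.where(resid <= alpha,
                    0.5 * resid ** 2,
                    alpha * (resid - 0.5 * alpha))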
There are several parameters to set (both for the loss and for the regularization); rather than an exhaustive grid search, we tuned them with randomized search.
Bergstra, J., & Bengio, Y. (2012). Random search for hyper-parameter optimization. The Journal of Machine Learning Research, 13(1), 281-305.
In [27]:
import random
from sklearn.model_selection import RandomizedSearchCV
from sklearn import ensemble
from scipy.stats import uniform, randint
# it is not necessary to search the parameters using the complete training set
training_subset = random.sample(range(len(train_base_x)), 800) # intentionally kept this low here; it was 80000 in the competition
train_sample_x = train_base_x.iloc[training_subset]
train_sample_y = train_base_y.iloc[training_subset]
# note, this is not the whole parameter set of scikit (neither of XGBoost)
parameters = { 'learning_rate': uniform(), #0.426348401220647
'max_depth' : [4, None], # None
'min_samples_leaf' : randint(10, 100), #45
'loss' : ['ls', 'lad', 'huber', 'quantile'], #'huber'
'subsample' : uniform(), #0.7675104474139473
'alpha': uniform()} #0.560116434370429
model = ensemble.GradientBoostingRegressor(n_estimators = 100, verbose = 0)
params_found = RandomizedSearchCV(model, param_distributions = parameters,
n_iter = 100, n_jobs = -1).fit(train_sample_x,train_sample_y)
params_found.best_params_
Out[27]:
In [28]:
model = ensemble.GradientBoostingRegressor(n_estimators = 100, verbose = 100, # in the challenge we used 3000 estimators
learning_rate = params_found.best_params_['learning_rate'],
loss = params_found.best_params_['loss'],
max_depth = params_found.best_params_['max_depth'],
min_samples_leaf = params_found.best_params_['min_samples_leaf'],
subsample = params_found.best_params_['subsample'])
model = model.fit(train_base_x,train_base_y)
In [29]:
# Let's see how the model performed
rosshellfun.plot_results(model, train_base_x, train_validation_x,
train_base_y, train_validation_y, savemodel = True)
In [30]:
# let's see the relative importance of features
rosshellfun.plot_feature_importances(model, features)
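plot_feature_importances is another helper that is not shown; assuming it simply plots the fitted model's feature_importances_, the gist is roughly (illustrative sketch, not the exact helper):

def plot_feature_importances_sketch(model, feature_names, top_n=20):
    # sort features by importance and show the strongest ones
    imp = pd.Series(model.feature_importances_, index=feature_names).sort_values()
    imp.tail(top_n).plot(kind='barh', figsize=(8, 6))
    plt.xlabel('Relative importance')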
In [31]:
test['Sales'] = model.predict(test.loc[:,features])
test.loc[test['Open'] == 0,'Sales'] = 0
test.loc[:,['Id','Sales']].set_index('Id',drop = True).to_csv('submission_gbrt.csv')
The main design choices we experimented with for the neural network:
- Weight initialization
- Activation functions
- Size of network
- Normalization
- Optimizers
- Batch size

We used Keras.
Weight initialization defines the probability distribution used to draw the initial random weights of each layer.
Available methods in Keras include uniform, lecun_uniform, normal, orthogonal, glorot_normal, glorot_uniform, he_normal and he_uniform.
Activations that are more complex than a simple Theano/TensorFlow function (e.g. learnable or configurable activations such as PReLU, LeakyReLU or ELU) are also available in Keras as advanced activation layers.
In [ ]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.optimizers import Adam
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
model = Sequential()
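# note: input_dim=1242 matches the length of the one-hot feature vector assembled by feature_list() further below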
model.add(Dense(50, input_dim=1242, init='he_normal'))
model.add(PReLU())
#model.add(BatchNormalization())
#model.add(Dropout(0.5))
model.add(Dense(50, init='he_normal'))
model.add(PReLU())
#model.add(BatchNormalization())
#model.add(Dropout(0.5))
model.add(Dense(1, init='he_normal'))
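# a sigmoid output stays in (0, 1), matching the Sales target, which is log1p-transformed and min-max scaled below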
model.add(Activation('sigmoid'))
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model.compile(loss='mse', optimizer=adam)
In [ ]:
def featureScaling(arr):
    min_arr = min(arr)
    max_arr = max(arr)
    new_arr = []
    for i in arr:
        new_arr.append((i - min_arr) / (max_arr - min_arr))
    return new_arr
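For example, featureScaling([2, 4, 6]) returns [0.0, 0.5, 1.0]: a simple min-max scaling into [0, 1].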
In [ ]:
class Vectorizer():
    def __init__(self):
        self.vectCache = dict()

    def vect(self, size, idx):
        """Retrieves a vector such as [0,0,0,...,idx=1,0,0,...,0] where the size of the vector is 'size'
        and all items are zero except the 'idx' numbered element, which is one.
        Note: 'idx' numbering starts from 1..size, not from zero, it is 1-indexed.
        """
        if size not in self.vectCache:
            print("Vectorizer: Generating vectors for size", size)
            arr = []
            for i in range(size):
                e = [0] * size
                e[i] = 1.0
                arr.append(e)
            self.vectCache[size] = arr
        return self.vectCache[size][idx-1]

v = Vectorizer()
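For example, v.vect(7, 3) returns [0, 0, 1.0, 0, 0, 0, 0]: a one-hot vector of length 7 with the third position set (note the 1-based idx).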
In [ ]:
df = pd.read_csv('train_extended.csv')
df.Sales = df.Sales.apply(np.log1p)
df.Customers = df.Customers.apply(np.log1p)
df['CompetitionDistanceLog'] = df.CompetitionDistance.apply(np.log1p)
df.CompetitionOpen = df.CompetitionOpen + 31
df['CompetitionOpenLog'] = df.CompetitionOpen.apply(np.log1p)
df.EurUsdRate = df.EurUsdRate - 0.7
df.CompetitionOpenNull = df.CompetitionOpenNull + 31
df['CompetitionOpenNullLog'] = df.CompetitionOpenNull.apply(np.log1p)
new = df
# min-max scale these columns with featureScaling
cols_to_scale = ['Sales', 'Customers', 'CompetitionDistance', 'CompetitionOpen', 'PromoOpen',
                 'CompetitionOpenNull', 'CompetitionDistanceLog', 'CompetitionOpenLog',
                 'CompetitionOpenNullLog']
for col in cols_to_scale:
    new[col] = featureScaling(new[col])
new.to_csv('scaled_train.csv', na_rep='0', header=True, index = False)
In [ ]:
df = pd.read_csv('test_extended.csv')
df['CompetitionDistanceLog'] = df.CompetitionDistance.apply(np.log1p)
df.CompetitionOpen = df.CompetitionOpen + 31
df['CompetitionOpenLog'] = df.CompetitionOpen.apply(np.log1p)
df.EurUsdRate = df.EurUsdRate - 0.7
df.DaysLeft = df.DaysLeft.astype(int)
df.DaysAfter = df.DaysAfter.astype(int)
df.CompetitionOpenNull = df.CompetitionOpenNull + 31
df['CompetitionOpenNullLog'] = df.CompetitionOpenNull.apply(np.log1p)
new = df
# min-max scale the same columns as in the training set (Sales and Customers are absent here)
cols_to_scale = ['CompetitionDistance', 'CompetitionOpen', 'PromoOpen', 'CompetitionOpenNull',
                 'CompetitionDistanceLog', 'CompetitionOpenLog', 'CompetitionOpenNullLog']
for col in cols_to_scale:
    new[col] = featureScaling(new[col])
new.to_csv('scaled_test.csv', na_rep='0', header=True, index = False)
In [ ]:
train = pd.read_csv('scaled_train.csv')
shape = train.shape
# shuffle the data set and split it into train and test sets
df = pd.DataFrame(train)
shuffled_train = df.reindex(np.random.permutation(df.index))
indice_5_percent = int((shape[0]/100.0)* 5)
indice_10_percent = int((shape[0]/100.0)* 10)
indice_15_percent = int((shape[0]/100.0)* 15)
#for experimenting
shuffled_train[:indice_5_percent].to_csv('test_5_split.csv', index=True, columns=train.columns)
shuffled_train[indice_5_percent:indice_10_percent].to_csv('train_5_split.csv', index=True, columns=train.columns)
#for full scale training
shuffled_train[:indice_15_percent].to_csv('test_15_split.csv', index=True, columns=train.columns)
shuffled_train[indice_15_percent:].to_csv('train_85_split.csv', index=True, columns=train.columns)
In [ ]:
train_file = 'train_5_split.csv'
test_file = 'test_5_split.csv'
num_epochs = 1
batch_size = 10
logging = True #sets whether to write log file and create submission files
def feature_list(row):
    # assemble one flat feature vector for a single record (one-hot blocks plus scaled numeric features)
    features = v.vect(31, int(row['Day'])) + \
        v.vect(1115, int(row['Store'])) + \
        v.vect(52, int(row['WeekOfYear'])) + \
        v.vect(7, int(row['DayOfWeek'])) + \
        [row['PromoOpen']] + \
        v.vect(12, int(row['Month'])) + \
        v.vect(3, int(row['Year']) - 2013) + \
        [row['Promo']] + \
        v.vect(4, int(row['StoreType'])) + \
        [row['CompetitionDistance']] + \
        [row['CompetitionOpenNull']] + \
        [row['SchoolHoliday']] + \
        v.vect(3, int(row['Assortment'])) + \
        [row['IsPromoMonth']] + \
        [row['Promo2']] + \
        v.vect(4, int(row['StateHoliday'])) + \
        [row['WeatherBad']] + \
        [row['CompetitionDistanceLog']] + \
        [row['CompetitionOpenLog']] + \
        [row['CompetitionOpen']]
    return features
In [ ]:
import csv
import time
training_subset_size = 10000
loss = 1
losslog = []
Time = (time.strftime("Day:%d Time:%H:%M"))
df = pd.read_csv(train_file, index_col=0)
print ("Assembling training set and training neural network...")
for epoch in range(1, num_epochs + 1):
    n = 1
    i = 1
    list_a = []
    list_b = []
    df = df.reindex(np.random.permutation(df.index))
    for index, row in df.iterrows():
        if i <= training_subset_size:
            if row['Open'] == 1:
                features = feature_list(row)
                list_a.append(features)
                list_b.append(row['Sales'])
                i = i + 1
        else:
            # a full chunk of rows has been collected: fit on it and start collecting a new one
            x = np.array(list_a, dtype=float)
            y = np.array(list_b, dtype=float)
            model.fit(x, y, batch_size=batch_size, nb_epoch=1, verbose=0)
            i = 1
            list_a = []
            list_b = []
        if (n % 10000 == 0):
            print(n)
        n = n + 1
    # Evaluate the model on the hold-out file
    n = 1
    list_a = []
    list_b = []
    with open(test_file) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            features = feature_list(row)
            list_a.append(features)
            list_b.append(row['Sales'])
            if (n % 10000 == 0):
                print(n)
            n = n + 1
    test_x = np.array(list_a, dtype=float)
    test_y = np.array(list_b, dtype=float)
    val_loss = model.evaluate(test_x, test_y, verbose=2)
    print("Performance of the model over validation data (loss score): "
          + str(val_loss) + " at epoch:" + str(epoch))
    if logging:
        losslog.append("Performance of the model over validation data (loss score): "
                       + str(val_loss) + " at epoch:" + str(epoch)
                       + "_" + time.strftime("Day:%d Time:%H:%M"))
        losslog_df = pd.DataFrame(losslog)
        losslog_df.to_csv('losslog_' + Time + '_train:_' + str(train_file) + '_test:_' + str(test_file)
                          + '_batch_' + str(batch_size) + '_epoch_' + str(num_epochs) + '.csv',
                          index_label='Id')
    if val_loss < loss and logging:
        # Create a submission file on scaled_test.csv whenever the validation loss improves
        print("Testing neural network on test file and writing results into submission file...")
        results = []
        with open('scaled_test.csv') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                list_a = []
                features = feature_list(row)
                list_a.append(features)
                x = np.array(list_a, dtype=float)
                if row['Open'] == '1.0':
                    result = model.predict(x, batch_size=1)
                else:
                    # closed stores are predicted as zero sales
                    result = [np.array([0])]
                results.extend(result)
        # Write the results into the submission csv
        results = pd.DataFrame(results)
        # undo the min-max scaling (10.6347... is assumed to be max(log1p(Sales)) on the training data)
        # and the log1p transform
        results = np.exp(results * 10.634700933902529) - 1
        results.columns = ['Sales']
        results.index += 1
        results.to_csv('submission_train_' + str(train_file) + '_test_' + str(test_file)
                       + '_batch_' + str(batch_size) + '_best_loss:_' + str(val_loss)
                       + '_epoch:_' + str(epoch) + "_" + time.strftime("Day:%d Time:%H:%M") + '.csv',
                       index_label='Id')
        print("Finished!!!")
    loss = val_loss
In [ ]:
# average 18 submission files; all but the last one are shrunk by a factor of 0.985 before averaging
files = ['11884', '12003', '12017', '12300', '12488', '12553', '12628', '12769', '12792',
         '13235', '13462', '13484', '13491', '13550', '13580', '13913', '13945', '13962']
dfs = [pd.read_csv('tobag/%s.csv' % f) for f in files]
results = (sum(df.Sales * 0.985 for df in dfs[:-1]) + dfs[-1].Sales) / 18.0
results = pd.DataFrame(results)
results.columns = ['Sales']
results.index += 1
results.to_csv('bagged.csv', index_label='Id')