In [23]:
%matplotlib inline
import pandas as pd
import numpy as np
import time
from math import sqrt
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt
In [45]:
# Seed passed as `random_state` to both the train/test split and the random
# forest below, so repeated runs produce the same split and the same model.
RANDOM_STATE = 6578439
def root_mean_square_percentage(labels, predictions):
    """Root mean square percentage error (RMSPE), as defined by the competition.

    Pairs whose label is exactly 0 are excluded, since the relative error
    ``(label - prediction) / label`` is undefined for them.

    Args:
        labels: sequence of true values (list, numpy array, pandas Series...).
        predictions: sequence of predicted values, positionally aligned with
            ``labels`` and of the same length.

    Returns:
        float: ``sqrt(mean(((label - prediction) / label) ** 2))`` computed
        over the pairs with a non-zero label.

    Raises:
        ValueError: if the two sequences differ in length, or if no pair with
            a non-zero label remains to score.
    """
    if len(labels) != len(predictions):
        raise ValueError("Labels and predictions must be of same length")

    # np.asarray drops any pandas index, so pairing is purely positional.
    labels = np.asarray(labels, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    # Filter pairs where label == 0
    nonzero = labels != 0
    if not nonzero.any():
        raise ValueError("At least one non-zero label is required")

    relative_errors = (labels[nonzero] - predictions[nonzero]) / labels[nonzero]
    return float(np.sqrt(np.mean(np.square(relative_errors))))
if __name__ == '__main__':
    # Load the two CSV inputs of the notebook. The notebook imports pandas as
    # `pd`, so that alias is used here (the bare name `pandas` is never bound).
    print("Loading annotated dataset...")
    annotated_df = pd.read_csv(
        'data/train.csv',
        dtype={
            'StateHoliday': object,  # kept as text; label-encoded later
            'Sales': float,
            'Customers': float,
        },
        parse_dates=['Date']
    )

    print("Loading stores dataset...")
    stores_df = pd.read_csv(
        'data/store.csv',
        # NOTE(review): these keys name per-day sales columns (Date, Sales,
        # Customers, ...) rather than store attributes -- presumably copied
        # from the train.csv schema. Verify against the header of store.csv.
        dtype={
            "Store": int,
            "DayOfWeek": int,  # {1, 2, 3, 4, 5, 6, 7}
            "Date": object,  # e.g. "2015-07-31"
            "Sales": int,
            "Customers": int,
            "Open": bool,
            "Promo": bool,
            "StateHoliday": str,  # {'0', 'a', 'b', 'c'}
            "SchoolHoliday": bool,
        }
    )
    print("Done loading datasets!")
In [46]:
# Preview the first rows of the frame read from data/store.csv.
stores_df.head()
Out[46]:
In [49]:
# Combine the annotated sales rows with the stores table. No `on=`/`how=` is
# given, so pandas performs an inner join on the columns both frames share.
# The notebook imports pandas as `pd`, hence `pd.merge`.
full_df = pd.merge(annotated_df, stores_df)
In [52]:
# Preview the first rows of the frame read from data/train.csv.
annotated_df.head()
Out[52]:
In [51]:
# Preview the first rows of the merged (annotated + stores) frame.
full_df.head()
Out[51]:
In [55]:
print("Preparing annotated dataset for sklearn usage...")
# StateHoliday was loaded as text; replace each distinct value by an integer
# code so the column can be fed to the estimator.
annotated_df['StateHoliday'] = LabelEncoder().fit_transform(annotated_df['StateHoliday'])

print("Enriching annotated dataset with extra features...")
# Calendar features come from the vectorised `.dt` accessor rather than a
# per-row Python lambda.
sale_dates = annotated_df['Date']
annotated_df['DayOfMonth'] = sale_dates.dt.day
annotated_df['Month'] = sale_dates.dt.month
annotated_df['Year'] = sale_dates.dt.year
# Seconds since 1970-01-01 as a float. Computed directly from the timestamps
# so the value does not depend on the local timezone / DST rules of the
# machine running the notebook (which time.mktime would bake in).
annotated_df['UnixTimestamp'] = (sale_dates - pd.Timestamp('1970-01-01')).dt.total_seconds()
# The raw datetime column cannot be consumed by the estimator; rebind to a new
# frame without it instead of mutating in place.
annotated_df = annotated_df.drop('Date', axis=1)
In [56]:
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of this notebook; newer scikit-learn releases expose it from
# sklearn.model_selection instead -- confirm against the installed version.
print("Splitting train and test sets...")
train_df, test_df = train_test_split(
    annotated_df,
    test_size=0.10,
    random_state=RANDOM_STATE
)

# Every column except the target is used as a feature. Computed once so that
# fitting, importance reporting and prediction all share the same ordering.
# NOTE(review): this includes 'Customers' -- presumably unknown at prediction
# time, which would inflate the score below; confirm before trusting it.
feature_columns = train_df.columns.difference(['Sales'])

print("Training random forest...")
random_forest = RandomForestRegressor(
    n_jobs=-1,  # Auto selects number of cores
    random_state=RANDOM_STATE,
    max_features="log2",
    n_estimators=10,
).fit(
    X=train_df[feature_columns],
    y=train_df['Sales'],
)

print("Feature importances:")
# sorted() accepts any iterable, so this works whether zip() yields a list or
# an iterator; most important feature first.
pairs = sorted(
    zip(feature_columns, random_forest.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for column, importance in pairs:
    print("  %s %s" % (column, importance))

print("Testing random forest...")
predictions = random_forest.predict(
    X=test_df[feature_columns],
)
print("Root mean square percentage:")
print("  %s" % root_mean_square_percentage(test_df['Sales'], predictions))
In [5]:
Out[5]:
In [ ]:
# Wrap the model's predictions in a single-column frame for plotting and
# summary statistics. The notebook imports pandas as `pd`, hence `pd.DataFrame`.
predictions_df = pd.DataFrame({'Sales': predictions})
In [ ]:
# Histogram of the predicted sales, leaving out predictions equal to 0.
mask = predictions_df['Sales'] != 0
predictions_df.loc[mask, 'Sales'].hist(bins=100)
# Summary statistics over every prediction (zeros included).
print(predictions_df.median())
print(predictions_df.mean())
print(predictions_df.std())
In [43]:
# Histogram of the training-set sales, leaving out rows whose Sales is 0.
mask = train_df['Sales'] != 0
train_df.loc[mask, 'Sales'].hist(bins=100)
# Per-column summary statistics of the whole training frame (all rows).
print(train_df.median())
print(train_df.mean())
print(train_df.std())
In [ ]: