In [1]:
import numpy as np
import pandas as pd
In [2]:
df = pd.read_csv('../data/processed/df.csv', encoding='iso-8859-1')
keys = pd.read_csv('../data/raw/key_1.csv.zip', encoding='iso-8859-1', compression='zip')
sample_submission = pd.read_csv('../data/raw/sample_submission_1.csv.zip', encoding='iso-8859-1', compression='zip')
In [3]:
df.head()
Out[3]:
In [4]:
keys.head()
Out[4]:
In [5]:
keys['Page'].apply(lambda x: x.split('_')[-1]).unique()
Out[5]:
In [6]:
sample_submission.head()
Out[6]:
Define the SMAPE evaluation metric. It handles NaN values in the y_true array, but it assumes there are no NaNs in the y_pred array.
In [7]:
def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    # when both true and predicted values are 0 the denominator is 0; define the error as 0 there
    diff[denominator == 0] = 0.0
    return np.nanmean(diff)
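As a quick sanity check (illustrative values only, not part of the original run): a NaN in y_true is dropped by np.nanmean, and a row where both true and predicted values are 0 contributes 0.
In [ ]:
y_true = np.array([10.0, np.nan, 0.0])
y_pred = np.array([12.0, 5.0, 0.0])
smape(y_true, y_pred)  # averages over the first and last entries only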
Once we have predictions, we must correctly build the submission file from the predicted dataframe.
First, let's verify we can join the page names in keys with those in the raw file.
In [8]:
all_keys = keys['Page'].apply(lambda x: '_'.join(x.split('_')[:-1])).unique()
all_pages = df['Page'].values
In [9]:
print(len(all_keys))
print(len(all_pages))
In [10]:
print(len(np.intersect1d(all_keys, all_pages)) == len(all_keys))
print(len(np.intersect1d(all_keys, all_pages)) == len(all_pages))
So we can join on page names: a Page in keys is the corresponding df Page with an underscore and a date appended at the end. We will therefore need a function that converts the test dataframe (dates in columns) into a submission dataframe (dates in rows).
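For illustration (a hypothetical key string, not taken from the data), this is the split being assumed: everything before the last underscore is the df Page, the last token is the date.
In [ ]:
# hypothetical key, for illustration only
k = 'Some_Article_en.wikipedia.org_all-access_spider_2017-01-01'
'_'.join(k.split('_')[:-1]), k.split('_')[-1]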
In [11]:
# divide into train and test: test is the 60-day window ending 4 columns before the last date
train, test = df.iloc[:,1:-64], df.iloc[:,-64:-4]
pagenames = df['Page'].values
In [12]:
predictions = test.copy()
predictions[:] = 0
We should build a function that, given a dataframe with one row per page name and one column per timestamp, generates a dataframe with one row per Pagename_timestamp and a Visits column. This function maps our predictions dataframe into the submission format.
In [13]:
test.head()
Out[13]:
In [14]:
predictions.head()
Out[14]:
In [15]:
def meltDataframe(preds, pagenames):
    t = preds.copy()
    t['Page'] = pagenames
    t = pd.melt(t, id_vars=['Page'], var_name='date', value_name='Visits')
    t['Page'] = t['Page'] + "_" + t['date']
    return t[['Page', 'Visits']]
In [16]:
meltDataframe(test, pagenames).head()
Out[16]:
In [17]:
meltDataframe(predictions, pagenames).head()
Out[17]:
In [18]:
smape(meltDataframe(test, pagenames)['Visits'], meltDataframe(predictions, pagenames)['Visits'])
Out[18]:
In [19]:
mean_per_row = test.mean(axis=1)
mean_per_row
Out[19]:
There are NAs in there; let's fill them with 0s and convert everything to ints.
In [20]:
mean_per_row = mean_per_row.fillna(0)
mean_per_row = mean_per_row.astype(int)
In [21]:
predictions_mean = test.copy()
predictions_mean[:] = 1
predictions_mean = predictions_mean.mul(mean_per_row, axis=0)
In [22]:
smape(meltDataframe(test, pagenames)['Visits'], meltDataframe(predictions_mean, pagenames)['Visits'])
Out[22]:
Well, that is a better result; let's send it to Kaggle to test the submission process.
In [23]:
submission_cols = pd.Series(pd.date_range('1/1/2017', '3/1/2017').strftime('%Y-%m-%d'))
submission = meltDataframe(pd.DataFrame({col:mean_per_row for col in submission_cols}), pagenames)
In [24]:
submission.head()
Out[24]:
In [25]:
submission['Page'].head().values
Out[25]:
In [26]:
keys['Page'].head().values
Out[26]:
In [27]:
submission[submission['Page'] == keys['Page'].head().values[0]]
Out[27]:
In [28]:
keys[keys['Page'] == submission['Page'].head().values[0]]
Out[28]:
In [29]:
submission = submission.set_index('Page')
keys = keys.set_index('Page')
keys.head()
Out[29]:
In [30]:
submission_file = submission.join(keys, how='left', lsuffix='_sub', rsuffix='_key')[["Id", "Visits"]]
In [31]:
submission_file.describe()
Out[31]:
In [32]:
# check that the left join did not introduce any NaNs
submission_file.isnull().sum()
Out[32]:
In [33]:
submission_file.to_csv('../data/submissions/0.3_mean_row_submission.csv.gz', compression='gzip', index=False)
Next time we can try to decompose the series into trend + seasonality and predict using the trend from the same date the year before.
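A minimal sketch of that idea for a single page, assuming the statsmodels package is available and using a weekly period for the decomposition (the yearly lookup itself is left for next time):
In [ ]:
# sketch only: decompose one page's series into trend + seasonal + residual
from statsmodels.tsa.seasonal import seasonal_decompose

# first column of df is 'Page'; the remaining columns are daily visit counts
series = pd.Series(df.iloc[0, 1:].astype(float).fillna(0).values,
                   index=pd.to_datetime(df.columns[1:]))
decomposition = seasonal_decompose(series, model='additive', period=7)
trend, seasonal = decomposition.trend, decomposition.seasonal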
In [ ]: