Peeter Piksarv (piksarv .at. gmail.com)
The latest version of this Jupyter notebook is available at https://github.com/ppik/playdata/tree/master/Kaggle-Expedia
Here I'll try to test some machine learning techniques on this dataset.
In [1]:
import collections
import itertools
import operator
import random
import heapq
import matplotlib.pyplot as plt
import ml_metrics as metrics
import numpy as np
import pandas as pd
import sklearn
import sklearn.decomposition
import sklearn.linear_model
import sklearn.preprocessing
%matplotlib notebook
In [2]:
# Columns of the training file, in file order.
traincols = [
    'date_time', 'site_name', 'posa_continent',
    'user_location_country', 'user_location_region', 'user_location_city',
    'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
    'channel', 'srch_ci', 'srch_co',
    'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt',
    'srch_destination_id', 'srch_destination_type_id',
    'is_booking', 'cnt',
    'hotel_continent', 'hotel_country', 'hotel_market', 'hotel_cluster',
]

# The test file carries a row id and lacks the event/target columns.
testcols = ['id'] + [
    col for col in traincols
    if col not in ('is_booking', 'cnt', 'hotel_cluster')
]
Convenience function for reading the data in:
In [3]:
def _parse_datetime(values):
    """Convert a column of timestamp strings to datetimes (NaT on failure)."""
    parsed = pd.to_datetime(values, format='%Y-%m-%d %H:%M:%S', errors='coerce')
    # Depending on the pandas version, date-only strings may not match the
    # full timestamp format; retry just those entries with a date-only format
    # so the result does not depend on the installed pandas.
    retry = parsed.isna() & values.notna()
    if retry.any():
        parsed.loc[retry] = pd.to_datetime(
            values.loc[retry], format='%Y-%m-%d', errors='coerce')
    return parsed


def read_csv(filename, cols, nrows=None):
    """Read selected columns of a data file with compact dtypes.

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``pd.read_csv``.
    cols : list of str
        Columns to load (``usecols``).
    nrows : int, optional
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The requested columns plus derived ones, added only when their
        inputs were loaded:

        * ``month`` / ``year`` from ``date_time``;
        * ``srch_ngt`` from ``srch_ci`` and ``srch_co`` -- the stay length
          as a float number of hours (NaN when either date is missing);
        * ``is_family`` -- True when ``srch_children_cnt`` > 0.
    """
    datecols = ['date_time', 'srch_ci', 'srch_co']
    dtypes = {
        'id': np.uint32,
        'site_name': np.uint8,
        'posa_continent': np.uint8,
        'user_location_country': np.uint16,
        'user_location_region': np.uint16,
        'user_location_city': np.uint16,
        'orig_destination_distance': np.float32,
        'user_id': np.uint32,
        'is_mobile': bool,
        'is_package': bool,
        'channel': np.uint8,
        'srch_adults_cnt': np.uint8,
        'srch_children_cnt': np.uint8,
        'srch_rm_cnt': np.uint8,
        'srch_destination_id': np.uint32,
        'srch_destination_type_id': np.uint8,
        'is_booking': bool,
        'cnt': np.uint64,
        'hotel_continent': np.uint8,
        'hotel_country': np.uint16,
        'hotel_market': np.uint16,
        'hotel_cluster': np.uint8,
    }
    df = pd.read_csv(
        filename,
        nrows=nrows,
        usecols=cols,
        dtype=dtypes,
    )

    # Dates are converted after loading rather than through the deprecated
    # ``date_parser`` argument of ``pd.read_csv``.
    for col in datecols:
        if col in df.columns:
            df[col] = _parse_datetime(df[col])

    if 'date_time' in df.columns:
        df['month'] = df['date_time'].dt.month.astype(np.uint8)
        df['year'] = df['date_time'].dt.year.astype(np.uint16)

    # Both dates are required. The previous test
    # ``'srch_ci' and 'srch_co' in df.columns`` only checked 'srch_co'.
    if 'srch_ci' in df.columns and 'srch_co' in df.columns:
        # Dividing by one hour yields float hours on every pandas version.
        df['srch_ngt'] = (df['srch_co'] - df['srch_ci']) / np.timedelta64(1, 'h')

    if 'srch_children_cnt' in df.columns:
        df['is_family'] = df['srch_children_cnt'] > 0

    return df
In [4]:
# Load every row of the training file (nrows defaults to None).
train = read_csv('data/train.csv.gz', traincols)
Getting the set of all `user_id`s in the sample.
In [5]:
# Distinct users present in the loaded training data.
train_ids = set(train['user_id'].unique())
len(train_ids)
Out[5]:
Pick a subset of users for testing and validation
In [7]:
# random.sample() requires a sequence: passing a set was deprecated in
# Python 3.9 and raises TypeError from 3.11, so sample from a sorted list.
user_pool = sorted(train_ids)
# Cap the sample size so a reduced data load does not raise ValueError.
sel_user_ids = sorted(random.sample(user_pool, min(12000, len(user_pool))))
sel_train = train[train.user_id.isin(sel_user_ids)]
Create new test and training sets, using bookings from 2013 as training data and 2014 as test data.
In [8]:
# Split the sampled users' events by calendar year.
cv_train = sel_train.loc[sel_train['year'] == 2013]
cv_test = sel_train.loc[sel_train['year'] == 2014]
Remove click events from `cv_test`, as in the original test data.
In [9]:
# Keep only the booking events.
cv_test = cv_test.loc[cv_test['is_booking'] == True]
Public solutions to the competition (Dataquest tutorial by Vik Paruchuri and Leakage solution by ZFTurbo) use the most common clusters in the following groups:
- `srch_destination_id`
- `user_location_city`, `orig_destination_distance` (data leak)
- `srch_destination_id`, `hotel_country`, `hotel_market` (for year 2014)
- `srch_destination_id`
- `hotel_country`
Finding the most common overall clusters
In [10]:
# Five most frequent clusters overall, most frequent first.
most_common_clusters = cv_train['hotel_cluster'].value_counts().index[:5].tolist()
Predicting the most common clusters in groups of `srch_destination_id`, `hotel_country`, `hotel_market`.
In [11]:
# Grouping key for the lookup table. To reproduce the simpler baseline,
# set match_cols = ['srch_destination_id'] instead.
match_cols = ['srch_destination_id', 'hotel_country', 'hotel_market']
groups = cv_train.groupby(match_cols + ['hotel_cluster'])
In [12]:
# Maps each match_cols key to a {hotel_cluster: score} dictionary, where a
# booking counts 1 and a click counts 0.15.
top_clusters = {}
key_width = len(match_cols)
for group_key, rows in groups:
    n_bookings = rows['is_booking'].sum()
    n_clicks = len(rows) - n_bookings
    dest_key = group_key[:key_width]
    cluster = group_key[-1]
    top_clusters.setdefault(dest_key, {})[cluster] = n_bookings + .15*n_clicks
This dictionary has a key of (`srch_destination_id`, `hotel_country`, `hotel_market`), and each value is another dictionary, with hotel clusters as keys and scores as values.
Finding the top 5 for each destination.
In [13]:
# For every key keep the (up to) five best-scoring clusters, best first.
cluster_dict = {
    dest_key: [
        cluster
        for cluster, _score in sorted(
            scores.items(), key=lambda item: item[1], reverse=True)[:5]
    ]
    for dest_key, scores in top_clusters.items()
}
In [14]:
# Look up each test row's key in the top-cluster table and fall back to the
# globally most common clusters for unseen keys. Zipping the key columns
# avoids iterrows(), which builds (and upcasts) a whole Series per row.
key_columns = [cv_test[m] for m in match_cols]
preds = [cluster_dict.get(key, most_common_clusters) for key in zip(*key_columns)]
In [15]:
# mapk expects one list of relevant labels per row.
cv_target = [[cluster] for cluster in cv_test['hotel_cluster'].tolist()]
metrics.mapk(cv_target, preds, k=5)
Out[15]:
- `srch_destination_id`, `is_booking`: 0.212
- `srch_destination_id`, `hotel_country`, `hotel_market`: 0.214
In [ ]:
# Linear classifier trained with stochastic gradient descent on the logistic
# loss. NOTE(review): newer scikit-learn releases spell this loss 'log_loss'.
clf = sklearn.linear_model.SGDClassifier(loss='log', n_jobs=4)
Make dummy variables from categorical features. Pandas has `get_dummies()`, but currently it returns only `float64` columns, which tends to be rather memory hungry and slow. See #8725.
In [ ]:
# One boolean indicator column per destination id seen in training.
train_dest_ids = cv_train['srch_destination_id']
cv_train_data = pd.DataFrame({
    str(dest_id): train_dest_ids == dest_id
    for dest_id in train_dest_ids.unique()
})
In [ ]:
# Same indicator columns as the training matrix, evaluated on the test rows.
test_dest_ids = cv_test['srch_destination_id']
cv_test_data = pd.DataFrame({
    column: test_dest_ids == int(column)
    for column in cv_train_data.columns
})
In [ ]:
# cv_train_data['is_booking'] = cv_train['is_booking']
# cv_test_data['is_booking'] = np.ones(len(cv_test_data), dtype=bool)
In [ ]:
# Fit the classifier; the target is the hotel cluster label.
clf.fit(cv_train_data, cv_train['hotel_cluster'])
In [ ]:
# Class-probability estimates for every test row.
result = clf.predict_proba(cv_test_data)
In [ ]:
# Top-5 classes per row. The columns of `result` follow clf.classes_, so
# column positions are ranked first and then mapped back to labels. Using the
# label itself as a column index (row.take as the sort key) is only right
# when the labels happen to be exactly 0..n-1.
top_columns = np.argsort(-result, axis=1, kind='stable')[:, :5]
preds = clf.classes_[top_columns].tolist()
In [ ]:
# Mean average precision at 5 of the predictions against the true clusters.
metrics.mapk(cv_target, preds, k=5)
I would say that is not bad at all (compared to the random forest classifier in the Dataquest tutorial).
Using the destination latent features from the destination description data file.
In [ ]:
# Latent destination features, indexed by destination id for later joins.
dest = pd.read_csv('data/destinations.csv.gz', index_col='srch_destination_id')
In [ ]:
# Compress the latent columns d1..d149 to 10 principal components.
latent_columns = ['d{}'.format(i) for i in range(1, 150)]
pca = sklearn.decomposition.PCA(n_components=10)
dest_small = pd.DataFrame(pca.fit_transform(dest[latent_columns]), index=dest.index)
In [ ]:
# Destination id plus its PCA components; destinations without latent
# features receive the per-component mean.
cv_train_data = (
    cv_train[['srch_destination_id']]
    .join(dest_small, on='srch_destination_id', how='left')
    .fillna(dest_small.mean())
)
In [ ]:
# Build the test matrix exactly like the training matrix. The former
# rsuffix='dest' argument had no effect (the joined column labels never
# overlap) and only made the two cells look different.
cv_test_data = pd.DataFrame({key: cv_test[key] for key in ['srch_destination_id']})
cv_test_data = cv_test_data.join(dest_small, on=['srch_destination_id'], how='left')
cv_test_data = cv_test_data.fillna(dest_small.mean())
In [ ]:
# Start from a fresh classifier so nothing carries over from an earlier fit.
clf = sklearn.linear_model.SGDClassifier(loss='log', n_jobs=4)
# The target is the hotel cluster label of each training row.
clf.fit(cv_train_data, cv_train['hotel_cluster'])
In [ ]:
# Class-probability estimates for every test row.
result = clf.predict_proba(cv_test_data)
In [ ]:
# Top-5 classes per row. Rank the probability columns, then translate column
# positions into labels through clf.classes_. Indexing the row with the label
# itself is wrong whenever the labels are not exactly 0..n-1.
top_columns = np.argsort(-result, axis=1, kind='stable')[:, :5]
preds = clf.classes_[top_columns].tolist()
In [ ]:
# Mean average precision at 5 of the predictions against the true clusters.
metrics.mapk(cv_target, preds, k=5)
=> The destination latent features do not seem to be of much use here?!
In [16]:
# Candidate predictors, grouped by what they describe.
features = (
    # point of sale and user location
    ['site_name', 'posa_continent', 'user_location_country',
     'user_location_region', 'user_location_city']
    # search / session properties
    + ['is_mobile', 'is_package', 'channel', 'srch_adults_cnt',
       'srch_destination_id', 'srch_destination_type_id',
       'is_booking', 'cnt']
    # hotel location
    + ['hotel_continent', 'hotel_country', 'hotel_market']
    # derived columns
    + ['month', 'year', 'is_family']
)
In [17]:
def fit_features(features, train, test):
    """Fit a logistic SGD classifier on dummy-coded features; return MAP@5.

    Parameters
    ----------
    features : list of str
        Column names used as predictors. Boolean columns are used as they
        are; every other column is expanded into one indicator column per
        value observed in ``train``.
    train, test : pandas.DataFrame
        Frames containing the feature columns and ``hotel_cluster``.

    Returns
    -------
    float
        Mean average precision at 5 of the top-5 predicted clusters on
        ``test``.
    """
    # Collect the columns first and build each frame once, instead of
    # inserting columns one at a time into a growing DataFrame.
    train_cols = {}
    test_cols = {}
    for feature in features:
        if train[feature].dtype == np.dtype('bool'):
            train_cols[feature] = train[feature]
            test_cols[feature] = test[feature]
        else:
            # Values that occur only in `test` get no column of their own.
            for elem in train[feature].unique():
                name = '{}_{}'.format(feature, elem)
                train_cols[name] = train[feature] == elem
                test_cols[name] = test[feature] == elem
    train_data = pd.DataFrame(train_cols)
    test_data = pd.DataFrame(test_cols)

    # Fitting
    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss'.
    clf = sklearn.linear_model.SGDClassifier(loss='log', n_jobs=4)
    clf.fit(train_data, train['hotel_cluster'])

    # Cross-validate the fit
    result = clf.predict_proba(test_data)
    # Columns of `result` follow clf.classes_: rank column positions and map
    # them back to labels. Using the label as a column index is only correct
    # when every cluster 0..n-1 is present in the training data.
    top_columns = np.argsort(-result, axis=1, kind='stable')[:, :5]
    preds = clf.classes_[top_columns].tolist()
    target = [[l] for l in test['hotel_cluster']]
    return metrics.mapk(target, preds, k=5)
In [20]:
# Score every feature on its own.
cv_results = {}
for single_feature in features:
    score = fit_features([single_feature], cv_train, cv_test)
    cv_results[single_feature] = score
    print('{}: {}'.format(single_feature, score))
In [23]:
# Features ranked by score, best first.
sorted(cv_results.items(), key=lambda item: item[1], reverse=True)
Out[23]:
The best single predictor of a hotel cluster seems to be `hotel_market`.
In [25]:
# Pair the best single feature with each remaining candidate.
features2 = [['hotel_market', f] for f in features if f != 'hotel_market']
In [34]:
# Score every two-feature combination.
cv_results2 = {}
for feature_set in features2:
    score = fit_features(feature_set, cv_train, cv_test)
    cv_results2[tuple(feature_set)] = score
    print('{}: {}'.format(feature_set, score))
In [42]:
# Three best-scoring combinations, best first.
heapq.nlargest(3, cv_results2.items(), key=operator.itemgetter(1))
In [18]:
# Extend the two selected features with each remaining candidate.
features3 = [
    ['hotel_market', 'srch_destination_id', f]
    for f in features
    if f not in ('hotel_market', 'srch_destination_id')
]
In [19]:
# Score every three-feature combination.
cv_results3 = {}
for feature_set in features3:
    score = fit_features(feature_set, cv_train, cv_test)
    cv_results3[tuple(feature_set)] = score
    print('{}: {}'.format(feature_set, score))
In [41]:
# Three best-scoring combinations, best first.
heapq.nlargest(3, cv_results3.items(), key=operator.itemgetter(1))
Out[41]:
In [21]:
# Extend the three selected features with each remaining candidate.
features4 = [
    ['hotel_market', 'srch_destination_id', 'hotel_country', f]
    for f in features
    if f not in ('hotel_market', 'srch_destination_id', 'hotel_country')
]
In [24]:
# Score every four-feature combination.
cv_results4 = {}
for feature_set in features4:
    score = fit_features(feature_set, cv_train, cv_test)
    cv_results4[tuple(feature_set)] = score
    print('{}: {}'.format(feature_set, score))
In [40]:
# Three best-scoring combinations, best first.
heapq.nlargest(3, cv_results4.items(), key=operator.itemgetter(1))
Out[40]:
In [27]:
# Features selected so far; each candidate set adds one more feature.
sel_features = [
    'hotel_market', 'srch_destination_id', 'hotel_country', 'is_package',
]
features5 = [
    sel_features + [extra] for extra in features if extra not in sel_features
]
In [29]:
# Score every five-feature combination.
cv_results5 = {}
for feature_set in features5:
    score = fit_features(feature_set, cv_train, cv_test)
    cv_results5[tuple(feature_set)] = score
    print('{}: {}'.format(feature_set, score))
In [39]:
# Three best-scoring combinations, best first.
heapq.nlargest(3, cv_results5.items(), key=operator.itemgetter(1))
Out[39]:
In [31]:
# Features selected so far; each candidate set adds one more feature.
sel_features = [
    'hotel_market', 'srch_destination_id', 'hotel_country',
    'is_package', 'is_booking',
]
features6 = [
    sel_features + [extra] for extra in features if extra not in sel_features
]
In [32]:
# Score every six-feature combination.
cv_results6 = {}
for feature_set in features6:
    score = fit_features(feature_set, cv_train, cv_test)
    cv_results6[tuple(feature_set)] = score
    print('{}: {}'.format(feature_set, score))
In [38]:
# Three best-scoring combinations, best first.
heapq.nlargest(3, cv_results6.items(), key=operator.itemgetter(1))
Out[38]:
In [34]:
# Features selected so far; each candidate set adds one more feature.
sel_features = [
    'hotel_market', 'srch_destination_id', 'hotel_country',
    'is_package', 'is_booking', 'posa_continent',
]
features7 = [
    sel_features + [extra] for extra in features if extra not in sel_features
]
In [35]:
# Score every seven-feature combination.
cv_results7 = {}
for feature_set in features7:
    score = fit_features(feature_set, cv_train, cv_test)
    cv_results7[tuple(feature_set)] = score
    print('{}: {}'.format(feature_set, score))
In [37]:
# Three best-scoring combinations, best first.
heapq.nlargest(3, cv_results7.items(), key=operator.itemgetter(1))
Out[37]:
In [ ]:
-- Peeter Piksarv