In [1]:
from pandas import Series, DataFrame
import pandas as pd
%pylab inline
In [2]:
# Read the three input CSV files from the current working directory,
# in the order destinations -> test -> train.
destinations, test, train = (
    pd.read_csv(csv_name)
    for csv_name in ("destinations.csv", "test.csv", "train.csv")
)
Convert `date_time` to a datetime type so that we can separate the train and test sets by time, because the dates in the test set have to come later than the dates in the train set.
In [ ]:
# Parse the timestamp column once, store it back, and derive the calendar
# year and month columns that the time-based split below filters on.
parsed_dates = pd.to_datetime(train["date_time"])
train["date_time"] = parsed_dates
train["year"], train["month"] = parsed_dates.dt.year, parsed_dates.dt.month
Pick the rows of 10,000 randomly selected users as our training data set.
In [ ]:
import random

# Fixed seed so the sampled users -- and every frame derived from them --
# are identical on each Restart & Run All.
SAMPLE_SEED = 42
# Upper bound on how many distinct users to keep.
N_SAMPLE_USERS = 10000

unique_users = train.user_id.unique()

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of distinct users available.
n_users_to_pick = min(N_SAMPLE_USERS, len(unique_users))

# A dedicated Random instance keeps the draw reproducible without touching
# the global random state.
user_sampler = random.Random(SAMPLE_SEED)
picked_positions = sorted(user_sampler.sample(range(len(unique_users)), n_users_to_pick))

# Sorting the positions keeps the ids in the same order as unique_users.
sel_user_ids = [unique_users[i] for i in picked_positions]

# Keep every row that belongs to one of the sampled users.
sel_train = train[train.user_id.isin(sel_user_ids)]
In [ ]:
# Time-based split of the sampled rows:
#   t1 -> all of 2013 plus 2014 months before August
#   t2 -> 2014 from August onwards
in_2013 = sel_train.year == 2013
in_2014 = sel_train.year == 2014
before_august = sel_train.month < 8
august_or_later = sel_train.month >= 8

t1 = sel_train[in_2013 | (in_2014 & before_august)]
t2 = sel_train[in_2014 & august_or_later]
In [ ]:
# Drop the rows of t2 whose is_booking value does not equal True,
# leaving only actual bookings.
booked_rows = t2.is_booking.eq(True)
t2 = t2[booked_rows]
Simple prediction: use the 5 most common clusters as the prediction for every row in the test set.
In [ ]:
# Show the first ten rows of t2.
t2.head(10)
In [ ]:
# value_counts() orders clusters from most to least frequent, so the first
# five index labels are the five most common hotel clusters.
# NOTE(review): this counts over the full `train` frame, which also contains
# the rows held out as t2 -- confirm whether t1 should be used instead.
cluster_counts = train["hotel_cluster"].value_counts()
most_common_clusters = list(cluster_counts.index[:5])
In [ ]:
# One entry per row of t2; every entry is the same most_common_clusters list
# object (shared reference, not a copy).
predictions = [most_common_clusters] * len(t2)
In [ ]:
from sklearn.decomposition import PCA

# Columns d1 .. d149 of the destinations frame are the PCA inputs.
feature_columns = ["d{0}".format(n) for n in range(1, 150)]

# Reduce those 149 columns to 3 principal components.
pca = PCA(n_components=3)
dest_small = pd.DataFrame(pca.fit_transform(destinations[feature_columns]))

# Carry the destination id across so dest_small can be matched back by id.
dest_small["srch_destination_id"] = destinations["srch_destination_id"]
In [ ]: