In [4]:
from IPython.core import display
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
%matplotlib inline
In [17]:
def readCsv(name, nrows=6000000, sample_size=100000, random_state=None):
    """Load an Expedia CSV and return a random sample of 2014 bookings with date features.

    Parameters
    ----------
    name : str
        Base name of the file under ``data/expedia/`` (without the ``.csv`` suffix).
    nrows : int, optional
        Maximum number of rows read from the file.
    sample_size : int, optional
        Upper bound on the number of rows returned. When fewer rows survive
        the booking/year filter, all of them are returned instead of raising.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample``; ``None`` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        Rows with ``is_booking`` set and ``date_time`` in 2014, extended with
        ``year``, ``month``, ``hour``, ``dayofweek`` (from ``date_time``),
        ``srch_ci_month``, ``srch_ci_dayofweek``, ``srch_co_month``,
        ``srch_co_dayofweek`` and ``srch_days`` (check-out minus check-in, in days).
    """
    df = pd.read_csv("data/expedia/{}.csv".format(name), nrows=nrows)
    df["date_time"] = pd.to_datetime(df["date_time"])
    # errors="coerce" turns missing or unparseable stay dates into NaT so the
    # feature extraction below yields NaN for them instead of failing.
    for stay_col in ("srch_ci", "srch_co"):
        df[stay_col] = pd.to_datetime(df[stay_col], errors="coerce")

    bookings = df[(df["is_booking"] == 1) & (df["date_time"].dt.year == 2014)]
    # Never ask for more rows than exist: sample() raises in that case.
    n_keep = min(sample_size, len(bookings))
    if n_keep:
        bookings = bookings.sample(n_keep, random_state=random_state)
    else:
        bookings = bookings.copy()

    booked_at = bookings["date_time"].dt
    bookings["year"] = booked_at.year
    bookings["month"] = booked_at.month
    bookings["hour"] = booked_at.hour
    bookings["dayofweek"] = booked_at.dayofweek

    check_in = bookings["srch_ci"].dt
    bookings["srch_ci_month"] = check_in.month
    bookings["srch_ci_dayofweek"] = check_in.dayofweek

    check_out = bookings["srch_co"].dt
    bookings["srch_co_month"] = check_out.month
    bookings["srch_co_dayofweek"] = check_out.dayofweek

    # Timedelta arithmetic propagates NaT as NaN; converting NaT to epoch
    # integers would produce meaningless day counts.
    bookings["srch_days"] = (bookings["srch_co"] - bookings["srch_ci"]).dt.days
    return bookings
# The readCsv pipeline is left commented out; the cell reads a separate CSV instead.
# NOTE(review): presumably 2014_bookings.csv is an earlier export of readCsv("train"),
# since later cells rely on derived columns such as "year" and "srch_days" -- confirm.
# No parse_dates is passed here, so the date columns are loaded as plain strings.
#df = readCsv("train")
df = pd.read_csv("data/expedia/2014_bookings.csv")
print("shape: ", df.shape)
# Show five random rows as the cell output.
df.sample(5)
Out[17]:
train.csv - the training set
test.csv - the test set
destinations.csv - hotel search latent attributes
sample_submission.csv - a sample submission file in the correct format
train/test.csv
Column name Description Data type
date_time Timestamp string
site_name ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...) int
posa_continent ID of continent associated with site_name int
user_location_country The ID of the country where the customer is located int
user_location_region The ID of the region where the customer is located int
user_location_city The ID of the city where the customer is located int
orig_destination_distance Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated double
user_id ID of user int
is_mobile 1 when a user connected from a mobile device, 0 otherwise tinyint
is_package 1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise int
channel ID of a marketing channel int
srch_ci Checkin date string
srch_co Checkout date string
srch_adults_cnt The number of adults specified in the hotel room int
srch_children_cnt The number of (extra occupancy) children specified in the hotel room int
srch_rm_cnt The number of hotel rooms specified in the search int
srch_destination_id ID of the destination where the hotel search was performed int
srch_destination_type_id Type of destination int
hotel_continent Hotel continent int
hotel_country Hotel country int
hotel_market Hotel market int
is_booking 1 if a booking, 0 if a click tinyint
cnt Number of similar events in the context of the same user session bigint
hotel_cluster ID of a hotel cluster int
destinations.csv
Column name Description Data type
srch_destination_id ID of the destination where the hotel search was performed int
d1-d149 latent description of search regions double
In [18]:
# Print column dtypes and non-null counts for the frame loaded above.
df.info()
In [19]:
# Summary statistics of the numeric columns (displayed as the cell output).
df.describe()
Out[19]:
In [20]:
# Keep only rows whose stay length (srch_days) lies strictly between -1 and 18.
# The function-call form of print with a pre-formatted string emits the same
# text on Python 2 and Python 3.
print("before removing outliers : {}".format(df.shape))
df = df[(df["srch_days"] < 18) & (df["srch_days"] > -1)]
print("after removing outliers : {}".format(df.shape))
In [21]:
def removeMisingvalues(df, columns=("orig_destination_distance", "srch_ci")):
    """Return the rows of ``df`` that have a value in every one of ``columns``.

    Only the listed columns are inspected; missing values in other columns do
    not cause a row to be dropped. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    columns : sequence of str, optional
        Columns that must be non-null. Defaults to the two columns this
        notebook has always checked.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) with no nulls in ``columns``.
    """
    has_all_values = df[list(columns)].notnull().all(axis=1)
    return df[has_all_values]
# Drop rows with missing values and report the frame shape before and after.
# Pre-formatting the message keeps the output identical on Python 2 and Python 3.
print("before removing missing values : {}".format(df.shape))
df = removeMisingvalues(df)
print("after removing missing values : {}".format(df.shape))
In [22]:
# Re-inspect dtypes and non-null counts after the filtering cells above.
df.info()
In [23]:
def exploreUnivariate(column):
    """Print summary statistics for one column and plot its distribution.

    Parameters
    ----------
    column : pandas.Series
        Column to describe; its ``name`` is used in the printed header.

    A column that seaborn cannot plot is reported together with the reason
    instead of aborting the surrounding loop.
    """
    print("\n-------------------------\nColumn: {}\n".format(column.name))
    print(column.describe())
    try:
        # NOTE(review): distplot is deprecated in recent seaborn releases -- confirm
        # the installed version before replacing it.
        sns.distplot(column)
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt still
        # stops a long plotting loop, and say why the plot failed.
        print("cannot be plotted ({}: {})".format(type(exc).__name__, exc))
    plt.show()
#exploreUnivariate(df["srch_days"])
# exploreUnivariate(np.log(df["srch_days"])) # less skewness
In [24]:
# Run the univariate exploration on every column except the raw timestamp.
for column_name in df.columns.drop("date_time"):
    exploreUnivariate(df[column_name])
In [11]:
# @TODO: plot bivariate charts
In [25]:
# One strip plot per destination country: user country against hotel cluster,
# restricted to rows whose "year" column is 2014.
for country_id in (19, 20, 21):
    country_rows = df[(df["hotel_country"] == country_id) & (df["year"] == 2014)]
    sns.stripplot(
        x="user_location_country",
        y="hotel_cluster",
        jitter=True,
        data=country_rows,
    )
    plt.show()
In [34]:
# Columns plotted against hotel_cluster, one FacetGrid per column.
cols = [u'site_name', u'posa_continent', u'user_location_country',
        u'channel', u'srch_adults_cnt',
        u'srch_children_cnt', u'srch_rm_cnt', u'srch_destination_id',
        u'is_booking', u'cnt', u'hotel_continent',
        u'hotel_country', u'hotel_market', u'hotel_cluster', u'year', u'month',
        u'hour', u'dayofweek', u'srch_ci_month', u'srch_ci_dayofweek',
        u'srch_co_month', u'srch_co_dayofweek', u'srch_days']

# The row filter does not depend on the plotted column, so it is built once
# instead of being recomputed on every iteration.
facet_rows = df[
    df["hotel_country"].isin([19, 20, 21, 22])
    & (df["year"] == 2014)
    & (df["srch_co_month"] == 8)
]

for i in cols:
    # One panel per hotel_country, each showing column `i` against hotel_cluster.
    g = sns.FacetGrid(facet_rows, col="hotel_country")
    g.map(sns.stripplot, i, "hotel_cluster", jitter=True)
    plt.show()
In [29]:
# Hotel market against hotel cluster for destination country 48 only.
country_48_rows = df[df["hotel_country"] == 48]
sns.stripplot(data=country_48_rows, x="hotel_market", y="hotel_cluster", jitter=True)
Out[29]:
In [32]:
def biVariate(df, cols=None, target="hotel_cluster"):
    """Draw a strip plot and a box plot of ``target`` against each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must contain ``target`` and every name in ``cols``.
    cols : sequence of str, optional
        Columns placed on the x axis. Defaults to the notebook's standard
        list of search, destination and date features.
    target : str, optional
        Column placed on the y axis.
    """
    if cols is None:
        cols = [
            "hotel_continent", "hotel_country",
            "srch_adults_cnt", "srch_rm_cnt", "srch_children_cnt",
            "srch_destination_type_id", "srch_destination_id",
            "month", "year", "hour", "dayofweek", "srch_ci_month",
            "srch_ci_dayofweek", "srch_co_month", "srch_co_dayofweek", "srch_days"]
    for col in cols:
        sns.stripplot(x=col, y=target, data=df, jitter=True)
        plt.show()
        sns.boxplot(x=col, y=target, data=df)
        plt.show()


# Cap the sample at the frame size: sample() raises when asked for more rows
# than the frame holds.
biVariate(df.sample(min(len(df), 50000)))
In [21]:
# Strip plot of hotel_market against hotel_country over the whole frame.
sns.stripplot(x="hotel_country", y="hotel_market", data=df, jitter=True)
Out[21]:
In [14]:
# Importing Axes3D registers the '3d' projection on older Matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
# Passing keyword arguments to fig.gca() was deprecated in Matplotlib 3.4;
# add_subplot accepts `projection` on old and new releases alike.
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel("hotel_cluster")
ax.set_ylabel("hotel_country")
ax.set_zlabel("srch_days")
ax.scatter(df["hotel_cluster"], df["hotel_country"], df["srch_days"], c="b")
Out[14]:
In [53]:
# Check dtypes and non-null counts once more before the modelling cells.
df.info()
In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
In [11]:
# Baseline: random forest predicting hotel_cluster from a handful of features.
# The classifier is seeded as well as the split; without it the reported score
# changes on every run.
clf = RandomForestClassifier(n_estimators=10, random_state=0)
a = df[[
    "hotel_continent",
    "hotel_market",
    "user_location_country",
    "srch_destination_type_id",
    "hotel_country",
    "is_package",
    "srch_ci_month",
    "srch_days",
    "hotel_cluster"
]]
X = a.drop(labels=["hotel_cluster"], axis=1)
y = a["hotel_cluster"]
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20 in
# favour of sklearn.model_selection; this cell needs the older API.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out 20% (displayed as the cell output).
clf.score(X_test, y_test)
Out[11]:
In [ ]:
from sklearn.svm import SVC

# Same feature set and split as the random-forest cell, fitted with an RBF SVM.
a = df[[
    "hotel_continent", "hotel_market", "user_location_country",
    "srch_destination_type_id", "hotel_country", "is_package",
    "srch_ci_month", "srch_days",
    "hotel_cluster",
]]
y = a["hotel_cluster"]
X = a.drop(labels=["hotel_cluster"], axis=1)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, random_state=0, test_size=0.2)

clf = SVC(gamma=0.002, kernel="rbf")
clf.fit(X_train, y_train)
# Mean accuracy on the held-out split.
clf.score(X_test, y_test)
In [2]:
# Load the destinations file (per the data description: latent attributes d1-d149
# keyed by srch_destination_id).
dest = pd.read_csv("data/expedia/destinations.csv")
In [4]:
# Show five random destination rows as the cell output.
dest.sample(5)
Out[4]:
In [11]:
import pymongo
from pprint import pprint as pp
from datetime import datetime
In [60]:
# Handle to the "expedia2" database on the MongoDB server at 192.168.5.5.
# NOTE(review): the host is hard-coded; consider moving it to configuration.
db = pymongo.MongoClient("192.168.5.5")["expedia2"]
In [70]:
# Stream train.csv into the "train2" collection in batches, keeping only
# bookings that have a distance and both stay dates. Values are stored as the
# raw strings read from the file; no date parsing or numeric conversion is applied.
i = 0  # number of data rows read so far, including rows that are skipped
with open("data/expedia/train.csv") as f:
    cache = []
    # Column names from the header line; they become the keys of each document.
    headers = f.readline().strip().split(",")
    is_booking_idx = headers.index("is_booking")
    strptime_format1 = "%Y-%m-%d %H:%M:%S"
    strptime_format2 = "%Y-%m-%d"

    def toInt(line):
        """Convert every non-date field of a row dict to float, in place.

        Not called by the loop below. Despite its name it produces floats.
        """
        for h in headers:
            if h not in ["date_time", "srch_ci", "srch_co"]:
                line[h] = float(line[h])
        return line

    for line in f:
        i += 1
        line = line.strip().split(",")
        # Fields are strings, so compare against "0"; comparing with the
        # integer 0 never matches and would let click rows through.
        if line[is_booking_idx] == "0":
            continue
        line = dict(zip(headers, line))
        if line["orig_destination_distance"] == "":
            continue
        if line["srch_ci"] == "" or line["srch_co"] == "":
            continue
        cache.append(line)
        if len(cache) == 10000:
            db["train2"].insert_many(cache)
            cache = []
        if i % 40000 == 0:
            print(i)

    # Flush the final partial batch; insert_many rejects an empty list, hence the guard.
    if cache:
        db["train2"].insert_many(cache)
        cache = []
In [71]:
# Display the column names read from the train.csv header line.
headers
Out[71]:
In [ ]: