In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from datetime import datetime
import os
%matplotlib inline
%config InlineBackend.figure_format = 'png'
pd.set_option("max_columns",50)
In [3]:
%%time
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
train = train[train["is_booking"] == 1]
np.random.seed(402)
train = train.ix[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)
In [4]:
# Columns carried into feature engineering; hotel_cluster is the prediction target.
use_col = [
    "srch_co",
    "srch_ci",
    "user_location_region",
    "hotel_market",
    "srch_destination_id",
    "hotel_country",
    "srch_adults_cnt",
    "srch_children_cnt",
    "hotel_cluster",
]
In [5]:
# Prediction target, kept as a one-column DataFrame (not a Series).
train_y = train.loc[:, ["hotel_cluster"]]
In [6]:
# Feature frame. FIX: take an explicit copy -- later cells assign new columns on
# train_x, and a plain slice of `train` triggers SettingWithCopyWarning /
# ambiguous chained assignment in pandas.
# NOTE(review): use_col still contains hotel_cluster here; it is dropped later
# when the final feature columns are selected.
train_x = train[use_col].copy()
In [8]:
# Parse check-in / check-out strings; unparseable values become NaT instead of raising.
for date_col in ("srch_ci", "srch_co"):
    train_x[date_col] = pd.to_datetime(train_x[date_col], errors="coerce")
In [13]:
# Length of stay as a timedelta: checkout minus check-in (NaT where either is missing).
train_x["period"] = train_x["srch_co"] - train_x["srch_ci"]
In [23]:
# Convert the stay length to whole days.
# FIX: coerced dates leave NaT -> NaN after the timedelta division, and
# astype(int) raises on NaN; fill with 0 first, exactly as the test-set
# pipeline below (In [140]) already does.
train_x["period"] = (train_x["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
In [26]:
# The raw date columns are redundant once `period` exists.
train_x = train_x.drop(columns=["srch_co", "srch_ci"])
In [29]:
# Distribution of party sizes (number of adults per booking).
train_x["srch_adults_cnt"].value_counts()
Out[29]:
In [34]:
# Sanity check of the engineered feature frame.
train_x.tail()
Out[34]:
In [36]:
# Helper column of ones so pivot_table's sum below yields row counts.
train_x["num"] = 1
In [41]:
# Cross-tab of booking counts: adults vs. children per search party.
pd.pivot_table(
    train_x,
    values="num",
    index=["srch_adults_cnt"],
    columns=["srch_children_cnt"],
    aggfunc="sum",
)
Out[41]:
In [43]:
# Bucket party size: 3+ adults ~ family trip; 1 = solo; 2 = ambiguous (couple?).
# FIX: vectorized clip replaces the per-row lambda -- identical result, faster.
train_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].clip(upper=3)
In [45]:
# Drop the pivot helper plus the children count (folded into the adults bucket).
train_x = train_x.drop(columns=["num", "srch_children_cnt"])
In [60]:
# Re-inspect after dropping the helper and children-count columns.
train_x.tail()
Out[60]:
In [61]:
# Cardinality check: how concentrated searches are per destination id.
train_x["srch_destination_id"].value_counts()
Out[61]:
In [65]:
# Helper column of ones (again) so the pivots below count rows.
train_x["num"] = 1
In [70]:
# Booking counts by party-size bucket vs. length of stay.
pd.pivot_table(
    train_x,
    values="num",
    index=["srch_adults_cnt"],
    columns=["period"],
    aggfunc="sum",
)
Out[70]:
In [75]:
# Heatmap of booking counts by party-size bucket vs. stay length.
# NOTE(review): no title/axis labels are set; the figure relies on the pivot's names.
sns.heatmap(pd.pivot_table(train_x, values='num', index=['srch_adults_cnt'], columns=['period'], aggfunc=np.sum))
Out[75]:
In [77]:
# Remove the helper column once the pivots are done.
train_x = train_x.drop(columns=["num"])
In [93]:
# Fix the feature column order used for modeling (must match the test pipeline).
train_x = train_x.loc[:, ["hotel_market", "srch_destination_id", "hotel_country",
                          "srch_adults_cnt", "period", "user_location_region"]]
In [91]:
# Target labels (hotel_cluster), row-aligned with train_x.
train_y.head()
Out[91]:
In [97]:
%%time
use_col = ["srch_co","srch_ci","user_location_region",\
"hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]
test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)
test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))
model.fit(train,train_y)
preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))
print("save file")
result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)
In [96]:
# NOTE(review): displays the entire test frame; test.head() would keep output small.
test
Out[96]:
In [140]:
%%time
use_col = ["srch_co","srch_ci","user_location_region",\
"hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]
test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)
test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))
model.fit(train_x,train_y)
preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))
print("save file")
result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)
In [141]:
# NOTE(review): displays the entire result frame; result_df.head() would suffice.
result_df
Out[141]:
In [ ]:
%%time
use_col = ["srch_co","srch_ci","user_location_region",\
"hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]
test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)
test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))
model.fit(train_x,train_y)
preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))
print("save file")
result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)
In [148]:
# Recombine features and target side-by-side (same row index) for export.
train_sum = pd.concat([train_x,train_y], axis=1)
In [149]:
# Persist the preprocessed train/test sets to the working directory for reuse.
# NOTE(review): the integer row index is written as an extra CSV column.
train_sum.to_csv("train_data.csv")
test.to_csv("test_data.csv")
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [1]:
# Consolidated: the whole pipeline collected into a single cell below.
In [ ]:
%%time
print('preprocessing train_data')
use_col = ["srch_co","srch_ci","user_location_region",\
"hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt","hotel_cluster"]
train_y = train[["hotel_cluster"]]
train_x = train[use_col]
train_x["srch_ci"] = pd.to_datetime(train_x["srch_ci"], errors="coerce")
train_x["srch_co"] = pd.to_datetime(train_x["srch_co"], errors="coerce")
train_x["period"] = train_x["srch_co"] - train_x["srch_ci"]
train_x["period"] = (train_x["period"] / np.timedelta64(1, 'D')).astype(int)
train_x = train_x.drop(["srch_co","srch_ci"], axis=1)
train_x["srch_adults_cnt"] = train_x["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
train_x = train_x.drop(["srch_children_cnt"], axis=1)
train_x = train_x[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]
use_col = ["srch_co","srch_ci","user_location_region",\
"hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","srch_children_cnt"]
print("read the test.csv")
test = pd.read_csv("../data/test.csv")
test = test[use_col]
print("preprocessing test_data")
test["srch_ci"] = pd.to_datetime(test["srch_ci"], errors="coerce")
test["srch_co"] = pd.to_datetime(test["srch_co"], errors="coerce")
test["period"] = test["srch_co"] - test["srch_ci"]
test["period"] = (test["period"] / np.timedelta64(1, 'D')).fillna(0.0).astype(int)
test = test.drop(["srch_co","srch_ci"], axis=1)
test["num"] = 1
test["srch_adults_cnt"] = test["srch_adults_cnt"].apply(lambda x: 3 if x>=3 else x)
test = test.drop(["num","srch_children_cnt"], axis=1)
test = test[["hotel_market","srch_destination_id","hotel_country","srch_adults_cnt","period","user_location_region"]]
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))
model.fit(train_x,train_y)
preds = model.predict_proba(test)
preds = np.fliplr(np.argsort(preds, axis=1))
print("save file")
result_df = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])
result_df.index.names = ["id"]
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
result_df.to_csv(os.path.join('../output',file_name), index=True)
In [8]:
# FIX: .agg was referenced but never CALLED, so the cell displayed the bound
# method object instead of any aggregation. Aggregate adult counts per
# children-count group (mean party size and group size).
train[["srch_children_cnt","srch_adults_cnt"]].groupby(["srch_children_cnt"]).agg(["mean", "count"])
Out[8]:
In [10]:
# Baseline model: most frequent hotel_cluster per srch_destination_id,
# streamed row-by-row from the full CSV (no pandas, low memory).
from csv import DictReader
from collections import defaultdict
from datetime import datetime
start = datetime.now()  # wall-clock reference for the progress log lines below
def get_top5(d):
    """Return up to 5 keys of `d` with the highest values, space-joined, best first."""
    ranked = sorted(d, key=d.get, reverse=True)
    return " ".join(ranked[:5])
# Count hotel_cluster occurrences per destination id over the full training file.
destination_clusters = defaultdict(lambda: defaultdict(int))
# FIX: open() handles were never closed; use context managers throughout.
with open("../data/train.csv") as trainfile:
    for i, row in enumerate(DictReader(trainfile)):
        destination_clusters[row["srch_destination_id"]][row["hotel_cluster"]] += 1
        if i % 1000000 == 0:
            print("%s\t%s"%(i, datetime.now() - start))
# Precompute the top-5 string per destination; defaultdict(str) makes unseen
# destinations map to "" (an empty prediction) instead of raising KeyError.
most_frequent = defaultdict(str)
for k in destination_clusters:
    most_frequent[k] = get_top5(destination_clusters[k])
# Write the submission: one top-5 prediction line per test row.
with open("pred_sub.csv", "w") as outfile:
    outfile.write("id,hotel_cluster\n")
    with open("../data/test.csv") as testfile:
        for i, row in enumerate(DictReader(testfile)):
            outfile.write("%d,%s\n"%(i,most_frequent[row["srch_destination_id"]]))
            if i % 1000000 == 0:
                print("%s\t%s"%(i, datetime.now() - start))
In [ ]: