In [1]:
    
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from datetime import datetime
import os
%matplotlib inline
%config InlineBackend.figure_format = 'png'
# Show up to 50 columns when a wide frame is displayed. The fully qualified
# option name is used because the bare "max_columns" pattern can match more
# than one option on newer pandas releases, which raises OptionError.
pd.set_option("display.max_columns", 50)
    
In [2]:
    
%%time
# Read the 2013 training extract. The first CSV column is used as the index
# and then immediately discarded in favour of a fresh 0..n-1 range.
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)

# Draw a reproducible 50,000-row subsample.
# `.loc` replaces the `.ix` indexer, which no longer exists in pandas >= 1.0;
# the values drawn from train.index are index labels, so `.loc` is the match.
# NOTE: np.random.choice samples WITH replacement by default, so the
# subsample can contain repeated rows.
np.random.seed(402)
train = train.loc[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)

# Read the test set and keep its id column in its own variable.
test = pd.read_csv("../data/test.csv")
test_id = test["id"]
    
    
In [40]:
    
# use_col = ["is_booking", "user_id", "date_time","user_location_country","orig_destination_distance", "srch_co","srch_ci","user_location_region",\
#                "hotel_market","srch_destination_id","hotel_cluster"]
    
In [61]:
    
# Display the column labels of the training frame.
train.columns
    
    Out[61]:
In [62]:
    
test.columns # an id column appears, and hotel_cluster / is_booking / cnt are gone
    
    Out[62]:
In [3]:
    
# Columns selected from both the train and the test frame.
use_col2 = [
    "user_id",
    "date_time",
    "user_location_country",
    "orig_destination_distance",
    "srch_co",
    "srch_ci",
    "user_location_region",
    "hotel_market",
    "srch_destination_id",
]
    
In [4]:
    
# Keep the target as a single-column frame, then narrow both frames to the
# selected feature columns. `.copy()` makes each result an independent frame,
# so assigning to its columns afterwards does not raise
# SettingWithCopyWarning.
train_y = train[["hotel_cluster"]]
train = train[use_col2].copy()
test = test[use_col2].copy()
    
In [88]:
    
    
In [5]:
    
%%time
# Turn the three date-like columns into integer codes and fill missing
# distances with 0.
#
# Fix: the encoder used to be re-fit separately on train and on test, so the
# same calendar date could receive different codes in the two frames and the
# model would be applied to features that did not line up with what it was
# trained on. Each column is now fit once on the union of train and test.
DATE_COLS = ["date_time", "srch_ci", "srch_co"]
_EPOCH = pd.Timestamp("1970-01-01")


def _to_day_number(values):
    """Return whole days since 1970-01-01 per entry, or -1 where unparseable.

    Entries that cannot be parsed become NaT under ``errors="coerce"``; they
    are mapped to the sentinel -1 so the encoder only ever sees integers.
    Taking whole days discards any time-of-day component.
    """
    parsed = pd.to_datetime(values, errors="coerce")
    return (parsed - _EPOCH).dt.days.fillna(-1).astype("int64")


le = preprocessing.LabelEncoder()
for col in DATE_COLS:
    train_days = _to_day_number(train[col])
    test_days = _to_day_number(test[col])
    # One shared fit per column keeps the train and test codes aligned.
    le.fit(pd.concat([train_days, test_days], ignore_index=True))
    train[col] = le.transform(train_days)
    test[col] = le.transform(test_days)

# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection, which may act on a temporary object.
train["orig_destination_distance"] = train["orig_destination_distance"].fillna(0)
test["orig_destination_distance"] = test["orig_destination_distance"].fillna(0)
    
    
    
In [12]:
    
# Random forest of 10 trees, each limited to depth 7, with a fixed random_state.
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=7,
    n_jobs=-1,
    random_state=777,
)
    
In [16]:
    
%%time
print('='*50)
print('# Test shape : {}'.format(test.shape))

# train_y is a single-column frame; flatten it to the 1-D target fit() expects.
model.fit(train, train_y.values.ravel())

# Rank the classes for every test row from most to least probable.
# The columns of predict_proba follow model.classes_, so the sorted column
# positions are mapped back through classes_ to obtain the actual labels.
# The raw positions equal the labels only when the labels seen during fit
# happen to be exactly 0..k-1.
preds = model.predict_proba(test)
preds = model.classes_[np.fliplr(np.argsort(preds, axis=1))]
    
    
In [32]:
    
# Display the first five columns of the ranking for every row.
preds[:,:5]
    
    Out[32]:
In [49]:
    
# One space-separated string per row, built from the first five ranked entries.
result_df = pd.DataFrame(list(map(" ".join, preds[:, :5].astype(str))))
    
In [57]:
    
# Name the single column "hotel_cluster" and convert the index labels to str.
result_df = result_df.rename(
    columns={0: "hotel_cluster"},
    index=str,
)
    
In [72]:
    
# Move the current index into a regular column, call that column "id", and
# convert the new index labels to str.
result_df = result_df.reset_index().rename(
    columns={"index": "id"},
    index=str,
)
    
In [81]:
    
# Reload a saved submission file, using its "id" column as the index and
# dropping the "Unnamed: 0" column.
# NOTE(review): no visible cell writes "201702061420.csv"; presumably it was
# saved from result_df by a cell that is no longer here. Confirm before
# re-running from a fresh kernel.
result_df1 = pd.read_csv("201702061420.csv", index_col="id").drop(["Unnamed: 0"], axis=1)
    
In [83]:
    
# Write the reloaded frame back out under a new file name.
result_df1.to_csv("201702061422.csv")
    
In [86]:
    
# Display the last rows of the reloaded frame.
result_df1.tail()
    
    Out[86]:
Public leaderboard score for this submission: 0.14201
In [110]:
    
print("="*20)
# Aliases for the current training features and target.
trn_x1 = train
trn_y1 = train_y

# Importance of each feature in the fitted forest, plus its standard
# deviation across the individual trees (used for the error bars).
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # most important feature first
n_features = trn_x1.shape[1]

print("Feature ranking:")
for rank, col_pos in enumerate(indices[:n_features], start=1):
    print("%d. feature %d %s (%f)" % (rank, col_pos, trn_x1.columns[col_pos], importances[col_pos]))

# Bars are labelled with feature names instead of bare column positions so
# the figure can be read without the printed ranking next to it.
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), trn_x1.columns[indices], rotation=45, ha="right")
plt.xlim([-1, n_features])
plt.xlabel("feature")
plt.ylabel("importance")
plt.show()
    
In [ ]:
    
    
In [2]:
    
%%time
# --- Load the training extract and draw a reproducible subsample -----------
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
np.random.seed(402)
# `.loc` replaces the `.ix` indexer, which no longer exists in pandas >= 1.0.
# NOTE: np.random.choice samples WITH replacement by default, so rows can
# repeat in the subsample.
train = train.loc[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)
# Message corrected from "resd" so it matches the later copy of this cell.
print("read the train.csv")

# --- Select features and target --------------------------------------------
use_col3 = ['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']
train_y = train[["hotel_cluster"]]
train = train[use_col3]

print("read the test.csv")
test = pd.read_csv("../data/test.csv")
# Keep the ids before the frame is narrowed to the feature columns, so the
# submission is keyed by the real ids rather than by row position.
test_id = test["id"]
test = test[use_col3]

# --- Fit and rank ----------------------------------------------------------
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))
# train_y is a single-column frame; flatten it to the 1-D target fit() expects.
model.fit(train, train_y.values.ravel())
# The columns of predict_proba follow model.classes_, so the sorted column
# positions are mapped back through classes_ to obtain the actual labels.
preds = model.predict_proba(test)
preds = model.classes_[np.fliplr(np.argsort(preds, axis=1))]

# --- Build and save the submission -----------------------------------------
result_df = pd.DataFrame(
    [" ".join(row) for row in preds[:, :5].astype(str)],
    columns=["hotel_cluster"],
    index=pd.Index(test_id.values, name="id"),
)
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
# Create the output directory when it is missing so to_csv cannot fail on it.
out_dir = '../output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
result_df.to_csv(os.path.join(out_dir, file_name), index=True)
    
    
In [8]:
    
# Importance of each feature in the fitted forest, plus its standard
# deviation across the individual trees (used for the error bars).
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # most important feature first
n_features = train.shape[1]

print("Feature ranking:")
for rank, col_pos in enumerate(indices[:n_features], start=1):
    print("%d. feature %d %s (%f)" % (rank, col_pos, train.columns[col_pos], importances[col_pos]))

# Bars are labelled with feature names instead of bare column positions so
# the figure can be read without the printed ranking next to it.
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), train.columns[indices], rotation=45, ha="right")
plt.xlim([-1, n_features])
plt.xlabel("feature")
plt.ylabel("importance")
plt.show()
    
    
    
In [18]:
    
    
In [38]:
    
%%time
# Read the 2013 training extract and keep only the rows flagged as bookings.
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
train = train[train["is_booking"] == 1]

# Draw a reproducible 50,000-row subsample of the bookings.
# `.loc` replaces the `.ix` indexer, which no longer exists in pandas >= 1.0.
# After the filter the index has gaps, and the values drawn from train.index
# are labels, so a label-based lookup is required.
# NOTE: np.random.choice samples WITH replacement by default, so rows can
# repeat in the subsample.
np.random.seed(402)
train = train.loc[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)
    
    
In [2]:
    
%%time
# --- Load the training extract, keep bookings, draw a subsample ------------
train = pd.read_csv("../data/train_2013.csv", index_col=0)
train = train.reset_index(drop=True)
train = train[train["is_booking"] == 1]
np.random.seed(402)
# `.loc` replaces the `.ix` indexer, which no longer exists in pandas >= 1.0.
# After the filter the index has gaps, and the values drawn from train.index
# are labels, so a label-based lookup is required.
# NOTE: np.random.choice samples WITH replacement by default, so rows can
# repeat in the subsample.
train = train.loc[np.random.choice(train.index, 50000)]
train = train.reset_index(drop=True)
print("read the train.csv")

# --- Select features and target --------------------------------------------
use_col3 = ['site_name', 'user_location_region', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_destination_id', 'hotel_market', 'hotel_country']
train_y = train[["hotel_cluster"]]
train = train[use_col3]

print("read the test.csv")
test = pd.read_csv("../data/test.csv")
# Keep the ids before the frame is narrowed to the feature columns, so the
# submission is keyed by the real ids rather than by row position.
test_id = test["id"]
test = test[use_col3]

# --- Fit and rank ----------------------------------------------------------
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))
# train_y is a single-column frame; flatten it to the 1-D target fit() expects.
model.fit(train, train_y.values.ravel())
# The columns of predict_proba follow model.classes_, so the sorted column
# positions are mapped back through classes_ to obtain the actual labels.
preds = model.predict_proba(test)
preds = model.classes_[np.fliplr(np.argsort(preds, axis=1))]

# --- Build and save the submission -----------------------------------------
result_df = pd.DataFrame(
    [" ".join(row) for row in preds[:, :5].astype(str)],
    columns=["hotel_cluster"],
    index=pd.Index(test_id.values, name="id"),
)
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
print("save file")
# Create the output directory when it is missing so to_csv cannot fail on it.
out_dir = '../output'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
result_df.to_csv(os.path.join(out_dir, file_name), index=True)
    
    
    
    
In [6]:
    
# Display the first rows of the current training frame.
train.head()
    
    Out[6]:
In [ ]: