In [40]:
from pandas import Series, DataFrame
import pandas as pd


%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy

In [41]:
destinations = pd.read_csv("destinations.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

Convert date time type to seperate the train and test set. becasue the test set data time have to be come later than the train set


In [42]:
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

pick random 10000 users row as our train data set


In [43]:
import random

unique_users = train.user_id.unique()

sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 10000)) ]
sel_train = train[train.user_id.isin(sel_user_ids)]
print len(sel_train)


319807

In [44]:
t1 = sel_train[((sel_train.year == 2013) | ((sel_train.year == 2014) & (sel_train.month < 8)))]
t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]

In [45]:
# remove the empty bookinf in test set
t2 = t2[t2.is_booking == True]

Simple predication: use the most 5 common cluster as predication for each data in test


In [46]:
most_common_clusters = list(train.hotel_cluster.value_counts().head().index)

In [47]:
predictions = [most_common_clusters for i in range(t2.shape[0])]

Too many features in destination --> use PCA to reduce dimension


In [48]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

Generating feature

convert the time data into float that can be used as predicator in random forest. and join the destination feature transformed by PCA


In [49]:
def calc_fast_features(df):
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(df["date_time"].dt, prop)
    
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]
    
    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)
    props["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
        
    ret = pd.DataFrame(props)
    
    ret = ret.join(dest_small, on="srch_destination_id", how='left', rsuffix="dest")
    ret = ret.drop("srch_destination_iddest", axis=1)
    return ret

df = calc_fast_features(t1)

df.fillna(-1, inplace=True)


/Users/yaweny2/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/Users/yaweny2/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/Users/yaweny2/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [50]:
df[:5]


Out[50]:
channel ci_day ci_dayofweek ci_month ci_quarter cnt co_day co_dayofweek co_month co_quarter ... srch_rm_cnt stay_span user_id user_location_city user_location_country user_location_region year 0 1 2
5768 1 15.0 4.0 2.0 1.0 1 17.0 6.0 2.0 1.0 ... 1 48.0 34019 12328 205 354 2013 -1.212365 -1.065999 -0.000302
5769 1 8.0 4.0 3.0 1.0 1 10.0 6.0 3.0 1.0 ... 1 48.0 34019 12328 205 354 2013 -0.626661 0.214322 0.011202
5770 1 22.0 4.0 3.0 1.0 2 24.0 6.0 3.0 1.0 ... 1 48.0 34019 12328 205 354 2013 0.143469 0.103292 0.118912
5771 1 22.0 4.0 3.0 1.0 2 24.0 6.0 3.0 1.0 ... 1 48.0 34019 12328 205 354 2013 0.143469 0.103292 0.118912
5772 1 22.0 4.0 3.0 1.0 1 24.0 6.0 3.0 1.0 ... 1 48.0 34019 12328 205 354 2013 0.143469 0.103292 0.118912

5 rows × 40 columns


In [51]:
predictors = [c for c in df.columns if c not in ["hotel_cluster"]]
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
scores = cross_validation.cross_val_score(clf, df[predictors], df['hotel_cluster'], cv=3)
scores


Out[51]:
array([ 0.06471449,  0.06494025,  0.06663115])

Top clusters based on hotel_cluster


In [52]:
def make_key(items):
    return "_".join([str(i) for i in items])

match_cols = ["srch_destination_id"]
cluster_cols = match_cols + ['hotel_cluster']
groups = t1.groupby(cluster_cols)
top_clusters = {}
for name, group in groups:
    clicks = len(group.is_booking[group.is_booking == False])
    bookings = len(group.is_booking[group.is_booking == True])
    
    score = bookings + .15 * clicks
    
    clus_name = make_key(name[:len(match_cols)])
    if clus_name not in top_clusters:
        top_clusters[clus_name] = {}
    top_clusters[clus_name][name[-1]] = score

In [53]:
import operator

cluster_dict = {}
for n in top_clusters:
    tc = top_clusters[n]
    top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
    cluster_dict[n] = top

Transform this dictionary to find the top 5 hotel clusters for each srch_destination_id


In [54]:
import operator

cluster_dict = {}
for n in top_clusters:
    tc = top_clusters[n]
    top = [l[0] for l in sorted(tc.items(), key=operator.itemgetter(1), reverse=True)[:5]]
    cluster_dict[n] = top

Making predictions based on destination


In [55]:
preds = []
for index, row in t2.iterrows():
    key = make_key([row[m] for m in match_cols])
    if key in cluster_dict:
        preds.append(cluster_dict[key])
    else:
        preds.append([])

In [56]:
preds


Out[56]:
[[56, 70, 98, 41, 55],
 [1, 79, 45, 54, 24],
 [],
 [25, 18, 37, 89, 72],
 [28, 31, 50, 32, 7],
 [33, 25, 40, 59, 98],
 [10, 49, 4, 28, 59],
 [17, 31, 90, 91, 73],
 [4, 49, 23, 13, 19],
 [0, 26, 34, 17, 96],
 [36, 63, 86, 44, 57],
 [],
 [9, 68, 98, 37, 95],
 [70, 83, 17, 41, 77],
 [],
 [71, 34, 77, 18, 50],
 [64, 37, 21, 59, 11],
 [36, 30, 99, 15, 58],
 [64, 37, 21, 59, 11],
 [91],
 [56, 70, 98, 41, 55],
 [50, 9, 21, 70, 49],
 [2, 18, 25, 4, 5],
 [21, 98, 77, 40, 37],
 [21, 98, 77, 40, 37],
 [72, 48, 32, 40, 18],
 [18, 68, 59, 6, 55],
 [19, 21, 9, 17, 23],
 [1, 79, 45, 24, 51],
 [1, 79, 45, 24, 51],
 [1, 79, 45, 54, 24],
 [96, 15, 13, 32, 7],
 [91, 95],
 [],
 [],
 [8, 64, 22, 5, 2],
 [],
 [],
 [83, 41, 17, 70, 5],
 [90, 84, 0, 53, 93],
 [91, 48, 62],
 [],
 [],
 [],
 [18, 99, 95, 42, 91],
 [96, 69, 9, 37, 32],
 [91, 59, 48, 50, 69],
 [],
 [70, 98, 91, 69, 41],
 [29, 64, 46, 97, 62],
 [56, 41, 70, 55, 98],
 [71, 34, 77, 18, 50],
 [97, 98, 21, 25, 11],
 [],
 [71, 34, 77, 18, 50],
 [13, 7, 48, 50, 22],
 [13, 7, 48, 50, 22],
 [95, 96, 55, 40, 83],
 [85, 80, 3, 8, 78],
 [85, 80, 3, 8, 78],
 [83, 41, 17, 70, 5],
 [5, 59, 2, 29, 48],
 [11, 58, 2],
 [76, 49, 40, 48, 16],
 [76, 49, 40, 48, 16],
 [32, 48, 47],
 [91],
 [72, 28, 33, 7, 35],
 [],
 [95, 33, 55, 83, 91],
 [72, 28, 33, 7, 35],
 [],
 [42, 91, 32, 23, 16],
 [5, 91, 33, 49, 10],
 [1, 54, 51, 48, 79],
 [64, 22, 8, 99, 78],
 [82, 6, 25, 29, 30],
 [46, 29, 11, 5, 28],
 [26, 0, 52, 33, 65],
 [14, 48, 7, 42, 50],
 [0, 31, 25, 77, 48],
 [71, 49, 54, 51, 40],
 [98, 97, 23, 21, 41],
 [65, 52, 50, 26, 66],
 [94, 91, 40, 75, 14],
 [],
 [48, 91, 72, 77, 37],
 [8, 68, 99, 9, 5],
 [8, 64, 97, 46, 2],
 [61, 82, 29, 30, 85],
 [34, 0, 91, 96, 42],
 [72, 50, 51, 43, 32],
 [83, 21, 77, 19, 70],
 [33, 41, 49, 48, 97],
 [95, 91, 41, 70, 9],
 [19, 17, 5, 72, 68],
 [],
 [42, 18, 91, 68, 49],
 [61, 99, 9, 82, 2],
 [68, 9, 41, 95, 51],
 [1, 79, 45, 54, 24],
 [64, 22, 8, 99, 78],
 [95, 96, 55, 40, 83],
 [55, 95, 68, 98, 72],
 [59, 9, 5, 58, 68],
 [99, 46, 29, 9, 95],
 [25, 59, 21, 72, 42],
 [25, 59, 21, 72, 42],
 [50, 6, 48, 47, 91],
 [2, 62, 29, 64, 90],
 [46, 62, 82, 43, 61],
 [56, 41, 70, 55, 98],
 [],
 [],
 [],
 [56, 70, 98, 41, 55],
 [64, 37, 21, 59, 11],
 [],
 [81, 36, 99, 46, 61],
 [],
 [95, 59, 50, 48, 28],
 [42, 13, 48, 91, 39],
 [16, 18, 77, 23, 71],
 [0, 31, 25, 77, 48],
 [7, 28, 4, 32, 6],
 [54, 79],
 [8, 64, 97, 46, 2],
 [55, 2, 68, 70, 95],
 [36, 29, 46, 76, 12],
 [9, 97, 64, 11, 21],
 [46, 29, 57, 58, 12],
 [91, 94, 18, 13, 83],
 [91, 94, 18, 13, 83],
 [28, 48, 4, 41, 91],
 [25, 5, 64, 11, 41],
 [78, 29, 11, 82, 22],
 [46, 61, 81, 78, 64],
 [46, 78, 81, 82, 29],
 [62, 58, 3, 44],
 [61, 62, 20, 12, 64],
 [85, 62, 36, 67, 3],
 [85, 62, 36, 67, 3],
 [2, 33, 95, 18, 68],
 [2, 33, 95, 18, 68],
 [82, 5],
 [77, 40, 48, 50, 15],
 [48, 0, 9, 70, 18],
 [46, 16, 29, 58, 2],
 [57, 85, 36, 62, 12],
 [36, 62, 46, 85, 82],
 [81, 63, 12, 57, 82],
 [36, 29, 46, 76, 12],
 [36, 29, 46, 76, 12],
 [62, 81, 30, 36, 57],
 [],
 [],
 [],
 [50, 13, 48, 33, 4],
 [98, 70, 19, 55, 83],
 [97, 98, 21, 25, 11],
 [59, 91, 42, 16, 18],
 [59, 91, 42, 16, 18],
 [46, 6, 15, 2, 62],
 [56, 70, 98, 41, 55],
 [1, 79, 45, 24, 51],
 [32, 72, 8, 10, 48],
 [91, 95, 28, 48, 18],
 [25, 59, 91, 39, 51],
 [40, 96, 48, 84, 23],
 [65, 66, 52, 92, 44],
 [54, 1, 24, 88, 51],
 [48, 41, 50, 46],
 [8, 64, 97, 46, 2],
 [95, 91, 68, 98, 18],
 [77, 42, 28, 16, 50],
 [42, 18, 91, 68, 49],
 [42, 18, 91, 68, 49],
 [42, 18, 91, 68, 49],
 [64, 22, 8, 99, 78],
 [8, 64, 22, 5, 2],
 [22, 8, 78, 30, 58],
 [8, 64, 22, 5, 2],
 [34, 94, 42, 96, 89],
 [56, 70, 55, 21, 41],
 [42, 48, 39, 41, 37],
 [],
 [],
 [19, 40, 6, 28, 77],
 [],
 [26, 34, 17, 84, 41],
 [41, 37, 94, 11, 18],
 [77, 33, 47, 32, 16],
 [11, 59, 36, 72, 2],
 [91, 95, 28, 48, 18],
 [18, 9, 91, 68, 6],
 [91, 95, 28, 48, 18],
 [42, 91, 94],
 [26, 22, 96, 25, 84],
 [47, 94, 39],
 [61, 63],
 [64, 9, 44, 99, 46],
 [65, 66, 52, 31, 73],
 [98, 70, 19, 55, 83],
 [98, 70, 19, 55, 83],
 [6, 77, 42, 96, 48],
 [41, 68, 91, 92, 5],
 [91, 18, 7],
 [19, 21, 9, 17, 23],
 [19, 21, 9, 17, 23],
 [64, 36, 44, 12, 81],
 [46, 76, 67, 30, 60],
 [46, 76, 67, 30, 60],
 [78, 30, 20, 64, 38],
 [],
 [60, 53, 20, 3, 78],
 [60, 53, 20, 3, 78],
 [46, 64, 78, 8, 36],
 [16, 18, 77, 23, 71],
 [71, 34, 77, 18, 50],
 [71, 34, 77, 18, 50],
 [77, 48, 47, 10, 39],
 [71, 34, 77, 18, 50],
 [71, 34, 77, 18, 50],
 [71, 34, 77, 18, 50],
 [71, 34, 77, 18, 50],
 [],
 [37, 70, 48, 11, 21],
 [21, 10, 4, 7, 59],
 [72, 95, 21, 2, 11],
 [36, 29, 46, 76, 12],
 [64, 29, 2, 25, 58],
 [32, 96, 42, 48, 47],
 [27, 20, 67, 38, 76],
 [80, 38, 57, 20, 78],
 [98, 46, 2, 97, 5],
 [70, 83, 17, 41, 77],
 [70, 83, 17, 41, 77],
 [15, 23, 48, 32, 36],
 [15, 23, 48, 32, 36],
 [33],
 [65, 66, 31, 52, 73],
 [91, 14, 13, 32, 50],
 [18, 39, 72, 91],
 [2, 11, 25, 58, 37],
 [58, 67, 43, 44, 14],
 [89, 67, 86, 14, 78],
 [78, 29, 11, 82, 22],
 [59, 91, 28, 4, 95],
 [69, 96, 89, 65, 26],
 [67, 3, 46],
 [67, 3, 46],
 [67, 3, 46],
 [1, 79, 45, 54, 24],
 [91, 77, 42, 16, 40],
 [91, 77, 42, 16, 40],
 [91, 77, 42, 16, 40],
 [91, 77, 42, 16, 40],
 [3, 48, 42, 77, 28],
 [91, 77, 42, 16, 40],
 [91, 77, 42, 16, 40],
 [91, 77, 42, 16, 40],
 [91, 77, 42, 16, 40],
 [91, 77, 42, 16, 40],
 [91, 77, 42, 16, 40],
 [28, 38, 72, 33, 3],
 [28, 21, 98, 33, 72],
 [48, 0, 9, 70, 18],
 [28, 48, 72, 25, 58],
 [17, 31, 90, 91, 73],
 [91, 48, 42, 9, 68],
 [28, 48, 4, 41, 91],
 [32, 33, 16, 42, 48],
 [30, 82, 61, 67, 46],
 [30, 82, 61, 67, 46],
 [14, 50, 4],
 [56, 70, 98, 41, 55],
 [13, 68, 95, 9, 42],
 [56, 70, 55, 21, 41],
 [25, 28, 72, 51, 13],
 [19, 4, 56, 21, 95],
 [65, 87, 31, 52, 85],
 [19, 4, 56, 21, 95],
 [9, 37, 4, 55, 0],
 [13, 39, 76, 51, 23],
 [13, 39, 76, 51, 23],
 [68, 33, 42, 46, 36],
 [43, 3, 14],
 [62, 67, 36, 44, 8],
 [28, 23, 7, 59, 72],
 [32, 16, 77, 7, 40],
 [],
 [48, 0, 9, 70, 18],
 [],
 [15, 50, 47, 42, 91],
 [83, 0, 50, 33, 51],
 [70, 83, 17, 41, 77],
 [43, 81, 62, 44, 61],
 [81, 61],
 [97, 20, 61, 76, 6],
 [59, 97, 15],
 [49, 4, 13, 19, 23],
 [64, 29, 2, 25, 58],
 [56, 70, 98, 41, 55],
 [56, 70, 98, 41, 55],
 [98, 70, 19, 55, 83],
 [43, 46, 78, 67, 36],
 [2, 11, 25, 58, 37],
 [33, 25, 40, 59, 98],
 [],
 [9, 37, 4, 55, 0],
 [9, 68, 98, 37, 95],
 [9, 68, 98, 37, 95],
 [95, 10, 4, 32, 7],
 [95, 10, 4, 32, 7],
 [1, 79, 45, 54, 24],
 [59, 91, 94, 39, 28],
 [98, 97, 23, 21, 41],
 [64, 99, 85, 82, 36],
 [64, 99, 85, 82, 36],
 [56, 70, 98, 41, 55],
 [98, 56, 55, 70, 41],
 [42, 98, 18, 95, 19],
 [59, 95, 18, 13, 94],
 [91, 48, 14, 33, 94],
 [0, 31, 25, 77, 48],
 [59, 91, 42, 16, 18],
 [0, 80, 26, 34, 96],
 [42, 91, 48, 47, 15],
 [],
 [16, 19, 18, 48, 99],
 [91, 15, 23, 7, 42],
 [64, 59, 70, 67, 41],
 [0, 31, 25, 77, 48],
 [9, 68, 98, 37, 95],
 [0, 31, 25, 77, 48],
 [9, 68, 98, 37, 95],
 [25, 75, 30, 6, 40],
 [43, 78, 82, 15, 64],
 [25, 97, 12],
 [59, 91, 42, 16, 18],
 [59, 91, 42, 16, 18],
 [59, 91, 42, 16, 18],
 [82, 10, 62, 44, 15],
 [33, 49, 83, 23, 68],
 [56, 70, 98, 41, 55],
 [56, 41, 70, 55, 98],
 [91, 11, 21],
 [16, 33, 68, 5, 94],
 [25, 5, 64, 11, 41],
 [16, 72, 6, 91, 39],
 [6, 82, 61, 77, 29],
 [48, 72, 18, 95, 47],
 [65, 66, 35, 87, 44],
 [40, 50, 37, 59, 32],
 [46, 29, 62, 36, 2],
 [50, 6, 48, 47, 91],
 [50, 6, 48, 47, 91],
 [],
 [2, 28, 48, 25, 50],
 [36, 59, 81],
 [0, 39, 90, 31, 41],
 [],
 [42, 91, 32, 23, 16],
 [],
 [48, 39, 6, 7, 76],
 [25, 33, 75, 11, 21],
 [9, 5, 64, 83, 48],
 [41, 5, 38, 64, 78],
 [41, 5, 38, 64, 78],
 [51, 6, 72, 4, 91],
 [91, 70, 95, 41, 72],
 [0, 31, 25, 77, 48],
 [7, 91, 23, 93, 83],
 [16, 90],
 [37, 83, 95, 20, 98],
 [6, 91, 48, 77, 18],
 [28, 91, 42, 76, 6],
 [48, 0, 9, 70, 18],
 [59, 6, 39, 91, 5],
 [48],
 [83, 40, 47, 48, 6],
 [42, 91, 6, 48, 37],
 [],
 [28, 23, 7, 59, 72],
 [32, 50, 16, 95, 23],
 [91, 14, 39, 51, 28],
 [48, 18, 91, 42, 68],
 [59, 18, 91, 48, 28],
 [15, 42, 91, 34, 76],
 [42, 18, 91, 68, 49],
 [15, 28, 42, 91, 62],
 [2, 48, 97, 18, 64],
 [46, 64, 78, 8, 36],
 [12, 49, 68, 83, 35],
 [2, 25, 11, 5, 64],
 [],
 [2, 59, 46, 29, 15],
 [36, 46, 67, 58, 62],
 [90, 36, 29, 58],
 [],
 [48, 69, 16, 29, 64],
 [95, 68, 2, 59, 99],
 [73, 71, 90, 93, 5],
 [],
 [10, 77, 41, 49, 6],
 [91, 48, 22],
 [91, 48, 22],
 [42, 28, 91, 90],
 [91, 99, 83, 18, 47],
 [97, 98, 21, 25, 11],
 [2, 11, 5, 30, 59],
 [2, 11, 5, 30, 59],
 [42, 43, 94, 48, 91],
 [48, 95, 64, 46, 69],
 [9, 97, 64, 11, 21],
 [9, 5, 64, 83, 48],
 [46, 67, 82, 81, 29],
 [46, 67, 82, 81, 29],
 [97, 98, 21, 25, 11],
 [43, 73, 13, 32, 4],
 [91, 42, 68, 47, 73],
 [56, 70, 98, 41, 55],
 [62, 65, 78, 52, 5],
 [65, 52, 50, 26, 66],
 [16],
 [39, 76, 42],
 [34, 63, 75, 51, 99],
 [59, 29, 18, 68, 81],
 [34, 63, 75, 51, 99],
 [59, 5, 20, 29, 30],
 [33, 25, 40, 59, 98],
 [34, 63, 75, 51, 99],
 [34, 63, 75, 51, 99],
 [34, 63, 75, 51, 99],
 [34, 63, 75, 51, 99],
 [34, 63, 75, 51, 99],
 [1, 79, 45, 54, 24],
 [67, 46, 82, 62, 44],
 [96, 48, 72, 28, 82],
 [0, 96, 48, 41, 97],
 [17, 3, 5, 93, 31],
 [73, 96, 65, 86, 89],
 [6, 42, 18, 91, 50],
 [6, 42, 18, 91, 50],
 [6, 50, 91, 18, 83],
 [6, 70, 51, 42, 76],
 [0, 31, 25, 77, 48],
 [68, 39, 48, 6, 99],
 [6, 91, 48, 77, 18],
 [59, 29, 18, 68, 81],
 [59, 29, 18, 68, 81],
 [59, 97, 15],
 [57, 30],
 [],
 [64, 37, 21, 59, 11],
 [16, 23, 32, 42, 91],
 [],
 [82, 99, 42, 7, 91],
 [],
 [],
 [16, 42],
 [71, 34, 77, 18, 50],
 [42, 77],
 [16],
 [82, 99, 42, 7, 91],
 [82, 99, 42, 7, 91],
 [82, 99, 42, 7, 91],
 [77, 42, 28, 16, 50],
 [16, 42],
 [],
 [77, 48, 47, 10, 39],
 [94],
 [16, 50, 77, 47],
 [40, 77, 15, 16, 7],
 [73],
 [],
 [82, 99, 42, 7, 91],
 [],
 [],
 [82, 99, 42, 7, 91],
 [91, 48, 40, 10, 47],
 [56, 70, 98, 41, 55],
 [1, 79, 45, 54, 24],
 [97, 98, 21, 25, 11],
 [56, 70, 98, 41, 55],
 [8, 64, 97, 46, 2],
 [70, 83, 17, 41, 77],
 [30, 15, 82, 85, 16],
 [48, 47, 16, 76],
 [59, 95, 18, 13, 94],
 [32, 48, 47],
 [46, 29, 44, 59, 84],
 [46, 29, 44, 59, 84],
 [44, 46, 62, 57, 63],
 [8, 64, 22, 5, 2],
 [36, 29, 46, 76, 12],
 [46, 29, 11, 5, 28],
 [29, 64, 46, 97, 62],
 [81, 63, 12, 57, 82],
 [59, 95, 18, 13, 94],
 [0, 31, 25, 77, 48],
 [2, 59, 15, 21, 89],
 [46, 16, 29, 58, 2],
 [34, 0, 96, 54, 26],
 [33, 32],
 [98, 70, 19, 55, 83],
 [2, 91, 39, 4, 18],
 [59, 91, 28, 4, 95],
 [65, 52, 87, 66, 5],
 [],
 [73, 44, 17, 3, 42],
 [6, 77, 42, 96, 48],
 [96, 75, 49, 41, 73],
 [90, 84, 0, 53, 93],
 [9, 95, 28, 13, 64],
 [65, 52, 87, 66, 5],
 [26, 73, 9, 72, 84],
 [18, 99, 95, 42, 91],
 [2, 33, 95, 18, 68],
 [43, 81, 62, 44, 61],
 [99, 68, 46, 29, 2],
 [],
 [46, 29, 38, 64, 36],
 [42, 78, 63, 90, 0],
 [90, 57, 62, 92, 44],
 [99, 18, 9, 37, 46],
 [2, 36, 29, 15, 58],
 [36, 30, 99, 15, 58],
 [78, 2, 36, 82, 64],
 [],
 [36, 30, 99, 15, 58],
 [82, 67, 78, 30, 20],
 [82, 67, 78, 30, 20],
 [78, 67, 43, 82, 30],
 [21, 55, 95, 6, 42],
 [21, 55, 95, 6, 42],
 [78, 67, 43, 82, 30],
 [82, 67, 78, 30, 20],
 [61],
 [],
 [8, 64, 22, 5, 2],
 [56, 70, 98, 41, 55],
 [2, 48, 97, 18, 64],
 [68, 9, 41, 95, 51],
 [64, 99, 85, 82, 36],
 [1, 79, 45, 24, 51],
 [2, 59, 46, 29, 15],
 [8, 96, 34, 94, 5],
 [2, 25, 11, 5, 64],
 [48, 91, 72, 77, 37],
 [65, 52, 66, 87, 96],
 [36, 59, 81],
 [46, 61, 82, 99, 38],
 [],
 [],
 [73, 96, 83, 40, 33],
 [],
 [59, 42, 91, 13, 60],
 [78],
 [42, 91],
 [42, 91],
 [29, 58, 82, 59, 46],
 [29, 58, 82, 59, 46],
 [70, 72, 25, 69, 17],
 [1, 79, 45, 54, 24],
 [],
 [2, 59, 46, 29, 15],
 [2, 59, 46, 29, 15],
 [36, 46, 67, 58, 62],
 [16, 72, 6, 91, 39],
 [10, 59, 5, 29, 15],
 [95, 33, 55, 83, 91],
 [95, 19, 18, 98, 21],
 [8, 46, 29, 64, 97],
 [59, 68, 28, 91, 19],
 [47, 32, 10, 14, 48],
 [48],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [13, 5, 37, 42, 77],
 [69, 96, 89, 65, 26],
 [26, 0, 52, 33, 65],
 [84, 13, 73, 34, 5],
 [46, 29, 44, 59, 84],
 [56, 70, 98, 41, 55],
 [2, 48, 97, 18, 64],
 [68, 39, 48, 6, 99],
 [18, 95, 72, 90, 13],
 [18, 95, 72, 90, 13],
 [46, 29, 11, 5, 28],
 [95, 59, 12, 29, 81],
 [15, 46, 36, 81, 61],
 [15, 46, 36, 81, 61],
 [28, 48, 42, 14, 33],
 [28, 48, 42, 14, 33],
 [],
 [42],
 [31, 49, 26, 34, 73],
 [71, 34, 77, 18, 50],
 [26, 34, 17, 84, 41],
 [16],
 [91, 48, 95, 16, 2],
 [40, 84, 26],
 [25, 18, 37, 89, 72],
 [25, 18, 37, 89, 72],
 [25, 18, 37, 89, 72],
 [2, 33, 95, 18, 68],
 [16, 33, 18, 48, 49],
 [77, 19, 28, 50, 97],
 [9, 55, 21, 49, 95],
 [],
 [],
 [32, 48, 47],
 [],
 [32, 48, 47],
 [47, 40, 16, 23, 15],
 [47, 40, 16, 23, 15],
 [32, 48, 47],
 [32, 48, 47],
 [32, 48, 47],
 [],
 [13, 42, 40, 48, 5],
 [32, 48, 47],
 [13, 42, 40, 48, 5],
 [],
 [13, 42, 40, 48, 5],
 [13, 42, 40, 48, 5],
 [13, 42, 40, 48, 5],
 [13, 42, 40, 48, 5],
 [],
 [],
 [],
 [],
 [],
 [],
 [13, 42, 40, 48, 5],
 [13, 42, 40, 48, 5],
 [13, 42, 40, 48, 5],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [32, 48, 47],
 [32, 48, 47],
 [15, 91, 47, 40, 94],
 [15, 91, 47, 40, 94],
 [32, 48, 47],
 [15, 91, 47, 40, 94],
 [18, 68, 59, 6, 55],
 [15, 91, 47, 40, 94],
 [],
 [],
 [18, 99, 95, 42, 91],
 [18, 99, 95, 42, 91],
 [18, 99, 95, 42, 91],
 [18, 99, 95, 42, 91],
 [],
 [],
 [],
 [18, 99, 95, 42, 91],
 [],
 [18, 99, 95, 42, 91],
 [18, 99, 95, 42, 91],
 [32, 48, 47],
 [18, 6, 50, 77, 16],
 [32, 48, 47],
 [32, 48, 47],
 [32, 48, 47],
 [40, 91, 51],
 [0, 31, 25, 77, 48],
 [18],
 [48, 95, 9, 47, 16],
 [91, 16, 15],
 [],
 [14, 91, 18, 6],
 [91, 48, 42, 90, 7],
 [59, 91, 42, 16, 18],
 [91, 16, 15],
 [6, 50, 91, 18, 83],
 [6, 50, 91, 18, 83],
 [48],
 [59, 42, 28, 89],
 [54, 47, 39, 17, 10],
 [19, 41, 56, 70, 98],
 [36, 62, 46, 85, 82],
 [36, 62, 46, 85, 82],
 [48, 91, 72, 77, 37],
 [30, 37, 2, 78, 17],
 [68, 39, 48, 6, 99],
 [36, 29, 46, 76, 12],
 [43],
 [8, 64, 22, 5, 2],
 [78, 29, 11, 82, 22],
 [43, 22, 8, 14, 78],
 [64, 99, 85, 82, 36],
 [31, 20, 87, 53, 35],
 [35, 7, 85, 5, 61],
 [],
 [82, 30, 29, 61, 85],
 [81, 82, 58, 3, 62],
 [43, 62, 5, 78, 46],
 [82, 14, 93, 35, 5],
 [82, 14, 93, 35, 5],
 [33, 25, 40, 59, 98],
 [33, 25, 40, 59, 98],
 [75, 15, 48, 17, 42],
 [81, 82, 58, 3, 62],
 [8, 38, 61, 73, 78],
 [42, 78],
 [43, 62, 5, 78, 46],
 [2, 59, 46, 29, 15],
 [43, 61, 38, 40],
 [29, 64, 46, 97, 62],
 [],
 [38],
 [],
 [38],
 [],
 [25, 5, 64, 11, 41],
 [],
 [81, 82, 58, 3, 62],
 [],
 [25, 5, 64, 11, 41],
 [90, 61, 30],
 [82, 30, 29, 61, 85],
 [82, 30, 29, 61, 85],
 [],
 [78, 2, 36, 82, 64],
 [82, 30, 29, 61, 85],
 [82, 30, 29, 61, 85],
 [82, 30, 29, 61, 85],
 [25, 5, 64, 11, 41],
 [25, 5, 64, 11, 41],
 [82, 30, 29, 61, 85],
 [64, 37, 21, 59, 11],
 [],
 [43, 62],
 [82, 30, 29, 61, 85],
 [],
 [],
 [],
 [25, 5, 64, 11, 41],
 [38, 76, 35, 7, 78],
 [81, 82, 58, 3, 62],
 [78],
 [82, 30, 29, 61, 85],
 [],
 [],
 [1, 79, 45, 24, 51],
 [40, 77, 15, 16, 7],
 [5, 18, 19],
 [28, 21, 19, 18, 59],
 [77, 42, 28, 16, 50],
 [18, 39, 5, 48],
 [95, 91, 41, 70, 9],
 [33, 40, 4, 16, 70],
 [2, 33, 95, 18, 68],
 [72, 32, 39, 77, 47],
 [46, 29, 62, 36, 2],
 [56, 41, 70, 55, 98],
 [21, 97, 17, 40, 5],
 [97, 98, 21, 25, 11],
 [59, 98, 29, 89, 49],
 [46, 16, 29, 58, 2],
 [46, 29, 11, 5, 28],
 [2, 59, 46, 29, 15],
 [19, 4, 56, 21, 95],
 [59, 18, 16, 83, 5],
 [59, 18, 16, 83, 5],
 [57, 5, 46, 82, 36],
 [61, 62, 20, 12, 64],
 [57, 5, 46, 82, 36],
 [61, 62, 20, 12, 64],
 [9, 97, 64, 11, 21],
 [36, 61, 29, 46, 81],
 [64, 37, 21, 59, 11],
 [64, 99, 85, 82, 36],
 [64, 99, 85, 82, 36],
 [64, 99, 85, 82, 36],
 [8, 64, 97, 46, 2],
 [44, 46, 62, 57, 63],
 [63, 26, 64, 75, 92],
 [46, 29, 57, 58, 12],
 [46, 29, 57, 58, 12],
 [64, 37, 21, 59, 11],
 [64, 37, 21, 59, 11],
 [46, 29, 57, 58, 12],
 [28, 95, 25, 4, 41],
 [98, 70, 19, 55, 83],
 [40, 48, 28],
 [9, 13, 47, 41, 59],
 [1, 79, 45, 54, 24],
 [0, 80, 26, 34, 96],
 [77, 50, 72, 13, 49],
 [],
 [55, 37, 99, 91, 68],
 [1, 79, 45, 54, 24],
 [65, 52, 87, 66, 5],
 [48, 95, 9, 47, 16],
 [95, 72, 33, 42, 4],
 [46, 58, 64, 30, 27],
 [38, 29, 61, 36, 85],
 [68, 9, 41, 95, 51],
 [91, 13, 2, 6, 46],
 [91, 76, 23],
 [],
 [],
 [41, 28, 21, 98, 4],
 [],
 [91, 59, 6, 32, 48],
 [28, 23, 7, 59, 72],
 [82, 36, 85, 62, 5],
 [82, 36, 85, 62, 5],
 [82, 43, 90, 36, 46],
 [82, 43, 90, 36, 46],
 [98, 16, 37, 23, 6],
 [41, 68, 95, 69, 37],
 [80, 36, 63, 12, 62],
 [],
 [1, 79, 45, 54, 24],
 [1, 79, 45, 54, 24],
 [1, 79, 45, 54, 24],
 [77, 19, 28, 50, 97],
 [],
 [16, 96, 17, 68, 39],
 [91, 13, 18, 83, 40],
 [91, 13, 18, 83, 40],
 [59, 18, 91, 48, 28],
 [81, 46],
 [],
 [16, 18, 77, 23, 71],
 [1, 79, 45, 24, 51],
 [25, 28, 97, 2, 4],
 [9, 68, 98, 37, 95],
 [98, 70, 19, 55, 83],
 [59, 91, 28, 4, 95],
 [65, 52, 66, 31, 84],
 [21, 55, 95, 6, 42],
 [9, 55, 21, 49, 95],
 [13, 40, 5, 4, 77],
 [1, 79, 45, 54, 24],
 [48, 91, 42, 10, 50],
 [64, 11, 78, 29, 38],
 [67, 85, 30, 3, 12],
 [],
 [13, 28, 4, 33, 83],
 [],
 [91, 94, 18, 13, 83],
 [7, 25, 48, 16, 33],
 [37, 94],
 [],
 [28, 91, 48, 15, 14],
 [42, 5],
 [46, 29, 11, 5, 28],
 [25, 5, 64, 11, 41],
 [13, 91, 16, 48, 10],
 [39, 48, 10, 91, 32],
 [1, 79, 45, 54, 24],
 [1, 79, 45, 54, 24],
 [13, 91, 16, 48, 10],
 [68, 9, 41, 95, 51],
 [],
 [],
 [29, 64, 46, 97, 62],
 [40, 77, 6, 42, 76],
 [64, 22, 8, 99, 78],
 [37, 4, 38, 9, 5],
 [25, 59, 2, 90, 35],
 [9, 37, 4, 55, 0],
 [71, 34, 77, 18, 50],
 [91],
 [19, 33, 13, 95, 16],
 [18, 13, 28, 94, 42],
 [19, 33, 13, 95, 16],
 [16],
 [6, 50, 91, 18, 83],
 [18, 13, 28, 94, 42],
 [91, 59, 16, 28, 33],
 [6, 91, 48, 77, 18],
 [91, 59, 16, 28, 33],
 [77, 59, 9, 25, 41],
 [],
 [96, 69, 9, 37, 32],
 [91, 51],
 [47],
 [41, 5, 7, 42, 93],
 [6, 47, 39],
 [95, 19, 18, 98, 21],
 [78, 29, 11, 82, 22],
 [25, 5, 64, 11, 41],
 [91, 39, 13, 40, 76],
 [91, 39, 13, 40, 76],
 [91, 39, 13, 40, 76],
 [65, 52, 66, 87, 96],
 [77, 32, 13, 16, 18],
 [13, 39, 76, 51, 23],
 [73, 26, 0, 34, 28],
 [4, 49, 23, 13, 19],
 [4, 49, 23, 13, 19],
 [98, 97, 23, 21, 41],
 [39, 51, 47, 2],
 [98, 68, 95, 21, 9],
 [],
 [],
 [5, 25, 46, 29, 64],
 [64, 11, 78, 29, 38],
 [81, 36, 99, 46, 61],
 [36, 29, 46, 76, 12],
 [82, 7, 62, 46, 29],
 [46, 29, 36, 30, 58],
 [82, 58, 34, 62, 36],
 [],
 [62, 30, 43, 61, 85],
 [],
 [33, 13, 51, 32, 4],
 [59, 21, 49, 97, 33],
 [1, 79, 45, 54, 24],
 [1, 79, 45, 54, 24],
 [91, 14, 48, 42, 94],
 [],
 [42, 6, 16, 77, 48],
 [18, 72, 95, 4, 5],
 [41, 68, 95, 69, 37],
 [4, 95, 91, 25, 99],
 [16, 91, 48, 23, 94],
 [32, 50, 16, 95, 23],
 [77, 21, 28, 13, 59],
 [7, 6, 2, 40, 95],
 [59, 91, 28, 4, 95],
 [39, 4, 91, 77, 19],
 [77, 42, 28, 16, 50],
 [91, 95, 28, 48, 18],
 [77, 32, 13, 16, 18],
 [15, 42, 73, 48, 91],
 [81, 63, 12, 57, 82],
 [1, 79, 45, 54, 24],
 [95, 96, 55, 40, 83],
 [98, 70, 19, 55, 83],
 [42, 91, 48, 47, 15],
 [1, 79, 45, 54, 24],
 [98, 97, 23, 21, 41],
 [98, 70, 19, 55, 83],
 [56, 70, 98, 41, 55],
 [64, 29, 2, 25, 58],
 [46, 29, 11, 5, 28],
 [46, 29, 11, 5, 28],
 [46, 29, 11, 5, 28],
 [],
 [46, 6, 15, 2, 62],
 [49, 2, 16, 29, 59],
 [15, 82, 85, 81, 46],
 [19, 21, 9, 17, 23],
 [59, 98, 29, 89, 49],
 [97, 98, 21, 25, 11],
 [97, 98, 21, 25, 11],
 [76, 16, 2, 14],
 [97, 98, 21, 25, 11],
 [87, 11, 33, 17, 37],
 [96, 75, 49, 41, 73],
 [56, 70, 98, 41, 55],
 [48, 42, 41, 91, 16],
 [98, 56, 55, 70, 41],
 [95, 91, 68, 98, 18],
 [92, 80, 52, 44, 20],
 [91, 94, 18, 13, 83],
 [49, 4, 13, 19, 23],
 [56, 70, 98, 41, 55],
 [58, 68],
 [87, 15, 31, 73, 85],
 [87, 15, 31, 73, 85],
 [85],
 [15, 6, 55, 96, 3],
 [91, 18, 7],
 [46, 36, 62, 12, 81],
 [46, 36, 62, 12, 81],
 [32],
 [51, 48, 91, 76, 14],
 [1, 79, 45, 24, 51],
 [19, 40, 6, 28, 77],
 [19, 40, 6, 28, 77],
 [98, 16, 37, 23, 6],
 [89, 18, 51],
 [65, 66, 52, 31, 73],
 [33, 39, 89, 23, 48],
 [33, 39, 89, 23, 48],
 [98, 56, 55, 70, 41],
 [50, 28, 6],
 [91, 48],
 ...]

In [57]:
df_cali = df.loc[df["user_location_region"] == 174]
df_cali.columns


Out[57]:
Index([                  u'channel',                    u'ci_day',
                    u'ci_dayofweek',                  u'ci_month',
                      u'ci_quarter',                       u'cnt',
                          u'co_day',              u'co_dayofweek',
                        u'co_month',                u'co_quarter',
                             u'day',                 u'dayofweek',
                   u'hotel_cluster',           u'hotel_continent',
                   u'hotel_country',              u'hotel_market',
                            u'hour',                u'is_booking',
                       u'is_mobile',                u'is_package',
                          u'minute',                     u'month',
       u'orig_destination_distance',            u'posa_continent',
                         u'quarter',                 u'site_name',
                 u'srch_adults_cnt',         u'srch_children_cnt',
             u'srch_destination_id',  u'srch_destination_type_id',
                     u'srch_rm_cnt',                 u'stay_span',
                         u'user_id',        u'user_location_city',
           u'user_location_country',      u'user_location_region',
                            u'year',                            0,
                                  1,                            2],
      dtype='object')

This Kmeans exploration helped us get a sense of the 3 types of users in california region based on each user's hotel country and hotel market.


In [37]:
from sklearn.cluster import KMeans
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(df_cali[['hotel_country', 'hotel_market']])
df_cali['cluster'] = kmeans.labels_

def visualize_clusters(df, num_clusters):
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

    for n in range(num_clusters):
        clustered_df = df[df['cluster'] == n]
        plt.scatter(clustered_df['hotel_country'], clustered_df['hotel_market'], c=colors[n-1])
        plt.xlabel('hotel_country', fontsize=13)
        plt.ylabel('hotel_market', fontsize=13)


visualize_clusters(df_cali, num_clusters)


/Users/yaweny2/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

It seems for the California users, users in different clusters have different preference in hotel market. Maybe some people are richer than others, and they prefer better stars hotel.


In [ ]: