In [4]:

    
from IPython.core import display
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

%matplotlib inline



In [17]:

    
def readCsv(name, nrows=6000000):
    df = pd.read_csv(
        "data/expedia/{}.csv".format(name), 
        nrows=nrows,
        parse_dates=["date_time", "srch_ci", "srch_co"])
    df = df[(df.is_booking == True) & (pd.DatetimeIndex(df['date_time']).year == 2014)].sample(100000)
    df["year"] = pd.DatetimeIndex(df['date_time']).year
    df["month"] = pd.DatetimeIndex(df['date_time']).month
    df["hour"] = pd.DatetimeIndex(df['date_time']).hour
    df["dayofweek"] = pd.DatetimeIndex(df['date_time']).dayofweek
    df["hour"] = pd.DatetimeIndex(df['date_time']).hour

    df["srch_ci_month"] = pd.DatetimeIndex(df['srch_ci']).month
    df["srch_ci_dayofweek"] = pd.DatetimeIndex(df['srch_ci']).dayofweek
    
    df["srch_co_month"] = pd.DatetimeIndex(df['srch_co']).month
    df["srch_co_dayofweek"] = pd.DatetimeIndex(df['srch_co']).dayofweek
    
    srch_ci_utime = pd.DatetimeIndex(df['srch_ci']).astype(np.int64) // 10**9 
    srch_co_utime = pd.DatetimeIndex(df['srch_co']).astype(np.int64) // 10**9 
    
    df["srch_days"] = (srch_co_utime - srch_ci_utime) / (3600 * 24)
    
    return df

#df = readCsv("train")
df = pd.read_csv("data/expedia/2014_bookings.csv")
print("shape: ", df.shape)
df.sample(5)









    



('shape: ', (200000, 34))






    Out[17]:






  
    
      
      Unnamed: 0
      date_time
      site_name
      posa_continent
      user_location_country
      user_location_region
      user_location_city
      orig_destination_distance
      user_id
      is_mobile
      ...
      hotel_cluster
      year
      month
      hour
      dayofweek
      srch_ci_month
      srch_ci_dayofweek
      srch_co_month
      srch_co_dayofweek
      srch_days
    
  
  
    
      166397
      650730
      2014-07-17 10:10:29
      13
      1
      46
      347
      22254
      NaN
      991404
      0
      ...
      62
      2014
      7
      10
      3
      7
      6
      7
      6
      7
    
    
      128815
      4176709
      2014-11-24 11:26:08
      2
      3
      66
      363
      14898
      NaN
      556
      0
      ...
      16
      2014
      11
      11
      0
      12
      5
      12
      6
      1
    
    
      23124
      4896682
      2014-07-16 20:17:38
      2
      3
      66
      442
      46296
      933.2417
      646444
      1
      ...
      52
      2014
      7
      20
      2
      8
      5
      8
      2
      4
    
    
      89394
      4749603
      2014-06-18 06:02:52
      2
      3
      66
      226
      42300
      152.4633
      263119
      0
      ...
      59
      2014
      6
      6
      2
      6
      3
      6
      4
      1
    
    
      33491
      922952
      2014-02-22 10:06:39
      2
      3
      66
      442
      35390
      5461.4049
      653311
      0
      ...
      29
      2014
      2
      10
      5
      3
      6
      3
      5
      6
    
  

5 rows × 34 columns

File descriptions

train.csv - the training set
test.csv - the test set
destinations.csv - hotel search latent attributes
sample_submission.csv - a sample submission file in the correct format

Data fields

train/test.csv

Column name Description Data type
date_time   Timestamp   string
site_name   ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...)   int
posa_continent  ID of continent associated with site_name   int
user_location_country   The ID of the country the customer is located   int
user_location_region    The ID of the region the customer is located    int
user_location_city  The ID of the city the customer is located  int
orig_destination_distance   Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated   double
user_id ID of user  int
is_mobile   1 when a user connected from a mobile device, 0 otherwise   tinyint
is_package  1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise  int
channel ID of a marketing channel   int
srch_ci Checkin date    string
srch_co Checkout date   string
srch_adults_cnt The number of adults specified in the hotel room    int
srch_children_cnt   The number of (extra occupancy) children specified in the hotel room    int
srch_rm_cnt The number of hotel rooms specified in the search   int
srch_destination_id ID of the destination where the hotel search was performed  int
srch_destination_type_id    Type of destination int
hotel_continent Hotel continent int
hotel_country   Hotel country   int
hotel_market    Hotel market    int
is_booking  1 if a booking, 0 if a click    tinyint
cnt Numer of similar events in the context of the same user session bigint
hotel_cluster   ID of a hotel cluster   int

destinations.csv

Column name Description Data type
srch_destination_id ID of the destination where the hotel search was performed  int
d1-d149 latent description of search regions    double



In [18]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Data columns (total 34 columns):
Unnamed: 0                   200000 non-null int64
date_time                    200000 non-null object
site_name                    200000 non-null int64
posa_continent               200000 non-null int64
user_location_country        200000 non-null int64
user_location_region         200000 non-null int64
user_location_city           200000 non-null int64
orig_destination_distance    133493 non-null float64
user_id                      200000 non-null int64
is_mobile                    200000 non-null int64
is_package                   200000 non-null int64
channel                      200000 non-null int64
srch_ci                      200000 non-null object
srch_co                      200000 non-null object
srch_adults_cnt              200000 non-null int64
srch_children_cnt            200000 non-null int64
srch_rm_cnt                  200000 non-null int64
srch_destination_id          200000 non-null int64
srch_destination_type_id     200000 non-null int64
is_booking                   200000 non-null int64
cnt                          200000 non-null int64
hotel_continent              200000 non-null int64
hotel_country                200000 non-null int64
hotel_market                 200000 non-null int64
hotel_cluster                200000 non-null int64
year                         200000 non-null int64
month                        200000 non-null int64
hour                         200000 non-null int64
dayofweek                    200000 non-null int64
srch_ci_month                200000 non-null int64
srch_ci_dayofweek            200000 non-null int64
srch_co_month                200000 non-null int64
srch_co_dayofweek            200000 non-null int64
srch_days                    200000 non-null int64
dtypes: float64(1), int64(30), object(3)
memory usage: 53.4+ MB



In [19]:

    
df.describe()









    Out[19]:






  
    
      
      Unnamed: 0
      site_name
      posa_continent
      user_location_country
      user_location_region
      user_location_city
      orig_destination_distance
      user_id
      is_mobile
      is_package
      ...
      hotel_cluster
      year
      month
      hour
      dayofweek
      srch_ci_month
      srch_ci_dayofweek
      srch_co_month
      srch_co_dayofweek
      srch_days
    
  
  
    
      count
      200000.000000
      200000.00000
      200000.000000
      200000.000000
      200000.000000
      200000.000000
      133493.000000
      200000.000000
      200000.000000
      200000.000000
      ...
      200000.000000
      200000
      200000.00000
      200000.000000
      200000.000000
      200000.000000
      200000.000000
      200000.000000
      200000.000000
      200000.00000
    
    
      mean
      2968980.380510
      8.90200
      2.710410
      86.407160
      310.592555
      27742.208435
      1680.915612
      600921.967240
      0.103080
      0.124855
      ...
      47.672305
      2014
      7.21187
      13.309285
      2.721295
      7.228360
      3.109355
      7.149140
      3.297860
      2.40495
    
    
      std
      1733152.575779
      11.55783
      0.727132
      57.356216
      201.268131
      16744.860994
      2178.921124
      346603.687128
      0.304064
      0.330555
      ...
      28.963255
      0
      3.25968
      5.461572
      1.954934
      3.328379
      1.933203
      3.353591
      2.073702
      2.02480
    
    
      min
      1.000000
      2.00000
      0.000000
      0.000000
      0.000000
      0.000000
      0.005600
      12.000000
      0.000000
      0.000000
      ...
      0.000000
      2014
      1.00000
      0.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
      1.00000
    
    
      25%
      1464997.250000
      2.00000
      3.000000
      66.000000
      174.000000
      13134.000000
      213.091900
      296765.000000
      0.000000
      0.000000
      ...
      22.000000
      2014
      5.00000
      9.000000
      1.000000
      5.000000
      1.000000
      5.000000
      1.000000
      1.00000
    
    
      50%
      2952695.000000
      2.00000
      3.000000
      66.000000
      314.000000
      27655.000000
      765.762500
      602792.500000
      0.000000
      0.000000
      ...
      46.000000
      2014
      7.00000
      13.000000
      3.000000
      8.000000
      3.000000
      8.000000
      3.000000
      2.00000
    
    
      75%
      4467778.500000
      11.00000
      3.000000
      69.000000
      385.000000
      42328.000000
      2196.837900
      906969.000000
      0.000000
      0.000000
      ...
      72.000000
      2014
      10.00000
      18.000000
      4.000000
      10.000000
      5.000000
      10.000000
      5.000000
      3.00000
    
    
      max
      5999992.000000
      53.00000
      4.000000
      239.000000
      1025.000000
      56507.000000
      11917.844900
      1198784.000000
      1.000000
      1.000000
      ...
      99.000000
      2014
      12.00000
      23.000000
      6.000000
      12.000000
      6.000000
      12.000000
      6.000000
      28.00000
    
  

8 rows × 31 columns

Removing outliers

@TODO: explain what I am doing here.



In [20]:

    
print "before removing outliers :", df.shape
df = df[(df["srch_days"] < 18) & (df["srch_days"] > -1)]
print "after removing outliers :", df.shape









    



before removing outliers : (200000, 34)
after removing outliers : (199733, 34)

Treating Missing values



In [21]:

    
def removeMisingvalues(df):
    """
    remove data rows associated with any missing value
    """
    return df[
        df.orig_destination_distance.notnull() &
        df.srch_ci.notnull()
    ]

print "before removing missing values :", df.shape
df = removeMisingvalues(df)
print "after removing missing values :", df.shape









    



before removing missing values : (199733, 34)
after removing missing values : (133348, 34)



In [22]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 133348 entries, 1 to 199996
Data columns (total 34 columns):
Unnamed: 0                   133348 non-null int64
date_time                    133348 non-null object
site_name                    133348 non-null int64
posa_continent               133348 non-null int64
user_location_country        133348 non-null int64
user_location_region         133348 non-null int64
user_location_city           133348 non-null int64
orig_destination_distance    133348 non-null float64
user_id                      133348 non-null int64
is_mobile                    133348 non-null int64
is_package                   133348 non-null int64
channel                      133348 non-null int64
srch_ci                      133348 non-null object
srch_co                      133348 non-null object
srch_adults_cnt              133348 non-null int64
srch_children_cnt            133348 non-null int64
srch_rm_cnt                  133348 non-null int64
srch_destination_id          133348 non-null int64
srch_destination_type_id     133348 non-null int64
is_booking                   133348 non-null int64
cnt                          133348 non-null int64
hotel_continent              133348 non-null int64
hotel_country                133348 non-null int64
hotel_market                 133348 non-null int64
hotel_cluster                133348 non-null int64
year                         133348 non-null int64
month                        133348 non-null int64
hour                         133348 non-null int64
dayofweek                    133348 non-null int64
srch_ci_month                133348 non-null int64
srch_ci_dayofweek            133348 non-null int64
srch_co_month                133348 non-null int64
srch_co_dayofweek            133348 non-null int64
srch_days                    133348 non-null int64
dtypes: float64(1), int64(30), object(3)
memory usage: 35.6+ MB

Data exploration



In [23]:

    
def exploreUnivariate(column):
    print "\n-------------------------\nColumn: {}\n".format(column.name)
    print column.describe()
    try:
        sns.distplot(column)
    except:
        print "cannot be plotted"
    plt.show()
#exploreUnivariate(df["srch_days"])
# exploreUnivariate(np.log(df["srch_days"]))  # less skewness



In [24]:

    
for i in df.drop(labels=["date_time"], axis=1).columns:
    exploreUnivariate(df[i])









    



-------------------------
Column: Unnamed: 0

count     133348.000000
mean     2989668.830646
std      1734625.842782
min            1.000000
25%      1489196.000000
50%      2986229.500000
75%      4490017.750000
max      5999924.000000
Name: Unnamed: 0, dtype: float64






    












    



-------------------------
Column: site_name

count    133348.000000
mean          5.803147
std           8.618301
min           2.000000
25%           2.000000
50%           2.000000
75%           2.000000
max          53.000000
Name: site_name, dtype: float64






    












    



-------------------------
Column: posa_continent

count    133348.000000
mean          2.902391
std           0.526496
min           0.000000
25%           3.000000
50%           3.000000
75%           3.000000
max           4.000000
Name: posa_continent, dtype: float64






    












    



-------------------------
Column: user_location_country

count    133348.000000
mean         86.271905
std          52.505029
min           0.000000
25%          66.000000
50%          66.000000
75%          66.000000
max         215.000000
Name: user_location_country, dtype: float64






    












    



-------------------------
Column: user_location_region

count    133348.000000
mean        309.955927
std         141.006778
min         135.000000
25%         174.000000
50%         318.000000
75%         363.000000
max        1021.000000
Name: user_location_region, dtype: float64






    












    



-------------------------
Column: user_location_city

count    133348.000000
mean      27915.649781
std       16564.997453
min           0.000000
25%       14241.000000
50%       27407.000000
75%       42858.000000
max       56507.000000
Name: user_location_city, dtype: float64






    












    



-------------------------
Column: orig_destination_distance

count    133348.000000
mean       1678.780720
std        2177.158572
min           0.005600
25%         212.921850
50%         764.511700
75%        2192.560975
max       11917.844900
Name: orig_destination_distance, dtype: float64






    












    



-------------------------
Column: user_id

count     133348.000000
mean      592986.717806
std       343237.674331
min           12.000000
25%       294230.000000
50%       592270.500000
75%       895418.000000
max      1198784.000000
Name: user_id, dtype: float64






    












    



-------------------------
Column: is_mobile

count    133348.000000
mean          0.104688
std           0.306153
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: is_mobile, dtype: float64






    












    



-------------------------
Column: is_package

count    133348.000000
mean          0.116552
std           0.320887
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: is_package, dtype: float64






    












    



-------------------------
Column: channel

count    133348.000000
mean          6.153388
std           3.654032
min           0.000000
25%           2.000000
50%           9.000000
75%           9.000000
max          10.000000
Name: channel, dtype: float64






    












    



-------------------------
Column: srch_ci

count         133348
unique           696
top       2014-12-26
freq             812
Name: srch_ci, dtype: object
cannot be plotted






    












    



-------------------------
Column: srch_co

count         133348
unique           697
top       2014-12-28
freq             876
Name: srch_co, dtype: object
cannot be plotted






    












    



-------------------------
Column: srch_adults_cnt

count    133348.000000
mean          1.907693
std           0.918556
min           0.000000
25%           1.000000
50%           2.000000
75%           2.000000
max           9.000000
Name: srch_adults_cnt, dtype: float64






    












    



-------------------------
Column: srch_children_cnt

count    133348.000000
mean          0.250127
std           0.666351
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           9.000000
Name: srch_children_cnt, dtype: float64






    












    



-------------------------
Column: srch_rm_cnt

count    133348.000000
mean          1.126586
std           0.479735
min           0.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           8.000000
Name: srch_rm_cnt, dtype: float64






    












    



-------------------------
Column: srch_destination_id

count    133348.000000
mean      15512.437772
std       11750.533471
min           4.000000
25%        8266.000000
50%       11975.000000
75%       22648.000000
max       65102.000000
Name: srch_destination_id, dtype: float64






    












    



-------------------------
Column: srch_destination_type_id

count    133348.000000
mean          2.886545
std           2.213398
min           1.000000
25%           1.000000
50%           1.000000
75%           6.000000
max           9.000000
Name: srch_destination_type_id, dtype: float64






    












    



-------------------------
Column: is_booking

count    133348
mean          1
std           0
min           1
25%           1
50%           1
75%           1
max           1
Name: is_booking, dtype: float64
cannot be plotted






    












    



-------------------------
Column: cnt

count    133348.000000
mean          1.014406
std           0.140207
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max          11.000000
Name: cnt, dtype: float64






    












    



-------------------------
Column: hotel_continent

count    133348.000000
mean          2.675218
std           1.416807
min           0.000000
25%           2.000000
50%           2.000000
75%           2.000000
max           6.000000
Name: hotel_continent, dtype: float64






    












    



-------------------------
Column: hotel_country

count    133348.000000
mean         73.614722
std          51.778146
min           0.000000
25%          50.000000
50%          50.000000
75%          70.000000
max         212.000000
Name: hotel_country, dtype: float64






    












    



-------------------------
Column: hotel_market

count    133348.000000
mean        639.435222
std         441.062634
min           0.000000
25%         366.000000
50%         628.000000
75%         701.000000
max        2117.000000
Name: hotel_market, dtype: float64






    












    



-------------------------
Column: hotel_cluster

count    133348.000000
mean         47.493491
std          29.112451
min           0.000000
25%          21.000000
50%          46.000000
75%          72.000000
max          99.000000
Name: hotel_cluster, dtype: float64






    












    



-------------------------
Column: year

count    133348
mean       2014
std           0
min        2014
25%        2014
50%        2014
75%        2014
max        2014
Name: year, dtype: float64
cannot be plotted






    












    



-------------------------
Column: month

count    133348.000000
mean          7.197408
std           3.253623
min           1.000000
25%           5.000000
50%           7.000000
75%          10.000000
max          12.000000
Name: month, dtype: float64






    












    



-------------------------
Column: hour

count    133348.000000
mean         13.415949
std           5.188417
min           0.000000
25%          10.000000
50%          13.000000
75%          18.000000
max          23.000000
Name: hour, dtype: float64






    












    



-------------------------
Column: dayofweek

count    133348.000000
mean          2.695758
std           1.948024
min           0.000000
25%           1.000000
50%           3.000000
75%           4.000000
max           6.000000
Name: dayofweek, dtype: float64






    












    



-------------------------
Column: srch_ci_month

count    133348.000000
mean          7.229182
std           3.315456
min           1.000000
25%           5.000000
50%           8.000000
75%          10.000000
max          12.000000
Name: srch_ci_month, dtype: float64






    












    



-------------------------
Column: srch_ci_dayofweek

count    133348.000000
mean          3.126391
std           1.920803
min           0.000000
25%           2.000000
50%           3.000000
75%           5.000000
max           6.000000
Name: srch_ci_dayofweek, dtype: float64






    












    



-------------------------
Column: srch_co_month

count    133348.000000
mean          7.157933
std           3.338320
min           1.000000
25%           5.000000
50%           8.000000
75%          10.000000
max          12.000000
Name: srch_co_month, dtype: float64






    












    



-------------------------
Column: srch_co_dayofweek

count    133348.000000
mean          3.343095
std           2.081142
min           0.000000
25%           2.000000
50%           4.000000
75%           5.000000
max           6.000000
Name: srch_co_dayofweek, dtype: float64






    












    



-------------------------
Column: srch_days

count    133348.000000
mean          2.281197
std           1.804944
min           1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max          17.000000
Name: srch_days, dtype: float64



In [11]:

    
# @TODO: plot bivariate charts



In [25]:

    
sns.stripplot(x="user_location_country", y="hotel_cluster", jitter=True, data=df[
            (df["hotel_country"] == 19) & (df["year"] == 2014)])
plt.show()
sns.stripplot(x="user_location_country", y="hotel_cluster", jitter=True ,data=df[
            (df["hotel_country"] == 20) & (df["year"] == 2014)])
plt.show()
sns.stripplot(x="user_location_country", y="hotel_cluster", jitter=True, data=df[
            (df["hotel_country"] == 21) & (df["year"] == 2014)])
plt.show()



In [34]:

    
cols = [u'site_name', u'posa_continent', u'user_location_country',
       u'channel', u'srch_adults_cnt',
       u'srch_children_cnt', u'srch_rm_cnt', u'srch_destination_id',
       u'is_booking', u'cnt', u'hotel_continent',
       u'hotel_country', u'hotel_market', u'hotel_cluster', u'year', u'month',
       u'hour', u'dayofweek', u'srch_ci_month', u'srch_ci_dayofweek',
       u'srch_co_month', u'srch_co_dayofweek', u'srch_days']

for i in cols:
    g = sns.FacetGrid(
        df[
            (
                (df["hotel_country"] == 19) | 
                (df["hotel_country"] == 20) | 
                (df["hotel_country"] == 21) | 
                (df["hotel_country"] == 22) 
            ) 
            & (df["year"] == 2014)
            & (df["srch_co_month"] == 8)
        ], col="hotel_country")
    g.map(sns.stripplot, i, "hotel_cluster", jitter=True)
    #g.add_legenad();
    #sns.boxplot(x=i, y="hotel_cluster", data=df[
    #        (df["hotel_country"] == 21) & (df["year"] == 2014)])
    plt.show()



In [29]:

    
sns.stripplot(
    x="hotel_market", y="hotel_cluster", 
    data=df[df["hotel_country"] == 48], jitter=True)









    Out[29]:





<matplotlib.axes._subplots.AxesSubplot at 0x10fa02c10>



In [32]:

    
def biVariate(df):
    cols = [
        #"srch_destination_id"]
        "hotel_continent", "hotel_country",
        "srch_adults_cnt", "srch_rm_cnt", "srch_children_cnt", 
        "srch_destination_type_id", "srch_destination_id",
        "month", "year", "hour", "dayofweek", "srch_ci_month", 
        "srch_ci_dayofweek", "srch_co_month", "srch_co_dayofweek", "srch_days"]
    for i in cols:
        sns.stripplot(x=i, y="hotel_cluster", data=df, jitter=True)
        plt.show()
        sns.boxplot(x=i, y="hotel_cluster", data=df)
        plt.show()
biVariate(df.sample(50000))









    












    












    












    












    












    












    












    












    












    












    












    












    












    












    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-32-19e7a8ba78c8> in <module>()
     12         sns.boxplot(x=i, y="hotel_cluster", data=df)
     13         plt.show()
---> 14 biVariate(df.sample(50000))

<ipython-input-32-19e7a8ba78c8> in biVariate(df)
      8         "srch_ci_dayofweek", "srch_co_month", "srch_co_dayofweek", "srch_days"]
      9     for i in cols:
---> 10         sns.stripplot(x=i, y="hotel_cluster", data=df, jitter=True)
     11         plt.show()
     12         sns.boxplot(x=i, y="hotel_cluster", data=df)

/Users/muatik/venv/lib/python2.7/site-packages/seaborn/categorical.pyc in stripplot(x, y, hue, data, order, hue_order, jitter, split, orient, color, palette, size, edgecolor, linewidth, ax, **kwargs)
   2513                        linewidth=linewidth))
   2514 
-> 2515     plotter.plot(ax, kwargs)
   2516     return ax
   2517 

/Users/muatik/venv/lib/python2.7/site-packages/seaborn/categorical.pyc in plot(self, ax, kws)
   1165     def plot(self, ax, kws):
   1166         """Make the plot."""
-> 1167         self.draw_stripplot(ax, kws)
   1168         self.add_legend_data(ax)
   1169         self.annotate_axes(ax)

/Users/muatik/venv/lib/python2.7/site-packages/seaborn/categorical.pyc in draw_stripplot(self, ax, kws)
   1141                 cat_pos = np.ones(strip_data.size) * i
   1142                 cat_pos += self.jitterer(len(strip_data))
-> 1143                 kws.update(c=self.point_colors[i][hue_mask])
   1144                 if self.orient == "v":
   1145                     ax.scatter(cat_pos, strip_data, **kws)

/Users/muatik/venv/lib/python2.7/site-packages/seaborn/categorical.pyc in point_colors(self)
   1077                 # Use the same color for all points at this level
   1078                 group_color = self.colors[i]
-> 1079                 group_colors[:] = group_color
   1080 
   1081             else:

KeyboardInterrupt:



In [21]:

    
sns.stripplot(x="hotel_country", y="hotel_market", data=df, jitter=True)









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x1152dc3d0>



In [14]:

    
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(df["hotel_cluster"], df["hotel_country"], df["srch_days"], c="b")









    Out[14]:





<mpl_toolkits.mplot3d.art3d.Patch3DCollection at 0x7f8ed62a5150>



In [53]:

    
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 3833548 to 2732975
Data columns (total 33 columns):
date_time                    50000 non-null datetime64[ns]
site_name                    50000 non-null int64
posa_continent               50000 non-null int64
user_location_country        50000 non-null int64
user_location_region         50000 non-null int64
user_location_city           50000 non-null int64
orig_destination_distance    33080 non-null float64
user_id                      50000 non-null int64
is_mobile                    50000 non-null int64
is_package                   50000 non-null int64
channel                      50000 non-null int64
srch_ci                      50000 non-null object
srch_co                      50000 non-null object
srch_adults_cnt              50000 non-null int64
srch_children_cnt            50000 non-null int64
srch_rm_cnt                  50000 non-null int64
srch_destination_id          50000 non-null int64
srch_destination_type_id     50000 non-null int64
is_booking                   50000 non-null int64
cnt                          50000 non-null int64
hotel_continent              50000 non-null int64
hotel_country                50000 non-null int64
hotel_market                 50000 non-null int64
hotel_cluster                50000 non-null int64
year                         50000 non-null int32
month                        50000 non-null int32
hour                         50000 non-null int32
dayofweek                    50000 non-null int32
srch_ci_month                50000 non-null int32
srch_ci_dayofweek            50000 non-null int32
srch_co_month                50000 non-null int32
srch_co_dayofweek            50000 non-null int32
srch_days                    50000 non-null int64
dtypes: datetime64[ns](1), float64(1), int32(8), int64(21), object(2)
memory usage: 11.4+ MB



In [2]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation



In [11]:

    
clf = RandomForestClassifier(n_estimators=10)

a = df[[
        "hotel_continent",
        "hotel_market",
        "user_location_country",
        "srch_destination_type_id", 
        "hotel_country", 
        "is_package",
        "srch_ci_month", 
        "srch_days",
        "hotel_cluster"
    ]]

X = a.drop(labels=["hotel_cluster"], axis=1)
y = a["hotel_cluster"]

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)


clf.fit(X_train, y_train)
clf.score(X_test, y_test)









    Out[11]:





0.13171250000000001



In [ ]:

    
from sklearn.svm import SVC

clf = SVC(kernel="rbf", gamma=0.002)

a = df[[
        "hotel_continent",
        "hotel_market",
        "user_location_country",
        "srch_destination_type_id", 
        "hotel_country", 
        "is_package",
        "srch_ci_month", 
        "srch_days",
        "hotel_cluster"
    ]]

X = a.drop(labels=["hotel_cluster"], axis=1)
y = a["hotel_cluster"]

X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)


clf.fit(X_train, y_train)
clf.score(X_test, y_test)



In [2]:

    
dest = pd.read_csv("data/expedia/destinations.csv")



In [4]:

    
dest.sample(5)









    Out[4]:






  
    
      
      srch_destination_id
      d1
      d2
      d3
      d4
      d5
      d6
      d7
      d8
      d9
      ...
      d140
      d141
      d142
      d143
      d144
      d145
      d146
      d147
      d148
      d149
    
  
  
    
      26231
      27437
      -2.207447
      -2.213283
      -2.213283
      -2.184263
      -2.042806
      -2.158912
      -2.213283
      -2.213283
      -2.213283
      ...
      -2.213283
      -2.183072
      -2.177808
      -2.213283
      -2.114200
      -2.213283
      -2.181598
      -2.213283
      -2.213283
      -2.152779
    
    
      24091
      25276
      -2.180774
      -2.180774
      -2.180774
      -2.180774
      -2.180774
      -2.149468
      -2.180774
      -2.180774
      -2.180774
      ...
      -2.180774
      -2.180774
      -2.180774
      -2.180774
      -2.180774
      -2.180774
      -2.180774
      -2.180774
      -2.180774
      -2.180774
    
    
      50521
      52761
      -2.311128
      -2.300818
      -2.313542
      -2.208092
      -1.889265
      -1.681330
      -2.313542
      -2.313542
      -2.253724
      ...
      -2.312284
      -2.313195
      -2.313542
      -2.268086
      -2.268086
      -2.313542
      -2.313542
      -2.313542
      -2.313542
      -2.201945
    
    
      9479
      9783
      -2.181958
      -2.181958
      -2.181958
      -2.181958
      -2.181958
      -2.172306
      -2.181958
      -2.181958
      -2.172306
      ...
      -2.172306
      -2.181958
      -2.181958
      -2.181958
      -2.181958
      -2.181958
      -2.181958
      -2.181958
      -2.181958
      -2.181958
    
    
      24922
      26116
      -2.192287
      -2.192287
      -2.192287
      -2.192287
      -2.192287
      -2.155291
      -2.192287
      -2.192287
      -2.192287
      ...
      -2.185266
      -2.192287
      -2.192287
      -2.192287
      -2.192287
      -2.192287
      -2.192287
      -2.192287
      -2.192287
      -2.192287
    
  

5 rows × 150 columns



In [11]:

    
import pymongo
from pprint import pprint as pp
from datetime import datetime



In [60]:

    
db = pymongo.MongoClient("192.168.5.5")["expedia2"]



In [70]:

    
i = 0
with open("data/expedia/train.csv") as f:
    cache = []
    headers = f.readline().strip().split(",") # reading the header line, we do not need this.
    strptime_format1 = "%Y-%m-%d %H:%M:%S"
    strptime_format2 = "%Y-%m-%d"
    
    def toInt(line):
        for h in headers:
            if h not in ["date_time", "srch_ci", "srch_co"]:
                line[h] = float(line[h])
        return line
    
    for line in f:
        i += 1
        line = line.strip().split(",")
        if line[18] == 0:
            continue

        line = dict(zip(headers, line))
        if line["orig_destination_distance"] == "":
            continue
        
        if line["srch_ci"] == "" or line["srch_co"] == "":
            continue

#         line["date_time"] = datetime.strptime(line["date_time"], strptime_format)
#         line["srch_ci"] = datetime.strptime(line["srch_ci"], strptime_format2)
#         line["srch_co"] = datetime.strptime(line["srch_co"], strptime_format2)
#         line = toInt(line)
#         pp(line)
        
        cache.append(line)
        if len(cache) == 10000:
            # print "inserting , ", i, len(cache)
            db["train2"].insert_many(cache)
            cache = []

        if i % 40000 == 0:
            print i









    



40000
80000
120000
160000
280000
320000
360000
400000






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-70-c4a60c6c5f4d> in <module>()
     34         if len(cache) == 10000:
     35             # print "inserting , ", i, len(cache)
---> 36             db["train2"].insert_many(cache)
     37             cache = []
     38 

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/collection.pyc in insert_many(self, documents, ordered, bypass_document_validation)
    677         blk = _Bulk(self, ordered, bypass_document_validation)
    678         blk.ops = [doc for doc in gen()]
--> 679         blk.execute(self.write_concern.document)
    680         return InsertManyResult(inserted_ids, self.write_concern.acknowledged)
    681 

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/bulk.pyc in execute(self, write_concern)
    468                 return self.execute_command(sock_info, generator, write_concern)
    469             else:
--> 470                 return self.execute_legacy(sock_info, generator, write_concern)
    471 
    472 

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/bulk.pyc in execute_legacy(self, sock_info, generator, write_concern)
    391                                      self.ordered,
    392                                      write_concern=write_concern,
--> 393                                      op_id=op_id)
    394                         result = {}
    395                     elif run.op_type == _UPDATE:

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/collection.pyc in _insert(self, sock_info, docs, ordered, check_keys, manipulate, write_concern, op_id, bypass_doc_val)
    528             return self._insert_one(
    529                 sock_info, docs, ordered,
--> 530                 check_keys, manipulate, write_concern, op_id, bypass_doc_val)
    531 
    532         ids = []

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/collection.pyc in _insert_one(self, sock_info, doc, ordered, check_keys, manipulate, write_concern, op_id, bypass_doc_val)
    517                 sock_info, 'insert', command, acknowledged, op_id,
    518                 bypass_doc_val, message.insert, self.__full_name, [doc],
--> 519                 check_keys, acknowledged, concern, False, self.codec_options)
    520         if not isinstance(doc, RawBSONDocument):
    521             return doc.get('_id')

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/collection.pyc in _legacy_write(self, sock_info, name, cmd, acknowledged, op_id, bypass_doc_val, func, *args)
    456         try:
    457             result = sock_info.legacy_write(
--> 458                 rqst_id, msg, max_size, acknowledged)
    459         except Exception as exc:
    460             if publish:

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/pool.pyc in legacy_write(self, request_id, msg, max_doc_size, with_last_error)
    264         self.send_message(msg, max_doc_size)
    265         if with_last_error:
--> 266             response = self.receive_message(1, request_id)
    267             return helpers._check_gle_response(response)
    268 

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/pool.pyc in receive_message(self, operation, request_id)
    244                 self.sock, operation, request_id, self.max_message_size)
    245         except BaseException as error:
--> 246             self._raise_connection_failure(error)
    247 
    248     def legacy_write(self, request_id, msg, max_doc_size, with_last_error):

/Users/muatik/venv/lib/python2.7/site-packages/pymongo/pool.pyc in _raise_connection_failure(self, error)
    344             _raise_connection_failure(self.address, error)
    345         else:
--> 346             raise error
    347 
    348     def __eq__(self, other):

KeyboardInterrupt:



In [71]:

    
headers









    Out[71]:





['date_time',
 'site_name',
 'posa_continent',
 'user_location_country',
 'user_location_region',
 'user_location_city',
 'orig_destination_distance',
 'user_id',
 'is_mobile',
 'is_package',
 'channel',
 'srch_ci',
 'srch_co',
 'srch_adults_cnt',
 'srch_children_cnt',
 'srch_rm_cnt',
 'srch_destination_id',
 'srch_destination_type_id',
 'is_booking',
 'cnt',
 'hotel_continent',
 'hotel_country',
 'hotel_market',
 'hotel_cluster']



In [ ]:

	Unnamed: 0	date_time	site_name	posa_continent	user_location_country	user_location_region	user_location_city	orig_destination_distance	user_id	is_mobile	...	hotel_cluster	year	month	hour	dayofweek	srch_ci_month	srch_ci_dayofweek	srch_co_month	srch_co_dayofweek	srch_days
166397	650730	2014-07-17 10:10:29	13	1	46	347	22254	NaN	991404	0	...	62	2014	7	10	3	7	6	7	6	7
128815	4176709	2014-11-24 11:26:08	2	3	66	363	14898	NaN	556	0	...	16	2014	11	11	0	12	5	12	6	1
23124	4896682	2014-07-16 20:17:38	2	3	66	442	46296	933.2417	646444	1	...	52	2014	7	20	2	8	5	8	2	4
89394	4749603	2014-06-18 06:02:52	2	3	66	226	42300	152.4633	263119	0	...	59	2014	6	6	2	6	3	6	4	1
33491	922952	2014-02-22 10:06:39	2	3	66	442	35390	5461.4049	653311	0	...	29	2014	2	10	5	3	6	3	5	6

	Unnamed: 0	site_name	posa_continent	user_location_country	user_location_region	user_location_city	orig_destination_distance	user_id	is_mobile	is_package	...	hotel_cluster	year	month	hour	dayofweek	srch_ci_month	srch_ci_dayofweek	srch_co_month	srch_co_dayofweek	srch_days
count	200000.000000	200000.00000	200000.000000	200000.000000	200000.000000	200000.000000	133493.000000	200000.000000	200000.000000	200000.000000	...	200000.000000	200000	200000.00000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.000000	200000.00000
mean	2968980.380510	8.90200	2.710410	86.407160	310.592555	27742.208435	1680.915612	600921.967240	0.103080	0.124855	...	47.672305	2014	7.21187	13.309285	2.721295	7.228360	3.109355	7.149140	3.297860	2.40495
std	1733152.575779	11.55783	0.727132	57.356216	201.268131	16744.860994	2178.921124	346603.687128	0.304064	0.330555	...	28.963255	0	3.25968	5.461572	1.954934	3.328379	1.933203	3.353591	2.073702	2.02480
min	1.000000	2.00000	0.000000	0.000000	0.000000	0.000000	0.005600	12.000000	0.000000	0.000000	...	0.000000	2014	1.00000	0.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.00000
25%	1464997.250000	2.00000	3.000000	66.000000	174.000000	13134.000000	213.091900	296765.000000	0.000000	0.000000	...	22.000000	2014	5.00000	9.000000	1.000000	5.000000	1.000000	5.000000	1.000000	1.00000
50%	2952695.000000	2.00000	3.000000	66.000000	314.000000	27655.000000	765.762500	602792.500000	0.000000	0.000000	...	46.000000	2014	7.00000	13.000000	3.000000	8.000000	3.000000	8.000000	3.000000	2.00000
75%	4467778.500000	11.00000	3.000000	69.000000	385.000000	42328.000000	2196.837900	906969.000000	0.000000	0.000000	...	72.000000	2014	10.00000	18.000000	4.000000	10.000000	5.000000	10.000000	5.000000	3.00000
max	5999992.000000	53.00000	4.000000	239.000000	1025.000000	56507.000000	11917.844900	1198784.000000	1.000000	1.000000	...	99.000000	2014	12.00000	23.000000	6.000000	12.000000	6.000000	12.000000	6.000000	28.00000

	srch_destination_id	d1	d2	d3	d4	d5	d6	d7	d8	d9	...	d140	d141	d142	d143	d144	d145	d146	d147	d148	d149
26231	27437	-2.207447	-2.213283	-2.213283	-2.184263	-2.042806	-2.158912	-2.213283	-2.213283	-2.213283	...	-2.213283	-2.183072	-2.177808	-2.213283	-2.114200	-2.213283	-2.181598	-2.213283	-2.213283	-2.152779
24091	25276	-2.180774	-2.180774	-2.180774	-2.180774	-2.180774	-2.149468	-2.180774	-2.180774	-2.180774	...	-2.180774	-2.180774	-2.180774	-2.180774	-2.180774	-2.180774	-2.180774	-2.180774	-2.180774	-2.180774
50521	52761	-2.311128	-2.300818	-2.313542	-2.208092	-1.889265	-1.681330	-2.313542	-2.313542	-2.253724	...	-2.312284	-2.313195	-2.313542	-2.268086	-2.268086	-2.313542	-2.313542	-2.313542	-2.313542	-2.201945
9479	9783	-2.181958	-2.181958	-2.181958	-2.181958	-2.181958	-2.172306	-2.181958	-2.181958	-2.172306	...	-2.172306	-2.181958	-2.181958	-2.181958	-2.181958	-2.181958	-2.181958	-2.181958	-2.181958	-2.181958
24922	26116	-2.192287	-2.192287	-2.192287	-2.192287	-2.192287	-2.155291	-2.192287	-2.192287	-2.192287	...	-2.185266	-2.192287	-2.192287	-2.192287	-2.192287	-2.192287	-2.192287	-2.192287	-2.192287	-2.192287