In [1]:
from pandas import Series, DataFrame
import pandas as pd


%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
destinations = pd.read_csv("destinations.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-2-69ec021b9a60> in <module>()
      1 destinations = pd.read_csv("destinations.csv")
----> 2 test = pd.read_csv("test.csv")
      3 train = pd.read_csv("train.csv")

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    560                     skip_blank_lines=skip_blank_lines)
    561 
--> 562         return _read(filepath_or_buffer, kwds)
    563 
    564     parser_f.__name__ = name

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds)
    323         return parser
    324 
--> 325     return parser.read()
    326 
    327 _parser_defaults = {

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\io\parsers.pyc in read(self, nrows)
    821         index, columns, col_dict = self._create_index(ret)
    822 
--> 823         df = DataFrame(col_dict, columns=columns, index=index)
    824 
    825         if self.squeeze and len(df.columns) == 1:

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\core\frame.pyc in __init__(self, data, index, columns, dtype, copy)
    222                                  dtype=dtype, copy=copy)
    223         elif isinstance(data, dict):
--> 224             mgr = self._init_dict(data, index, columns, dtype=dtype)
    225         elif isinstance(data, ma.MaskedArray):
    226             import numpy.ma.mrecords as mrecords

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _init_dict(self, data, index, columns, dtype)
    358             arrays = [data[k] for k in keys]
    359 
--> 360         return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    361 
    362     def _init_ndarray(self, values, index, columns, dtype=None, copy=False):

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\core\frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5239     axes = [_ensure_index(columns), _ensure_index(index)]
   5240 
-> 5241     return create_block_manager_from_arrays(arrays, arr_names, axes)
   5242 
   5243 

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\core\internals.pyc in create_block_manager_from_arrays(arrays, names, axes)
   3997 
   3998     try:
-> 3999         blocks = form_blocks(arrays, names, axes)
   4000         mgr = BlockManager(blocks, axes)
   4001         mgr._consolidate_inplace()

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\core\internals.pyc in form_blocks(arrays, names, axes)
   4074 
   4075     if len(int_items):
-> 4076         int_blocks = _multi_blockify(int_items)
   4077         blocks.extend(int_blocks)
   4078 

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\core\internals.pyc in _multi_blockify(tuples, dtype)
   4143     for dtype, tup_block in grouper:
   4144 
-> 4145         values, placement = _stack_arrays(list(tup_block), dtype)
   4146 
   4147         block = make_block(values, placement=placement)

C:\Users\Summer\Anaconda2\lib\site-packages\pandas\core\internals.pyc in _stack_arrays(tuples, dtype)
   4186     shape = (len(arrays),) + _shape_compat(first)
   4187 
-> 4188     stacked = np.empty(shape, dtype=dtype)
   4189     for i, arr in enumerate(arrays):
   4190         stacked[i] = _asarray_compat(arr)

MemoryError: 

Convert date time type to seperate the train and test set. becasue the test set data time have to be come later than the train set


In [ ]:
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

pick random 10000 users row as our train data set


In [ ]:
import random

unique_users = train.user_id.unique()

sel_user_ids = [unique_users[i] for i in sorted(random.sample(range(len(unique_users)), 10000)) ]
sel_train = train[train.user_id.isin(sel_user_ids)]

In [ ]:
t1 = sel_train[((sel_train.year == 2013) | ((sel_train.year == 2014) & (sel_train.month < 8)))]
t2 = sel_train[((sel_train.year == 2014) & (sel_train.month >= 8))]

In [ ]:
# remove the empty bookinf in test set
t2 = t2[t2.is_booking == True]

Simple predication: use the most 5 common cluster as predication for each data in test


In [ ]:
t2[:10]

In [ ]:
most_common_clusters = list(train.hotel_cluster.value_counts().head().index)

In [ ]:
predictions = [most_common_clusters for i in range(t2.shape[0])]

In [ ]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
dest_small = pca.fit_transform(destinations[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = destinations["srch_destination_id"]

In [ ]: