In [3]:
# coding: utf-8
__author__ = 'Sandro Vega Pons : https://www.kaggle.com/svpons'

'''Partially based on grid_plus_classifier script:
https://www.kaggle.com/svpons/facebook-v-predicting-check-ins/grid-plus-classifier
'''

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

In [1]:
def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid.
    """
    #Creating the grid
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    eps = 0.00001  #keeps border points (x = 10 or y = 10) inside the last grid cell
    xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    pos_x = (xs / size_x).astype(np.int)
    pos_y = (ys / size_y).astype(np.int)
    df['grid_cell'] = pos_y * n_cell_x + pos_x
    
    #Feature engineering
    fw = [800, 1000, 4, 3, 1./22., 2, 10] #feature weights (black magic here)
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]
    initial_date = np.datetime64('2014-01-01T01:01')  #minute resolution
    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(int(mn), 'm')
                               for mn in df.time.values)
    df['hour'] = (d_times.hour + d_times.minute/60) * fw[2]
    df['weekday'] = d_times.weekday * fw[3]
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]

    df = df.drop(['time'], axis=1) 
    return df
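
As a quick sanity check of the grid indexing above (a minimal sketch with a made-up point): with a 20 x 40 grid each cell spans 0.5 x 0.25, so a check-in at (x, y) = (3.1, 7.4) should land in column 6, row 29, i.e. grid_cell = 29 * 20 + 6 = 586.

In [ ]:
# Toy check of the grid_cell formula used in prepare_data (the point is made up)
n_cell_x, n_cell_y = 20, 40
size_x, size_y = 10. / n_cell_x, 10. / n_cell_y   # 0.5 and 0.25
x, y = 3.1, 7.4            # hypothetical check-in location
pos_x = int(x / size_x)    # column index: 6
pos_y = int(y / size_y)    # row index: 29
print(pos_y * n_cell_x + pos_x)   # 586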

In [2]:
def process_one_cell(df_train, df_test, grid_id, th):
    """   
    Classification inside one grid cell.
    """   
    #Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)
    
    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=30, weights='distance', 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])    
    return pred_labels, row_ids
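
The last step above keeps the three most probable classes per test row. A minimal sketch of the argsort/reverse/slice idiom on made-up probabilities:

In [ ]:
# How the top-3 class indices come out of predict_proba output (toy probabilities)
import numpy as np
y_pred = np.array([[0.1, 0.5, 0.3, 0.1],
                   [0.6, 0.1, 0.1, 0.2]])
top3 = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]   # column indices, most probable first
print(top3)
# [[1 2 3]
#  [0 3 2]]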

In [ ]:
def process_grid(df_train, df_test, th, n_cells):
    """
    Iterates over all grid cells, aggregates the results and makes the
    submission.
    """ 
    preds = np.zeros((df_test.shape[0], 3), dtype=int)
    
    for g_id in range(n_cells):
        if g_id % 100 == 0:
            print('iter: %s' %(g_id))
        
        #Applying classifier to one grid cell
        pred_labels, row_ids = process_one_cell(df_train, df_test, g_id, th)

        #Updating predictions
        preds[row_ids] = pred_labels

    print('Generating submission file ...')
    #Auxiliary dataframe with the 3 best predictions for each sample
    df_aux = pd.DataFrame(preds, dtype=str, columns=['l1', 'l2', 'l3'])  
    
    #Concatenating the 3 predictions for each sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')
    
    #Writing to csv
    ds_sub.name = 'place_id'
    ds_sub.to_csv('output/sub_knn.csv', index=True, header=True, index_label='row_id')
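
The submission needs a single 'place_id' column holding the three guesses separated by spaces. A minimal sketch of the str.cat step on made-up ids (already strings):

In [ ]:
# Toy version of the concatenation that builds the 'place_id' submission column
import pandas as pd
df_aux = pd.DataFrame([['111', '222', '333'],
                       ['444', '555', '666']], columns=['l1', 'l2', 'l3'])
ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')
print(ds_sub.tolist())   # ['111 222 333', '444 555 666']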

In [18]:
print('Loading data ...')
df_train = pd.read_csv('../input/train.csv',
                           usecols=['row_id','x','y','accuracy','time','place_id'], 
                           index_col = 0)
df_test = pd.read_csv('../input/test.csv',
                          usecols=['row_id','x','y','time'],
                          index_col = 0)

#Defining the size of the grid
n_cell_x = 20
n_cell_y = 40

#Optional accuracy filter (commented out)
#df_train = df_train[df_train['accuracy'] > -1 ]
#df_train = df_train.drop('accuracy', 1)

print('Preparing train data')
df_train = prepare_data(df_train, n_cell_x, n_cell_y)

print('Preparing test data')
df_test = prepare_data(df_test, n_cell_x, n_cell_y)

#Solving classification problems inside each grid cell
th = 5 #Keeping place_ids with at least th samples.


Loading data ...
Preparing train data
Preparing test data

In [105]:
df_train.loc[df_train['grid_cell']==1]


Out[105]:
x y accuracy place_id grid_cell hour weekday day month year
row_id
402 696.00 68.1 65 7131826933 1 36 9 0 2 20
519 475.12 6.0 66 1809053339 1 20 0 9 14 10
741 770.96 218.8 4 2198399075 1 36 3 8 12 20
2641 660.00 144.1 58 7121847139 1 0 12 3 6 20
3386 517.84 150.3 68 2043954571 1 60 0 2 4 20
5329 689.44 247.7 70 2190112329 1 64 12 1 4 20
5483 585.92 183.1 32 5059842720 1 60 12 7 12 20
7691 727.76 67.2 186 1342336464 1 48 6 0 2 10
7699 719.28 238.3 139 6398563809 1 28 6 7 12 20
9679 552.64 182.3 36 4388715331 1 76 18 11 18 10
9895 744.48 84.6 55 3181810487 1 0 3 15 24 10
9965 781.92 148.9 25 8304719796 1 64 3 14 22 10
10517 487.84 38.5 6 1809053339 1 32 0 3 6 20
12947 469.36 229.8 54 1387737890 1 88 12 3 6 10
13877 483.60 150.2 57 1836170873 1 8 12 11 18 10
14748 520.48 61.2 47 6238105828 1 28 3 0 2 20
14843 636.96 96.1 57 9631254236 1 40 6 2 6 10
15076 657.84 130.5 74 7121847139 1 92 15 3 6 20
15219 729.76 125.5 64 8710316056 1 12 6 7 12 20
15882 735.12 82.2 44 6803385643 1 60 3 3 6 10
16197 606.48 242.2 113 8851116683 1 16 12 2 6 20
16662 715.60 64.2 29 1342336464 1 48 15 11 18 10
17513 735.04 30.2 175 1425204074 1 32 18 6 10 20
17813 404.08 211.0 217 8508406089 1 88 18 12 18 10
17861 715.60 52.1 160 1342336464 1 32 15 3 6 10
18932 761.36 8.3 74 5530844814 1 0 0 6 10 20
19381 444.40 52.7 88 3097190601 1 44 18 8 14 10
19820 543.92 210.0 15 7300335457 1 48 6 3 6 10
20249 471.04 208.4 131 1481132469 1 24 15 5 8 20
21005 453.68 189.6 67 1292505932 1 52 3 7 12 20
... ... ... ... ... ... ... ... ... ... ...
29097499 454.48 191.6 66 1292505932 1 52 9 1 2 10
29097544 484.00 226.6 66 9019303744 1 40 6 15 24 10
29098110 577.68 233.1 66 5108647025 1 12 15 3 6 10
29099505 500.80 138.9 108 7517407233 1 32 12 5 10 20
29100200 602.24 219.1 162 6260881309 1 4 9 7 12 10
29100269 446.08 162.6 13 7920940815 1 4 6 10 16 10
29101757 604.08 94.5 160 1890025047 1 4 0 2 4 20
29102065 409.92 62.8 168 7858695423 1 60 18 4 6 10
29102549 480.64 245.8 64 9019303744 1 40 6 2 6 10
29103454 476.88 216.3 58 9019303744 1 32 15 16 24 10
29103735 616.80 85.8 165 1890025047 1 88 3 0 2 20
29105422 565.68 37.8 162 2874035592 1 0 12 13 20 10
29106388 768.32 223.1 18 2198399075 1 24 18 5 8 10
29106548 500.00 219.2 74 7300335457 1 76 0 8 14 10
29106644 417.28 177.4 66 1254758593 1 84 15 6 10 20
29106768 794.72 10.3 2 2332100459 1 60 0 12 18 10
29106884 760.08 67.1 2 6803385643 1 88 3 5 10 10
29107074 597.52 238.6 13 2190112329 1 60 9 1 4 20
29107309 737.20 90.1 59 1890025047 1 4 18 14 22 10
29107551 439.52 196.6 35 5957043142 1 0 12 5 8 20
29107988 653.84 98.2 71 6803385643 1 92 12 4 8 20
29108356 798.56 99.4 15 3181810487 1 48 9 4 8 20
29109439 561.92 34.8 74 2874035592 1 56 15 4 8 20
29112877 792.08 44.5 52 5876429395 1 64 12 14 22 10
29112973 752.32 109.4 71 1918831201 1 76 3 14 22 10
29114024 544.32 240.9 9 2207140084 1 32 0 3 6 10
29114894 655.68 96.7 2 4690524967 1 0 18 4 8 10
29115223 784.00 16.4 166 6323629255 1 84 6 14 22 10
29116559 630.16 20.7 62 2046572374 1 44 12 6 10 20
29116686 404.32 42.9 58 7858695423 1 56 3 8 12 20

36087 rows × 10 columns


In [106]:
# prepare training and testing data for cross validation
from sklearn.cross_validation import train_test_split
df_train_sample = df_train.loc[df_train['grid_cell']==1]
X = df_train_sample.drop(['place_id','grid_cell'],1).values.astype(int)
y = df_train_sample['place_id'].values.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [112]:
from sklearn.cross_validation import KFold
kf_total = KFold(len(X_train), n_folds=5)

In [113]:
clf = KNeighborsClassifier(n_neighbors=1, weights='distance', metric='manhattan')

In [114]:
[clf.fit(X_train[train_index], y_train[train_index])
    .score(X_train[test_index], y_train[test_index])
 for train_index, test_index in kf_total]


Out[114]:
[0.47644613785936957,
 0.4715968133010045,
 0.48250779355732593,
 0.47332871492899203,
 0.48051273168196779]
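
The five fold scores above can also be reproduced in a single call with cross_val_score; a minimal sketch using the same (older) sklearn.cross_validation API and the X_train, y_train and kf_total defined above:

In [ ]:
# One-call equivalent of the KFold list comprehension above
from sklearn.cross_validation import cross_val_score
clf = KNeighborsClassifier(n_neighbors=1, weights='distance', metric='manhattan')
scores = cross_val_score(clf, X_train, y_train, cv=kf_total)
print(scores, scores.mean())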

In [50]:
kf_total.n


Out[50]:
23294416

In [55]:
X_train[1,2,]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-55-d301e6cb314b> in <module>()
----> 1 X_train[1,2,]

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

/usr/local/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1348         res = cache.get(item)
   1349         if res is None:
-> 1350             values = self._data.get(item)
   1351             res = self._box_item_values(item, values)
   1352             cache[item] = res

/usr/local/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3288 
   3289             if not isnull(item):
-> 3290                 loc = self.items.get_loc(item)
   3291             else:
   3292                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/local/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
   1945                 return self._engine.get_loc(key)
   1946             except KeyError:
-> 1947                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   1948 
   1949         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: (1, 2)

In [57]:
test_x = X_train[:10]

In [60]:
kf_test = KFold(len(test_x), n_folds=2)

In [61]:
[(x,y) for x,y in kf_test]


Out[61]:
[(array([5, 6, 7, 8, 9]), array([0, 1, 2, 3, 4])),
 (array([0, 1, 2, 3, 4]), array([5, 6, 7, 8, 9]))]

In [66]:
test_x_matrix = test_x.as_matrix()

In [84]:
test_x.ix[15192880,]


Out[84]:
x            3911.76
y            6567.60
accuracy       38.00
grid_cell     529.00
hour           60.00
weekday         3.00
day             7.00
month          12.00
year           10.00
Name: 15192880, dtype: float64

In [43]:
len(X_train)


Out[43]:
23294416

In [44]:
18635533+4658883


Out[44]:
23294416

In [29]:
[(len(train_index), len(test_index)) for train_index, test_index in kf_total]


Out[29]:
[(18635532, 4658884),
 (18635533, 4658883),
 (18635533, 4658883),
 (18635533, 4658883),
 (18635533, 4658883)]

In [ ]:
len(X_train)

In [ ]:
# single-split scoring template; lr, x, y, train_indices and test_indices are placeholders from the KFold recipe above
lr.fit(x[train_indices], y[train_indices]).score(x[test_indices], y[test_indices])

In [ ]:
process_grid(df_train, df_test, th, n_cell_x*n_cell_y)