In [3]:
# coding: utf-8
__author__ = 'Sandro Vega Pons : https://www.kaggle.com/svpons'

'''Partially based on grid_plus_classifier script:
https://www.kaggle.com/svpons/facebook-v-predicting-check-ins/grid-plus-classifier
'''

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

In [1]:
def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid.
    """
    #Creating the grid
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    eps = 0.00001  #keeps border points (x = 10 or y = 10) inside the last grid cell
    xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    pos_x = (xs / size_x).astype(np.int)
    pos_y = (ys / size_y).astype(np.int)
    df['grid_cell'] = pos_y * n_cell_x + pos_x
    
    #Feature engineering
    fw = [800, 1000, 4, 3, 1./22., 2, 10] #feature weights (black magic here)
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]
    initial_date = np.datetime64('2014-01-01T01:01')  #minute resolution
    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(int(mn), 'm')
                               for mn in df.time.values)
    df['hour'] = (d_times.hour + d_times.minute/60) * fw[2]
    df['weekday'] = d_times.weekday * fw[3]
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]

    df = df.drop(['time'], axis=1) 
    return df
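
As a quick sanity check of the grid indexing above (a minimal sketch with a made-up point): with a 20 x 40 grid each cell spans 0.5 x 0.25, so a check-in at (x, y) = (3.1, 7.4) should land in column 6, row 29, i.e. grid_cell = 29 * 20 + 6 = 586.

In [ ]:
# Toy check of the grid_cell formula used in prepare_data (the point is made up)
n_cell_x, n_cell_y = 20, 40
size_x, size_y = 10. / n_cell_x, 10. / n_cell_y   # 0.5 and 0.25
x, y = 3.1, 7.4            # hypothetical check-in location
pos_x = int(x / size_x)    # column index: 6
pos_y = int(y / size_y)    # row index: 29
print(pos_y * n_cell_x + pos_x)   # 586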

In [2]:
def process_one_cell(df_train, df_test, grid_id, th):
    """   
    Classification inside one grid cell.
    """   
    #Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)
    
    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=30, weights='distance', 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])    
    return pred_labels, row_ids
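
The last step above keeps the three most probable classes per test row. A minimal sketch of the argsort/reverse/slice idiom on made-up probabilities:

In [ ]:
# How the top-3 class indices come out of predict_proba output (toy probabilities)
import numpy as np
y_pred = np.array([[0.1, 0.5, 0.3, 0.1],
                   [0.6, 0.1, 0.1, 0.2]])
top3 = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]   # column indices, most probable first
print(top3)
# [[1 2 3]
#  [0 3 2]]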

In [ ]:
def process_grid(df_train, df_test, th, n_cells):
    """
    Iterates over all grid cells, aggregates the results and makes the
    submission.
    """ 
    preds = np.zeros((df_test.shape[0], 3), dtype=int)
    
    for g_id in range(n_cells):
        if g_id % 100 == 0:
            print('iter: %s' %(g_id))
        
        #Applying classifier to one grid cell
        pred_labels, row_ids = process_one_cell(df_train, df_test, g_id, th)

        #Updating predictions
        preds[row_ids] = pred_labels

    print('Generating submission file ...')
    #Auxiliary dataframe with the 3 best predictions for each sample
    df_aux = pd.DataFrame(preds, dtype=str, columns=['l1', 'l2', 'l3'])  
    
    #Concatenating the 3 predictions for each sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')
    
    #Writing to csv
    ds_sub.name = 'place_id'
    ds_sub.to_csv('output/sub_knn.csv', index=True, header=True, index_label='row_id')
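
The submission needs a single 'place_id' column holding the three guesses separated by spaces. A minimal sketch of the str.cat step on made-up ids (already strings):

In [ ]:
# Toy version of the concatenation that builds the 'place_id' submission column
import pandas as pd
df_aux = pd.DataFrame([['111', '222', '333'],
                       ['444', '555', '666']], columns=['l1', 'l2', 'l3'])
ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')
print(ds_sub.tolist())   # ['111 222 333', '444 555 666']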

In [18]:
print('Loading data ...')
df_train = pd.read_csv('../input/train.csv',
                           usecols=['row_id','x','y','accuracy','time','place_id'], 
                           index_col = 0)
df_test = pd.read_csv('../input/test.csv',
                          usecols=['row_id','x','y','time'],
                          index_col = 0)

#Defining the size of the grid
n_cell_x = 20
n_cell_y = 40

#Optional accuracy filter (commented out)
#df_train = df_train[df_train['accuracy'] > -1 ]
#df_train = df_train.drop('accuracy', 1)

print('Preparing train data')
df_train = prepare_data(df_train, n_cell_x, n_cell_y)

print('Preparing test data')
df_test = prepare_data(df_test, n_cell_x, n_cell_y)

#Solving classification problems inside each grid cell
th = 5 #Keeping place_ids with at least th samples.


Loading data ...
Preparing train data
Preparing test data

In [105]:
df_train.loc[df_train['grid_cell']==1]


Out[105]:
x y accuracy place_id grid_cell hour weekday day month year
row_id
402 696.00 68.1 65 7131826933 1 36 9 0 2 20
519 475.12 6.0 66 1809053339 1 20 0 9 14 10
741 770.96 218.8 4 2198399075 1 36 3 8 12 20
2641 660.00 144.1 58 7121847139 1 0 12 3 6 20
3386 517.84 150.3 68 2043954571 1 60 0 2 4 20
5329 689.44 247.7 70 2190112329 1 64 12 1 4 20
5483 585.92 183.1 32 5059842720 1 60 12 7 12 20
7691 727.76 67.2 186 1342336464 1 48 6 0 2 10
7699 719.28 238.3 139 6398563809 1 28 6 7 12 20
9679 552.64 182.3 36 4388715331 1 76 18 11 18 10
9895 744.48 84.6 55 3181810487 1 0 3 15 24 10
9965 781.92 148.9 25 8304719796 1 64 3 14 22 10
10517 487.84 38.5 6 1809053339 1 32 0 3 6 20
12947 469.36 229.8 54 1387737890 1 88 12 3 6 10
13877 483.60 150.2 57 1836170873 1 8 12 11 18 10
14748 520.48 61.2 47 6238105828 1 28 3 0 2 20
14843 636.96 96.1 57 9631254236 1 40 6 2 6 10
15076 657.84 130.5 74 7121847139 1 92 15 3 6 20
15219 729.76 125.5 64 8710316056 1 12 6 7 12 20
15882 735.12 82.2 44 6803385643 1 60 3 3 6 10
16197 606.48 242.2 113 8851116683 1 16 12 2 6 20
16662 715.60 64.2 29 1342336464 1 48 15 11 18 10
17513 735.04 30.2 175 1425204074 1 32 18 6 10 20
17813 404.08 211.0 217 8508406089 1 88 18 12 18 10
17861 715.60 52.1 160 1342336464 1 32 15 3 6 10
18932 761.36 8.3 74 5530844814 1 0 0 6 10 20
19381 444.40 52.7 88 3097190601 1 44 18 8 14 10
19820 543.92 210.0 15 7300335457 1 48 6 3 6 10
20249 471.04 208.4 131 1481132469 1 24 15 5 8 20
21005 453.68 189.6 67 1292505932 1 52 3 7 12 20
... ... ... ... ... ... ... ... ... ... ...
29097499 454.48 191.6 66 1292505932 1 52 9 1 2 10
29097544 484.00 226.6 66 9019303744 1 40 6 15 24 10
29098110 577.68 233.1 66 5108647025 1 12 15 3 6 10
29099505 500.80 138.9 108 7517407233 1 32 12 5 10 20
29100200 602.24 219.1 162 6260881309 1 4 9 7 12 10
29100269 446.08 162.6 13 7920940815 1 4 6 10 16 10
29101757 604.08 94.5 160 1890025047 1 4 0 2 4 20
29102065 409.92 62.8 168 7858695423 1 60 18 4 6 10
29102549 480.64 245.8 64 9019303744 1 40 6 2 6 10
29103454 476.88 216.3 58 9019303744 1 32 15 16 24 10
29103735 616.80 85.8 165 1890025047 1 88 3 0 2 20
29105422 565.68 37.8 162 2874035592 1 0 12 13 20 10
29106388 768.32 223.1 18 2198399075 1 24 18 5 8 10
29106548 500.00 219.2 74 7300335457 1 76 0 8 14 10
29106644 417.28 177.4 66 1254758593 1 84 15 6 10 20
29106768 794.72 10.3 2 2332100459 1 60 0 12 18 10
29106884 760.08 67.1 2 6803385643 1 88 3 5 10 10
29107074 597.52 238.6 13 2190112329 1 60 9 1 4 20
29107309 737.20 90.1 59 1890025047 1 4 18 14 22 10
29107551 439.52 196.6 35 5957043142 1 0 12 5 8 20
29107988 653.84 98.2 71 6803385643 1 92 12 4 8 20
29108356 798.56 99.4 15 3181810487 1 48 9 4 8 20
29109439 561.92 34.8 74 2874035592 1 56 15 4 8 20
29112877 792.08 44.5 52 5876429395 1 64 12 14 22 10
29112973 752.32 109.4 71 1918831201 1 76 3 14 22 10
29114024 544.32 240.9 9 2207140084 1 32 0 3 6 10
29114894 655.68 96.7 2 4690524967 1 0 18 4 8 10
29115223 784.00 16.4 166 6323629255 1 84 6 14 22 10
29116559 630.16 20.7 62 2046572374 1 44 12 6 10 20
29116686 404.32 42.9 58 7858695423 1 56 3 8 12 20

36087 rows × 10 columns


In [106]:
# prepare training and testing data for cross validation
from sklearn.cross_validation import train_test_split
df_train_sample = df_train.loc[df_train['grid_cell']==1]
X = df_train_sample.drop(['place_id','grid_cell'],1).values.astype(int)
y = df_train_sample['place_id'].values.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [112]:
from sklearn.cross_validation import KFold
kf_total = KFold(len(X_train), n_folds=5)

In [113]:
clf = KNeighborsClassifier(n_neighbors=1, weights='distance', metric='manhattan')

In [114]:
[clf.fit(X_train[train_index], y_train[train_index])
    .score(X_train[test_index], y_train[test_index])
 for train_index, test_index in kf_total]


Out[114]:
[0.47644613785936957,
 0.4715968133010045,
 0.48250779355732593,
 0.47332871492899203,
 0.48051273168196779]
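
The five fold scores above can also be reproduced in a single call with cross_val_score; a minimal sketch using the same (older) sklearn.cross_validation API and the X_train, y_train and kf_total defined above:

In [ ]:
# One-call equivalent of the KFold list comprehension above
from sklearn.cross_validation import cross_val_score
clf = KNeighborsClassifier(n_neighbors=1, weights='distance', metric='manhattan')
scores = cross_val_score(clf, X_train, y_train, cv=kf_total)
print(scores, scores.mean())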

In [50]:
kf_total.n


Out[50]:
23294416

In [55]:
X_train[1,2,]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-55-d301e6cb314b> in <module>()
----> 1 X_train[1,2,]

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1995             return self._getitem_multilevel(key)
   1996         else:
-> 1997             return self._getitem_column(key)
   1998 
   1999     def _getitem_column(self, key):

/usr/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   2002         # get column
   2003         if self.columns.is_unique:
-> 2004             return self._get_item_cache(key)
   2005 
   2006         # duplicate columns & possible reduce dimensionality

/usr/local/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1348         res = cache.get(item)
   1349         if res is None:
-> 1350             values = self._data.get(item)
   1351             res = self._box_item_values(item, values)
   1352             cache[item] = res

/usr/local/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3288 
   3289             if not isnull(item):
-> 3290                 loc = self.items.get_loc(item)
   3291             else:
   3292                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/local/lib/python2.7/site-packages/pandas/indexes/base.pyc in get_loc(self, key, method, tolerance)
   1945                 return self._engine.get_loc(key)
   1946             except KeyError:
-> 1947                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   1948 
   1949         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4154)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4018)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12368)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12322)()

KeyError: (1, 2)

In [57]:
test_x = X_train[:10]

In [60]:
kf_test = KFold(len(test_x), n_folds=2)

In [61]:
[(x,y) for x,y in kf_test]


Out[61]:
[(array([5, 6, 7, 8, 9]), array([0, 1, 2, 3, 4])),
 (array([0, 1, 2, 3, 4]), array([5, 6, 7, 8, 9]))]

In [66]:
test_x_matrix = test_x.as_matrix()

In [84]:
test_x.ix[15192880,]


Out[84]:
x            3911.76
y            6567.60
accuracy       38.00
grid_cell     529.00
hour           60.00
weekday         3.00
day             7.00
month          12.00
year           10.00
Name: 15192880, dtype: float64

In [43]:
len(X_train)


Out[43]:
23294416

In [44]:
18635533+4658883


Out[44]:
23294416

In [29]:
[(len(train_index), len(test_index)) for train_index, test_index in kf_total]


Out[29]:
[(18635532, 4658884),
 (18635533, 4658883),
 (18635533, 4658883),
 (18635533, 4658883),
 (18635533, 4658883)]

In [ ]:
len(X_train)

In [ ]:
# single-split scoring template; lr, x, y, train_indices and test_indices are placeholders from the KFold recipe above
lr.fit(x[train_indices], y[train_indices]).score(x[test_indices], y[test_indices])

In [ ]:
process_grid(df_train, df_test, th, n_cell_x*n_cell_y)