In [3]:
# coding: utf-8
__author__ = 'Sandro Vega Pons : https://www.kaggle.com/svpons'
'''Partially based on grid_plus_classifier script:
https://www.kaggle.com/svpons/facebook-v-predicting-check-ins/grid-plus-classifier
'''
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
In [1]:
def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid cell id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'x', 'y' and 'time' columns. The frame is modified
        in place (x/y are rescaled, feature columns added) and returned.
    n_cell_x, n_cell_y : int
        Number of grid cells along each axis; coordinates live in [0, 10].

    Returns
    -------
    pandas.DataFrame
        Input frame with 'grid_cell' plus time-derived features added and
        the raw 'time' column dropped.
    """
    #Creating the grid: map coordinates are in [0, 10] on both axes
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    eps = 0.00001
    #Pull points sitting exactly on a cell border into the lower cell
    xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    #np.int was removed in NumPy 1.24; use an explicit fixed-width dtype
    pos_x = (xs / size_x).astype(np.int64)
    pos_y = (ys / size_y).astype(np.int64)
    df['grid_cell'] = pos_y * n_cell_x + pos_x

    #Feature engineering
    fw = [800, 1000, 4, 3, 1./22., 2, 10] #feature weights (black magic here)
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]
    #'time' is minutes elapsed since an arbitrary initial date. The
    #vectorized timedelta addition replaces the original per-row
    #generator (one np.timedelta64 per sample) and avoids the invalid
    #dtype= keyword that np.datetime64 no longer accepts.
    initial_date = np.datetime64('2014-01-01T01:01')
    d_times = pd.DatetimeIndex(initial_date
                               + pd.to_timedelta(df.time.values, unit='m'))
    df['hour'] = (d_times.hour + d_times.minute / 60) * fw[2]
    df['weekday'] = d_times.weekday * fw[3]
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]
    df = df.drop(['time'], axis=1)
    return df
In [2]:
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data with 'grid_cell' and 'place_id' columns.
    df_test : pandas.DataFrame
        Test data with a 'grid_cell' column.
    grid_id : int
        Id of the grid cell to process.
    th : int
        Minimum number of occurrences a place_id needs inside the cell
        to be kept as a candidate class.

    Returns
    -------
    pred_labels : numpy.ndarray
        (n_test_in_cell, 3) array with the top-3 predicted place_ids.
    row_ids : pandas.Index
        Index (row_id) of the test samples belonging to this cell.
    """
    #Working on df_train: keep only rows of this cell whose place_id
    #occurs at least `th` times within the cell
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=30, weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    #Top-3 classes by predicted probability. Modern scikit-learn requires
    #a 1d array for LabelEncoder.inverse_transform, so flatten the 2d
    #index array and restore its shape afterwards.
    top3 = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    pred_labels = le.inverse_transform(top3.ravel()).reshape(top3.shape)
    return pred_labels, row_ids
In [ ]:
def process_grid(df_train, df_test, th, n_cells):
    """
    Iterates over all grid cells, aggregates the per-cell predictions
    and writes the submission file to output/sub_knn.csv.
    """
    #One row per test sample, holding its top-3 predicted place_ids
    preds = np.zeros((df_test.shape[0], 3), dtype=int)
    for cell_id in range(n_cells):
        if cell_id % 100 == 0:
            print('iter: %s' % (cell_id))
        #Classify the test samples belonging to this cell and store
        #their top-3 predictions at the matching rows
        labels, rows = process_one_cell(df_train, df_test, cell_id, th)
        preds[rows] = labels

    print('Generating submission file ...')
    #Auxiliary dataframe holding the 3 best predictions per sample
    df_aux = pd.DataFrame(preds, dtype=str, columns=['l1', 'l2', 'l3'])
    #Space-separated concatenation of the 3 predictions per sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')
    ds_sub.name = 'place_id'
    #Writting to csv
    ds_sub.to_csv('output/sub_knn.csv', index=True, header=True,
                  index_label='row_id')
In [18]:
print('Loading data ...')
#Only the columns the model actually uses; row_id becomes the index
train_cols = ['row_id', 'x', 'y', 'accuracy', 'time', 'place_id']
test_cols = ['row_id', 'x', 'y', 'time']
df_train = pd.read_csv('../input/train.csv', usecols=train_cols, index_col=0)
df_test = pd.read_csv('../input/test.csv', usecols=test_cols, index_col=0)

#Defining the size of the grid (20 x 40 = 800 cells)
n_cell_x = 20
n_cell_y = 40

# set threshold
#df_train = df_train[df_train['accuracy'] > -1 ]
#df_train = df_train.drop('accuracy', 1)

print('Preparing train data')
df_train = prepare_data(df_train, n_cell_x, n_cell_y)
print('Preparing test data')
df_test = prepare_data(df_test, n_cell_x, n_cell_y)

#Solving classification problems inside each grid cell
th = 5 #Keeping place_ids with more than th samples.
In [105]:
#Exploratory check: display the training rows that fall in grid cell 1
df_train.loc[df_train['grid_cell']==1]
Out[105]:
In [106]:
# prepare training and testing data for cross validation
#sklearn.cross_validation was removed in scikit-learn 0.20;
#train_test_split has lived in sklearn.model_selection (same signature)
#since 0.18, so this import works on both old and new versions.
from sklearn.model_selection import train_test_split

#Cross-validation experiment restricted to a single grid cell
df_train_sample = df_train.loc[df_train['grid_cell'] == 1]
#Positional axis argument to drop() was removed in pandas 2.0
X = df_train_sample.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
y = df_train_sample['place_id'].values.astype(int)
#Hold out 20% for evaluation; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
In [112]:
#NOTE(review): this is the pre-0.18 KFold API (first argument is the
#number of samples, n_folds keyword, and the object itself is iterable).
#sklearn.cross_validation was removed in scikit-learn 0.20; porting to
#sklearn.model_selection.KFold(n_splits=5) would also require changing
#the consumer cells below to iterate over kf_total.split(X_train).
from sklearn.cross_validation import KFold
kf_total = KFold(len(X_train), n_folds=5)
In [113]:
#Single-neighbor KNN baseline for the cross-validation experiment
#(the full pipeline above uses n_neighbors=30)
clf = KNeighborsClassifier(n_neighbors=1, weights='distance', metric='manhattan')
In [114]:
#Per-fold accuracy on the 5 training folds. Kept as a comprehension so
#the cell still displays the score list as its output; the old-API
#KFold object yields (train_idx, test_idx) pairs directly.
[clf.fit(X_train[tr_idx], y_train[tr_idx])
     .score(X_train[te_idx], y_train[te_idx])
 for tr_idx, te_idx in kf_total]
Out[114]:
In [ ]:
In [50]:
#Number of samples the (old-API) KFold object was built over
kf_total.n
Out[50]:
In [55]:
In [ ]:
In [57]:
#Small slice for experimenting with KFold behaviour
test_x = X_train[:10]
In [60]:
kf_test = KFold(len(test_x), n_folds=2)
In [61]:
#Show the (train, test) index arrays produced by each fold
[(x,y) for x,y in kf_test]
Out[61]:
In [66]:
#NOTE(review): X_train was built from .values, so test_x is a numpy
#array and has no as_matrix(); DataFrame.as_matrix() was also removed
#in pandas 1.0 (use to_numpy() / np.asarray instead). This cell raises
#AttributeError as written.
test_x_matrix = test_x.as_matrix()
In [84]:
#NOTE(review): .ix was removed from pandas, and test_x is an ndarray —
#this scratch cell cannot run as-is
test_x.ix[15192880,]
Out[84]:
In [ ]:
In [ ]:
In [43]:
len(X_train)
Out[43]:
In [ ]:
In [44]:
#Scratch arithmetic (sample counts)
18635533+4658883
Out[44]:
In [29]:
#Fold sizes for each (train, test) split
[(len(train_index), len(test_index)) for train_index, test_index in kf_total]
Out[29]:
In [ ]:
len(X_train)
In [ ]:
In [ ]:
In [ ]:
#NOTE(review): `lr`, `x`, `train_indices` are never defined in this
#notebook — leftover from a different experiment; raises NameError
lr.fit(x[train_indices], y[train_indices]).score(x[test_indices],y[test_indices])
In [ ]:
In [ ]:
In [ ]:
In [ ]:
#Run the full pipeline over every grid cell and write the submission
process_grid(df_train, df_test, th, n_cell_x*n_cell_y)