In [2]:
import math
import os
import pandas as pd
import numpy as np
import time
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
# For checking progress
from tqdm import tqdm_notebook, tnrange
# For plotting
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # For 3D scatter
import matplotlib.cm as cm # For colored labels
import seaborn as sns
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 16
In [3]:
# Reading data
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
In [5]:
# Count the occurrences of place_id
place_ranking = df_train.place_id.value_counts()
In [4]:
# Basic statistics of data
print 'Size of training data: {}'.format(df_train.shape)
print 'Size of testing data: {}'.format(df_test.shape)
In [5]:
print "Description of training data: \n"
print df_train.describe()
In [6]:
print "Description of testing data: \n"
print df_test.describe()
del df_test
In [7]:
sns.jointplot(data=df_train.sample(frac=0.01), x='x',y='y',kind='hex', stat_func=None,
xlim=(0,10),ylim=(0,10));
plt.title('Distribution of 1% random samples',x=-2.5,y=1.2,fontsize=18);
In [8]:
data = df_train[df_train.place_id==place_ranking.index[0]]
data.plot(kind='scatter', x='x', y='y')
plt.title('Distribution at 1st popular place',fontsize=18)
plt.xlabel('X')
plt.ylabel('Y')
print 'Standard deviation of x: {}'.format(data.x.std())
print 'Standard deviation of y: {}'.format(data.y.std())
del data
In [9]:
data = df_train[df_train.place_id==place_ranking.index[1]]
data.plot(kind='scatter', x='x', y='y')
plt.title('Distribution at 2nd popular place',fontsize=18)
plt.xlabel('X')
plt.ylabel('Y')
print 'Standard deviation of x: {}'.format(data.x.std())
print 'Standard deviation of y: {}'.format(data.y.std())
del data
In [10]:
data = df_train[df_train.place_id==place_ranking.index[2]]
data.plot(kind='scatter', x='x', y='y')
plt.title('Distribution at 3rd popular place',fontsize=18)
plt.xlabel('X')
plt.ylabel('Y')
print 'Standard deviation of x: {}'.format(data.x.std())
print 'Standard deviation of y: {}'.format(data.y.std())
del data
In [11]:
_, AX = plt.subplots(nrows=2, sharey=True)
AX[0].set_title('Histogram of accuracy')
AX[0].set_xlabel('Accuracy')
AX[0].set_ylabel('Frequency')
df_train['accuracy'].hist(bins=100,ax=AX[0])
AX[1].set_title('Histogram of accuracy under 200')
AX[1].set_xlim((0,200))
AX[1].set_xlabel('Accuracy')
AX[1].set_ylabel('Frequency')
df_train.accuracy.hist(bins=100,ax=AX[1])
plt.tight_layout(h_pad=1.5)
In [12]:
unique_id = df_train.place_id.unique()
print "Number of unique place id: {}, roughly {:.3f} % of traing data.".format(len(unique_id),
len(unique_id) * 100.0 / df_train.shape[0])
del unique_id
In [13]:
plt.xlabel('Frequency')
plt.ylabel('Place id')
place_ranking.head(5).plot.barh(xlim=(0,2000)).invert_yaxis()
plt.title('Top 5 most popular place id', fontsize=18);
In [14]:
plt.xlabel('Frequency')
plt.ylabel('Place id')
place_ranking.tail(5).plot.barh(xlim=(0,5))
plt.title('Bottom 5 least popular place id', fontsize=18);
In [15]:
# 1st popular place : 8772469670
time = df_train[df_train.place_id==place_ranking.index[0]].time
time = time % (24*60*7) # Convert to minute-of-week (time is in minutes)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.title('Weekly cycle of 1st popular place', fontsize=18)
time.hist(bins=100);
In [16]:
# 2nd popular place : 1623394281
time = df_train[df_train.place_id==place_ranking.index[1]].time
time = time % (24*60*7) # Convert to minute-of-week (time is in minutes)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.title('Weekly cycle of 2nd popular place', fontsize=18)
time.hist(bins=100);
In [17]:
# 3rd popular place : 1308450003
time = df_train[df_train.place_id==place_ranking.index[2]].time
time = time % (24*60*7) # Convert to minute-of-week (time is in minutes)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.title('Weekly cycle of 3rd popular place', fontsize=18)
time.hist(bins=100);
In [18]:
# 1st popular place : 8772469670
time = df_train[df_train.place_id==place_ranking.index[0]].time
# Histogram of time
hist = np.histogram(time,5000)
# FFT of Histogram
hist_fft = np.absolute(np.fft.fft(hist[0]))
plt.plot(hist_fft)
plt.xlim([0,1000])
plt.title('FFT of Histogram at 1st place', fontsize=18)
plt.xlabel('Frequency')
print "Period 1: {}, close to 10080 minutes a week.".format(time.max() / (hist_fft[2:200].argmax()+2.0))
print "Period 2: {}, close to 1440 minutes a day.".format(time.max() / (hist_fft[400:600].argmax()+400.0))
In [19]:
# 2nd popular place : 1623394281
time = df_train[df_train.place_id==place_ranking.index[1]].time
# Histogram of time
hist = np.histogram(time,5000)
# FFT of Histogram
hist_fft = np.absolute(np.fft.fft(hist[0]))
plt.plot(hist_fft)
plt.xlim([0,1000])
plt.title('FFT of Histogram at 2nd place', fontsize=18)
plt.xlabel('Frequency')
print "Period 1: {}, close to 10080 minutes a week.".format(time.max() / (hist_fft[2:200].argmax()+2.0))
print "Period 2: {}, close to 1440 minutes a day.".format(time.max() / (hist_fft[400:600].argmax()+400.0))
In [20]:
# 3rd popular place : 1308450003
time = df_train[df_train.place_id==place_ranking.index[2]].time
# Histogram of time
hist = np.histogram(time,5000)
# FFT of Histogram
hist_fft = np.absolute(np.fft.fft(hist[0]))
plt.plot(hist_fft)
plt.xlim([0,1000])
plt.title('FFT of Histogram at 3rd place', fontsize=18)
plt.xlabel('Frequency')
print "Period 1: {}, close to 10080 minutes a week.".format(time.max() / (hist_fft[2:200].argmax()+2.0))
print "Period 2: {}, close to 1440 minutes a day.".format(time.max() / (hist_fft[300:500].argmax()+300.0))
del time
In [21]:
T = df_train.time.max() - df_train.time.min()
T = pd.Timedelta(minutes=T)
print "Time period of collecting data is {} days {} hours {} minutes".format(T.days, T.seconds/3600, T.seconds/60%60)
del T
In [22]:
# Histogram of hour and minute at the 1st popular place
place = place_ranking.index[0]
_, AX = plt.subplots(ncols=2, nrows=2, figsize=(15,6))
data = df_train[df_train.place_id==place]
hour = ((data.time/60)%24).astype(int)
hour.hist(bins=100,ax=AX[0][0])
AX[0][0].set_title('Histogram of Hour at place {}'.format(place), fontsize=18)
AX[0][0].set_xlabel('Hour')
AX[0][0].set_ylabel('Frequency')
weekday = ((data.time/(60*24))%7).astype(int)
weekday.hist(bins=100,ax=AX[0][1])
AX[0][1].set_title('Histogram of Weekday at place {}'.format(place), fontsize=18)
AX[0][1].set_xlabel('Weekday')
minute = (data.time%60).astype(int)
minute.hist(bins=100,ax=AX[1][0])
AX[1][0].set_title('Histogram of Minute at place {}'.format(place), fontsize=18)
AX[1][0].set_xlabel('Minute')
month = (data.time/(60*24*30)%12).astype(int)
month.hist(bins=100,ax=AX[1][1])
AX[1][1].set_title('Histogram of Month at place {}'.format(place), fontsize=18)
AX[1][1].set_xlabel('Month')
plt.tight_layout(h_pad=1.5)
In [23]:
# Histogram of hour and minute at the 2nd popular place
place = place_ranking.index[1]
_, AX = plt.subplots(ncols=2, nrows=2, figsize=(15,6))
data = df_train[df_train.place_id==place]
hour = ((data.time/60)%24).astype(int)
hour.hist(bins=100,ax=AX[0][0])
AX[0][0].set_title('Histogram of Hour at place {}'.format(place), fontsize=18)
AX[0][0].set_xlabel('Hour')
AX[0][0].set_ylabel('Frequency')
weekday = ((data.time/(60*24))%7).astype(int)
weekday.hist(bins=100,ax=AX[0][1])
AX[0][1].set_title('Histogram of Weekday at place {}'.format(place), fontsize=18)
AX[0][1].set_xlabel('Weekday')
minute = (data.time%60).astype(int)
minute.hist(bins=100,ax=AX[1][0])
AX[1][0].set_title('Histogram of Minute at place {}'.format(place), fontsize=18)
AX[1][0].set_xlabel('Minute')
month = (data.time/(60*24*30)%12).astype(int)
month.hist(bins=100,ax=AX[1][1])
AX[1][1].set_title('Histogram of Month at place {}'.format(place), fontsize=18)
AX[1][1].set_xlabel('Month')
plt.tight_layout(h_pad=1.5)
del data,place
In [24]:
_, AX = plt.subplots(ncols=2, figsize=(15,6))
df_train.accuracy.hist(bins=100, ax=AX[0])
AX[0].set_title('Histogram of Accuracy', fontsize=18)
AX[0].set_xlabel('Accuracy')
AX[0].set_ylabel('Frequency')
np.log10(df_train.accuracy).hist(bins=100, ax=AX[1])
AX[1].set_title('Histogram of Log of Accuracy', fontsize=18)
AX[1].set_xlabel('Log of Accuracy')
print "CV of accuracy: {:.3f}".format(np.std(df_train.accuracy) / np.mean(df_train.accuracy))
print "CV of log of accuracy: {:.3f}".format(np.std(np.log10(df_train.accuracy)) / np.mean(np.log10(df_train.accuracy)))
In [25]:
for i, order in zip(range(3), ['1st', '2nd', '3rd']):
place = place_ranking.index[i]
data = df_train[df_train.place_id==place]
print "The {} place:".format(order)
print "CV of accuracy: {:.3f}".format(np.std(data.accuracy) / np.mean(data.accuracy))
print "CV of log of accuracy: {:.3f}".format(np.std(np.log10(data.accuracy)) / np.mean(np.log10(data.accuracy)))
print "-"*20
In [26]:
small_grid = df_train[(df_train.x<0.1)&(df_train.y<0.1)].copy()
# Mapping each place id with one color
color = dict(zip(small_grid.place_id.unique(), cm.rainbow(np.linspace(0,1,small_grid.place_id.unique().shape[0]))))
In [27]:
f, ax = plt.subplots()
for place, group in small_grid.groupby('place_id'):
group.plot(ax=ax, kind='scatter', x='x', y='y', color=color[place])
ax.set_title('Check-ins colored by place_id', fontsize=18);
From the plot above, the clusters become easier to separate at specific hours, because check-ins at some places concentrate around particular hours of the day.
In [29]:
# Convert time (in minutes) into hour of day
small_grid.loc[:,'hour'] = ((small_grid.time /60)%24).astype(int)
In [30]:
ax = plt.figure(figsize=(15,8)).gca(projection='3d')
for place, group in small_grid.groupby('place_id'):
ax.scatter(group.x,group.y,group.hour,c=color[place])
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Hour')
ax.set_title('3D Scatter of x, y and hour', fontsize=18);
The histograms below show that check-ins at these two places are nearly complementary in time: one is popular in the morning and the other at night. This supports the idea of adding time as an extra dimension.
In [31]:
print 'Top 3 popular places in the small grid'
for order, place in zip(['1st','2nd','3rd'], small_grid.place_id.value_counts().index[:3]):
print '{} place: {}'.format(order, place)
In [32]:
f, (ax1,ax2) = plt.subplots(nrows=2, sharey=True,figsize=(15,10))
small_grid[small_grid.place_id==1006316884].hour.hist(bins=100,ax=ax1)
ax1.set_title('Histogram of check-ins at the 1st popular place', fontsize=18)
ax1.set_xlabel('Hour')
ax1.set_ylabel('Frequency')
small_grid[small_grid.place_id==8378301865].hour.hist(bins=100,ax=ax2)
ax2.set_title('Histogram of check-ins at the 3rd popular place', fontsize=18)
ax2.set_xlabel('Hour')
ax2.set_ylabel('Frequency')
del small_grid
For this supervised classification problem, the following are the algorithms I will try: k-nearest neighbors (KNN), Gaussian naive Bayes (GNB), kernel density estimation (KDE), and an ensemble of the three.
In [33]:
data = df_train[df_train.place_id == place_ranking.index[0]]
f, AX = plt.subplots(nrows=2,ncols=2,figsize=(10,8))
# Histogram of X
data.x.plot(kind='hist',bins=50,ax=AX[0][0])
AX[0][0].set_xlabel('X')
AX[0][0].set_title('Histogram of X')
# Histogram of Y
data.y.plot(kind='hist',bins=50,ax=AX[0][1])
AX[0][1].set_xlabel('Y')
AX[0][1].set_title('Histogram of Y')
# Histogram of Log of Accuracy
np.log10(data.accuracy).plot(kind='hist',bins=50,ax=AX[1][0])
AX[1][0].set_xlabel('Log of Accuracy')
AX[1][0].set_title('Histogram of Log of Accuracy')
# Histogram of Hour
((data.time/60) % 24).astype(int).plot(kind='hist',bins=50,ax=AX[1][1])
AX[1][1].set_xlabel('Hour')
AX[1][1].set_title('Histogram of Hour')
plt.tight_layout(h_pad=1.5)
del data
I use the most upvoted and forked methods on the Kaggle forums as my benchmark.
In [7]:
def add_time(df):
# Set the initial date
initial_date = np.datetime64('2014-01-01T01:01', dtype='datetime64[m]')
# Convert the raw minute counter into timestamps
time = pd.DatetimeIndex(df['time'] + initial_date)
df['minute'] = time.minute
# Hour of day, with minutes as a fraction
df['hour'] = time.hour + time.minute / 60.0
df['hour_sin'] = np.sin(2*np.pi*df.hour/24)
df['hour_cos'] = np.cos(2*np.pi*df.hour/24)
# Day of week, with hours as a fraction
df['weekday'] = time.weekday + time.hour / 24.0
df['weekday_sin'] = np.sin(2*np.pi*df.weekday/7)
df['weekday_cos'] = np.cos(2*np.pi*df.weekday/7)
df['day'] = time.dayofyear
df['month'] = time.month - 1
df['month_sin'] = np.sin(2*np.pi*(df.month-1)/7)
df['month_cos'] = np.cos(2*np.pi*(df.month-1)/7)
# Years since 2013, with months as a fraction
df['year'] = time.year - 2013 + time.month / 12.0
return df
df_train = add_time(df_train)
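The sin/cos pairs above are a cyclic encoding: mapping an hour h to (sin(2*pi*h/24), cos(2*pi*h/24)) puts 23:00 and 01:00 close together, which the raw hour value does not. A minimal sketch with two hypothetical hours, not taken from the data:
In [ ]:
# Cyclic encoding sketch: 23:00 and 01:00 are 22 apart as raw hours but close on the circle
h1, h2 = 23.0, 1.0
p1 = np.array([np.sin(2*np.pi*h1/24), np.cos(2*np.pi*h1/24)])
p2 = np.array([np.sin(2*np.pi*h2/24), np.cos(2*np.pi*h2/24)])
print('raw-hour distance: {:.2f}'.format(abs(h1 - h2)))
print('encoded distance:  {:.2f}'.format(np.linalg.norm(p1 - p2)))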
In [35]:
def add_grid_cell(df, n_cell_x, n_cell_y):
eps = 0.00001
size_x = 10.0 / n_cell_x
size_y = 10.0 / n_cell_y
pos_x = (np.where(df['x'].values < eps, 0, df['x'].values - eps) / size_x).astype(int)
pos_y = (np.where(df['y'].values < eps, 0, df['y'].values - eps) / size_y).astype(int)
df['grid_cell'] = pos_y * n_cell_x + pos_x
return df
# Adding grid id
n_cell_x = 30 # The number of cells on axis-x.
n_cell_y = 90 # The number of cells on axis-y.
x_offset = 0.3 # The portion of augmentation along the x-axis.
y_offset = 0.1 # The portion of augmentation along the y-axis.
n_cells = n_cell_x * n_cell_y # Total number of cells
df_train = add_grid_cell(df_train, n_cell_x, n_cell_y)
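To make the indexing concrete, here is a small check with an arbitrary point (a sketch, not data from the competition): with size_x = 10/30 and size_y = 10/90, a check-in at (x=5.0, y=2.5) falls into column 14 and row 22, i.e. grid_cell = 22 * 30 + 14 = 674.
In [ ]:
# Sketch: verify the grid indexing for an arbitrary point
probe = add_grid_cell(pd.DataFrame({'x': [5.0], 'y': [2.5]}), n_cell_x, n_cell_y)
print(probe.grid_cell.values[0])  # expected: 22 * 30 + 14 = 674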
In [36]:
def log_acc(df):
df['accuracy'] = pd.Series(np.log10(df.accuracy.values), index=df.index)
return df
df_train = log_acc(df_train)
Because generating a full submission file to validate each improvement takes too long, I will instead use accuracy on a held-out time slice as the metric, for simplicity.
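For reference, the leaderboard metric of this competition is mean average precision at 3 (MAP@3): the true place earns 1/rank if it appears in the top-3 prediction and 0 otherwise. Below is a minimal sketch of that metric; the names y_preds3 and y_true are placeholders for a list of top-3 predictions per check-in and the corresponding true place_ids, not variables defined elsewhere in this notebook.
In [ ]:
# Minimal MAP@3 sketch (placeholder names, see note above)
def map_at_3(y_preds3, y_true):
    score = 0.0
    for preds, truth in zip(y_preds3, y_true):
        for rank, p in enumerate(preds[:3]):
            if p == truth:
                score += 1.0 / (rank + 1)
                break
    return score / len(y_true)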
In [37]:
def calculate_accuracy(y_preds, y_tests):
result = [] # The number of correctly predicted labels in each cell
total = 0 # The total number of predictions
for y_pred, y_test in zip(y_preds, y_tests):
result.append(np.sum(y_pred == y_test))
total += y_test.shape[0]
return np.sum(result) * 100.0 / total
In [38]:
# Given cell_id, this function splits the data of that cell into training and validation sets.
def data_split(df, cell_id, x_offset, y_offset):
# Select the specific cell with cell_id
df_train_cell = df.loc[df.grid_cell == cell_id]
# Use the 0.75 quantile of time as the split point
time = df_train_cell.time.quantile(0.75)
# Check-ins after the split time form the validation set
X_valid = df_train_cell[df_train_cell.time > time]
# Extend the training area slightly beyond the borders of the selected cell
x_offset *= 1.0/n_cell_x
y_offset *= 1.0/n_cell_y
x_min = df_train_cell.x.min()
x_max = df_train_cell.x.max()
y_min = df_train_cell.y.min()
y_max = df_train_cell.y.max()
X_train = df.loc[(df.x > x_min-x_offset)&(df.x < x_max+x_offset)&
(df.y > y_min-y_offset)&(df.y < y_max+y_offset)&
(df.time < time)]
# Keep only check-ins whose place_id count is at least the threshold (the 0.1 quantile of counts)
place_counts = X_train.place_id.value_counts()
th = place_counts.quantile(0.1)
mask = (place_counts[X_train.place_id] >= th).values
X_train = X_train.loc[mask]
y_train = X_train.place_id
y_valid = X_valid.place_id
X_train = X_train.drop(['row_id', 'place_id'], axis=1)
X_valid = X_valid.drop(['row_id', 'place_id'], axis=1)
return X_train, X_valid, y_train, y_valid
In [39]:
def time_decay(t, time_end):
alpha = 50
return np.sum(np.exp(alpha * (t.values - time_end) / time_end))
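time_decay weights every check-in of a place by exp(alpha * (t - time_end) / time_end) with alpha = 50: a check-in at the very end of the training window contributes 1, older check-ins decay exponentially, and the sum over a place's check-ins is a recency-weighted popularity. A tiny sketch of the weight curve, using hypothetical fractions of the time range:
In [ ]:
# Sketch of the decay weights at a few fractions of the time range (alpha = 50)
for frac in [1.0, 0.99, 0.9, 0.5]:
    print('t at {:.0%} of the range -> weight {:.4f}'.format(frac, np.exp(50.0 * (frac - 1.0))))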
In [40]:
def build_pr_place_table(df, x_offset, y_offset):
pr_place_table = pd.DataFrame(index=df.place_id.unique())
dpath = './pr_place_table_valid'
if not os.path.isdir(dpath):
os.makedirs(dpath)
for cell_id in tnrange(n_cell_x*n_cell_y):
fname = dpath + "/{}.csv".format(cell_id/100)
if not os.path.isfile(fname):
X_train, X_test, y_train, y_test = data_split(df, cell_id, x_offset, y_offset)
X_train["place_id"] = y_train
# Calculating the probability of each place
time_end = X_train.time.max()
groups = X_train.groupby('place_id')
pr_place = groups.time.apply(time_decay, time_end=time_end)
pr_place = pr_place / X_train.shape[0]
pr_place = pr_place.apply(lambda p: p**0.15)
pr_place_table[cell_id] = pr_place
# Grouped 100 cells into one dataframe
if cell_id % 100 == 99:
pr_place_table.fillna(0, inplace=True)
pr_place_table.to_csv(fname)
pr_place_table = pd.DataFrame(index=df.place_id.unique())
print "Successfully creat table!"
build_pr_place_table(df_train, x_offset, y_offset)
In [41]:
# Given a dataframe, this function returns a copy with each feature column multiplied by its weight.
def feature_engineering_KNN(df, fw):
DF = df.copy()
for col in fw:
DF[col] = DF[col]*fw[col]
return DF
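Multiplying each column by a weight before fitting a distance-based classifier is equivalent to using a weighted distance: with the Manhattan metric used in the later runs, d(a, b) = sum_i w_i * |a_i - b_i| (and analogously for the default Euclidean metric), so heavily weighted features such as y (1200) and x (600) dominate the choice of neighbours while day (0.01) barely matters.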
In [42]:
# Randomly select cells for validation
n_validate_cells = 40 # Number of cells for validating
cell_ids = np.random.choice(n_cells, n_validate_cells)
In [43]:
from sklearn.neighbors import KNeighborsClassifier
# Given cell_id, this function trains a KNN classifier on that cell and returns the predicted and validation labels.
# During training, places whose number of occurrences is below the threshold are not considered.
def process_one_cell_KNN(df_train, cell_id, x_offset, y_offset,
cols=None, fw=None, params={}):
# Split data by time
X_train, X_valid, y_train, y_valid = data_split(df_train, cell_id, x_offset, y_offset)
# Feature Engineering
if fw:
X_train = feature_engineering_KNN(X_train, fw)
X_valid = feature_engineering_KNN(X_valid, fw)
# Converting into np array
if cols:
X_train = X_train[cols].values
X_valid = X_valid[cols].values
else:
X_train = X_train.values
X_valid = X_valid.values
y_train = y_train.values
y_valid = y_valid.values
# Creating KNN classifier
clf = KNeighborsClassifier(**params)
clf.fit(X_train, y_train)
y_prob = pd.DataFrame(clf.predict_proba(X_valid), columns=clf.classes_)
y_pred3 = [] # Predict top 3 place_ids
y_pred1 = [] # Predict top 1 place_ids
for _, row in y_prob.iterrows():
data = row.sort_values(ascending=False).index[:3]
y_pred3.append(data)
y_pred1.append(data[:1])
return y_pred3, np.array(y_pred1).reshape(1,-1)[0], y_valid
In [44]:
y_preds = []
y_valids = []
# Randomly select cells for validating
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
_, y_pred, y_valid = process_one_cell_KNN(df_train, cell_id, x_offset, y_offset)
y_preds.append(y_pred)
y_valids.append(y_valid)
print "Accuracy of KNN in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(y_preds, y_valids))
In [45]:
from sklearn.naive_bayes import GaussianNB
# Given cell_id, this function trains a GNB classifier on that cell and returns the predicted and validation labels.
# During training, places whose number of occurrences is below the threshold are not considered.
def process_one_cell_GNB(df_train, cell_id, x_offset, y_offset, cols=None):
# Split data by time
X_train, X_valid, y_train, y_valid = data_split(df_train, cell_id, x_offset, y_offset)
# Converting into np array
if cols:
X_train = X_train[cols].values
X_valid = X_valid[cols].values
else:
X_train = X_train.values
X_valid = X_valid.values
y_train = y_train.values
y_valid = y_valid.values
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_valid)
return y_pred, y_valid
In [46]:
y_preds = []
y_valids = []
# Randomly select cells for validating
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
y_pred, y_valid = process_one_cell_GNB(df_train, cell_id, x_offset, y_offset)
y_preds.append(y_pred)
y_valids.append(y_valid)
print "Accuracy of GNB in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(y_preds, y_valids))
In [47]:
y_preds = []
y_valids = []
# Weight of features
fw = {'x':600, 'y':1200, 'hour':10, 'weekday':10, 'day':0.01, 'accuracy': 30, 'month':2, 'year': 15}
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
_, y_pred, y_valid = process_one_cell_KNN(df_train, cell_id, x_offset, y_offset, cols=fw.keys(), fw=fw)
y_preds.append(y_pred)
y_valids.append(y_valid)
print "Accuracy of KNN in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(y_preds, y_valids))
In [48]:
y_preds = []
y_valids = []
fw = {'x':600, 'y':1200, 'hour':10, 'weekday':10, 'day':0.01, 'accuracy': 30, 'month':2, 'year': 15}
params = {'n_neighbors':25, 'weights':'distance', 'metric':'manhattan'}
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
_, y_pred, y_valid = process_one_cell_KNN(df_train, cell_id, x_offset, y_offset,
cols=fw.keys(), fw=fw, params=params)
y_preds.append(y_pred)
y_valids.append(y_valid)
print "Accuracy of KNN in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(y_preds, y_valids))
In [49]:
KNN_preds = []
y_preds = []
y_valids = []
fw = {'x':600, 'y':1200, 'hour_sin':15, 'hour_cos':15, 'weekday_sin':10, 'weekday_cos':10,
'month_sin':2, 'month_cos':2, 'day':0.01, 'accuracy': 30, 'year': 15}
params = {'n_neighbors':25, 'weights':'distance', 'metric':'manhattan'}
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
y_pred3, y_pred1, y_valid = process_one_cell_KNN(df_train, cell_id, x_offset, y_offset,
cols=fw.keys(), fw=fw, params=params)
KNN_preds.append(y_pred3)
y_preds.append(y_pred1)
y_valids.append(y_valid)
print "Accuracy of KNN in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(y_preds, y_valids))
In [50]:
GNB_preds = []
y_valids = []
# Features considered in GNB model
gnb_cols = ['x', 'y', 'hour_sin', 'hour_cos', 'weekday', 'accuracy']
# Randomly select cells for validating
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
y_pred, y_valid = process_one_cell_GNB(df_train, cell_id, x_offset, y_offset, cols=gnb_cols)
GNB_preds.append(y_pred)
y_valids.append(y_valid)
print "Accuracy of GNB in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(GNB_preds, y_valids))
In [51]:
def process_one_cell_GNB(df_train, cell_id, x_offset, y_offset, cols=None):
# Split data by time
X_train, X_valid, y_train, y_valid = data_split(df_train, cell_id, x_offset, y_offset)
X_train["place_id"] = y_train
# Calculating the probability of each place in cell
place_prob = X_train.place_id.value_counts() / X_train.shape[0]
place_prob = place_prob.apply(lambda p: p**0.3)
# Converting into np array
if cols:
X_train = X_train[cols].values
X_valid = X_valid[cols].values
else:
X_train = X_train.values
X_valid = X_valid.values
y_train = y_train.values
y_valid = y_valid.values
clf = GaussianNB()
clf.fit(X_train, y_train)
# Calculating the final probabilities of each row
y_prob = pd.DataFrame(clf.predict_proba(X_valid), columns=clf.classes_)
y_prob = y_prob.apply(lambda c: place_prob[c.index]*c.values, axis=1)
# Sorting the final probabilities of each row
y_pred = []
for _, row in y_prob.iterrows():
y_pred.append(row.sort_values(ascending=False).index[:1])
return np.array(y_pred).reshape(1,-1)[0], y_valid
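The modification above reweights the GaussianNB posterior with a tempered class prior, following the intuition of Bayes' rule, P(place | x) proportional to P(x | place) * P(place): multiplying predict_proba by place_prob ** 0.3 boosts popular places in the cell, while the small exponent keeps the prior from overwhelming the per-check-in likelihood.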
In [52]:
GNB_preds = []
y_valids = []
# Features considered in GNB model
gnb_cols = ['x', 'y', 'hour_sin', 'hour_cos', 'weekday', 'accuracy']
# Randomly select cells for validating
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
y_pred, y_valid = process_one_cell_GNB(df_train, cell_id, x_offset, y_offset, cols=gnb_cols)
GNB_preds.append(y_pred)
y_valids.append(y_valid)
print "Accuracy of GNB in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(GNB_preds, y_valids))
In [53]:
def process_one_cell_GNB(df_train, cell_id, x_offset, y_offset, cols=None):
# Split data by time
X_train, X_valid, y_train, y_valid = data_split(df_train, cell_id, x_offset, y_offset)
pr_place_table = pd.read_csv('./pr_place_table_valid/{}.csv'.format(cell_id/100), index_col=0)
# Converting into np array
if cols:
X_train = X_train[cols].values
X_valid = X_valid[cols].values
else:
X_train = X_train.values
X_valid = X_valid.values
y_train = y_train.values
y_valid = y_valid.values
clf = GaussianNB()
clf.fit(X_train, y_train)
# Calculating the final probabilities of each row
y_prob = pd.DataFrame(clf.predict_proba(X_valid), columns=clf.classes_)
y_prob = y_prob.apply(lambda c: pr_place_table[str(cell_id)][c.index]*c.values, axis=1)
# Sorting the final probabilities of each row
y_pred3 = [] # Predict top 3 place_ids
y_pred1 = [] # Predict top 1 place_ids
for _, row in y_prob.iterrows():
data = row.sort_values(ascending=False).index[:3]
y_pred3.append(data)
y_pred1.append(data[:1])
return y_pred3, np.array(y_pred1).reshape(1,-1)[0], y_valid
In [54]:
GNB_preds = []
y_preds = []
y_valids = []
# Features considered in GNB model
gnb_cols = ['x', 'y', 'hour_sin', 'hour_cos', 'weekday', 'accuracy']
# Randomly select cells for validating
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
y_pred3, y_pred1, y_valid = process_one_cell_GNB(df_train, cell_id, x_offset, y_offset, cols=gnb_cols)
GNB_preds.append(y_pred3)
y_preds.append(y_pred1)
y_valids.append(y_valid)
print "Accuracy of GNB in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(y_preds, y_valids))
In [55]:
def scott_rule(group, col):
# Scott's Rule: n**(-1./(d+4)), n: number of data points, d: number of dimensions
if math.isnan(group[col].std()):
bw = group.shape[0] ** (-0.3)
else:
bw = (group.shape[0] ** (-0.3)) * (group[col].std()+0.01)
return bw
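For reference, Scott's rule gives a bandwidth h = n**(-1/(d+4)) * sigma, which for a single feature (d = 1) is roughly n**(-0.2) * sigma; the exponent -0.3 used here therefore yields a somewhat narrower bandwidth than the textbook rule, and the 0.01 offset keeps the bandwidth strictly positive when a place has nearly constant feature values or only one check-in.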
In [56]:
from sklearn.neighbors import KernelDensity
def build_KDE(X_train, X_valid, cols, place_ids, cell_id, path):
kde_table = {}
dpath = path + '/{}'.format(cell_id)
groups = X_train.groupby('place_id')
for col in cols:
fpath = path + '/{}/{}.csv'.format(cell_id, col)
if not os.path.isfile(fpath):
samples = X_valid[col].unique()
df = pd.DataFrame(index=place_ids, columns=['{:.5f}'.format(v) for v in samples])
bw = groups.apply(scott_rule, col=col)
for place, group in groups:
# Creating KDE
kde = KernelDensity(kernel='gaussian', bandwidth=bw[place])
kde.fit(group[col][:, np.newaxis])
# Storing KDE for each place
df.loc[place] = np.exp(kde.score_samples(samples[:,np.newaxis]))
if not os.path.isdir(dpath):
os.makedirs(dpath)
df.to_csv(fpath)
kde_table[col] = pd.read_csv(fpath, index_col=0)
return kde_table
In [57]:
def process_one_cell_KDE(df_train, cell_id, cols, x_offset, y_offset):
X_train, X_valid, y_train, y_valid = data_split(df_train, cell_id, x_offset, y_offset)
X_train['place_id'] = y_train
# Unique place ids in the cell
place_ids = X_train.place_id.unique()
# Building KDE of columns, kde_cols, for each place_id
kde_table = {}
groups = X_train.groupby('place_id')
for col in cols:
table = {}
bw = groups.apply(scott_rule, col=col)
for place, group in groups:
kde = KernelDensity(kernel='gaussian', bandwidth=bw[place])
kde.fit(group[col][:, np.newaxis])
table[place] = kde
kde_table[col] = table
# Calculating the probability of each place
pr_place_table = pd.read_csv('./pr_place_table_valid/{}.csv'.format(cell_id/100), index_col=0)
# Predicting
prob = pd.DataFrame(np.ones((place_ids.shape[0], X_valid.shape[0])), columns=X_valid.index, index=place_ids)
for col in cols:
samples = X_valid[col].values[:,np.newaxis]
for place in prob.index:
kde = kde_table[col][place]
temp = np.exp(kde.score_samples(samples))
prob.loc[place] = prob.loc[place].values * temp
probs = prob.apply(lambda p: (p.values ** 0.2) * pr_place_table[str(cell_id)][p.index])
probs = probs.transpose()
y_pred = []
for _, row in probs.iterrows():
y_pred.append(row.sort_values(ascending=False).index[:3])
return y_pred, probs.idxmax(1).values, y_valid
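Putting the pieces together, this KDE model is a naive-Bayes-style classifier with non-parametric class-conditional densities: for each candidate place, P(place | x) is taken proportional to (prod_f KDE_place,f(x_f)) ** 0.2 * Pr(place), where the product runs over the kde_cols features, the 0.2 exponent tempers the feature-independence assumption, and Pr(place) is the time-decayed popularity read from pr_place_table.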
In [58]:
KDE_preds = []
y_preds = []
y_valids = []
# Features considered in KDE model
kde_cols = ['x', 'y', 'hour', 'weekday', 'accuracy']
for cell_id in tqdm_notebook(cell_ids, total=n_validate_cells):
y_pred3, y_pred1, y_valid = process_one_cell_KDE(df_train, cell_id, kde_cols, x_offset, y_offset)
KDE_preds.append(y_pred3)
y_preds.append(y_pred1)
y_valids.append(y_valid)
print "Accuracy of KDE in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(y_preds, y_valids))
In [59]:
from collections import Counter
weights = [68, 71, 19]
preds = [KDE_preds, KNN_preds, GNB_preds]
ensemble_preds = []
for cell in tnrange(n_validate_cells):
result_cell = []
for row in range(y_valids[cell].shape[0]):
result_row = {}
for i, model in enumerate(preds):
weight = weights[i]
for j, pred in enumerate(model[cell][row]):
if pred in result_row:
curr = result_row[pred]
else:
curr = 0
curr += weight/(j+1.0)
result_row[pred] = curr
result_cell.append(Counter(result_row).most_common(1)[0][0])
ensemble_preds.append(np.array(result_cell))
print "Accuracy of Ensemble in {} cells: {:.4f}".format(n_validate_cells, calculate_accuracy(ensemble_preds, y_valids))
In [60]:
# This function will write the dataframe, df_aux, into a csv for submitting on Kaggle.
def generate_submission(df_aux, fname):
print 'Generating submission file ...'
ds_sub = df_aux['place1'].str.cat([df_aux['place2'], df_aux['place3']], sep=' ')
ds_sub.name = 'place_id'
ds_sub.to_csv(fname, index=True, header=True, index_label='row_id')
In [61]:
def select_cell(df, cell_id, x_offset, y_offset):
# Select the specific cell with cell_id
df_train_cell = df.loc[df.grid_cell == cell_id]
x_offset *= 1.0/n_cell_x
y_offset *= 1.0/n_cell_y
x_min = df_train_cell.x.min()
x_max = df_train_cell.x.max()
y_min = df_train_cell.y.min()
y_max = df_train_cell.y.max()
X_train = df.loc[(df.x > x_min-x_offset)&(df.x < x_max+x_offset)&
(df.y > y_min-y_offset)&(df.y < y_max+y_offset)]
place_counts = X_train.place_id.value_counts()
th = place_counts.quantile(0.1)
mask = (place_counts[X_train.place_id] >= th).values
X_train = X_train.loc[mask]
return X_train
In [62]:
def build_pr_place_table_sub(place_ids):
pr_place_table = pd.DataFrame(index=place_ids)
dpath = './pr_place_table'
if not os.path.isdir(dpath):
os.makedirs(dpath)
for cell_id in tnrange(n_cells):
fname = dpath + "/{}.csv".format(cell_id/100)
if not os.path.isfile(fname):
X_train = pd.read_csv('./X_train/{}.csv'.format(cell_id), index_col=0)
# Calculating the probability of each place
time_end = X_train.time.max()
groups = X_train.groupby('place_id')
pr_place = groups.time.apply(time_decay, time_end=time_end)
pr_place = pr_place / X_train.shape[0]
pr_place = pr_place.apply(lambda p: p**0.15)
pr_place_table[cell_id] = pr_place
# Grouped 100 cells into one dataframe
if cell_id % 100 == 99:
pr_place_table.fillna(0, inplace=True)
pr_place_table.to_csv(fname)
pr_place_table = pd.DataFrame(index=place_ids)
print "Successfully created table!"
In [63]:
# Modified version of process_one_cell_KNN used to generate the submission file.
def one_cell_KNN_sub(cell_id, fw, params):
# Select the specific cell with cell_id for testing
X_test = pd.read_csv('./X_test/{}.csv'.format(cell_id), index_col=0)
row_ids = X_test.index
# Select the specific cell with cell_id for training
X_train = pd.read_csv('./X_train/{}.csv'.format(cell_id), index_col=0)
# Feature Engineering
X_train = feature_engineering_KNN(X_train, fw)
X_test = feature_engineering_KNN(X_test, fw)
# Mapping place ids into values between 0 and n_classes-1
le = LabelEncoder()
y_train = le.fit_transform(X_train.place_id.values)
cols = fw.keys()
X_train = X_train[cols].values
X_test = X_test[cols].values
clf = KNeighborsClassifier(**params)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)
# Reverse sorting and taking the top 3 place ids
pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])
return pred_labels, row_ids
In [64]:
def grid_KNN_sub(fw, params, n_cells, fname):
if os.path.isfile(fname):
print 'File has been generated!'
else:
preds = np.zeros((8607230, 3), dtype=int)
for cell_id in tnrange(n_cells):
pred_labels, row_ids = one_cell_KNN_sub(cell_id, fw, params)
preds[row_ids] = pred_labels
df_aux = pd.DataFrame(preds, dtype=str, columns=['place1', 'place2', 'place3'])
generate_submission(df_aux, fname)
In [65]:
# Weight of features
fw = {'x':600, 'y':1200, 'hour_sin':15, 'hour_cos':15, 'weekday_sin':10, 'weekday_cos':10,
'month_sin':2, 'month_cos':2, 'day':0.01, 'accuracy': 30, 'year': 15}
params = {'n_neighbors':25, 'weights':'distance', 'metric':'manhattan'}
grid_KNN_sub(fw, params, n_cells, 'sub_KNN.csv')
In [66]:
build_pr_place_table_sub(df_train.place_id.unique())
In [67]:
# Modified version of process_one_cell_GNB used to generate the submission file.
def one_cell_GNB_sub(cell_id, cols):
# Select the specific cell with cell_id for training
X_train = pd.read_csv('./X_train/{}.csv'.format(cell_id), index_col=0)
# Select the specific cell with cell_id for testing
X_test = pd.read_csv('./X_test/{}.csv'.format(cell_id), index_col=0)
row_ids = X_test.index
y_train = X_train.place_id.values
X_train = X_train[cols].values
X_test = X_test[cols].values
# Probability of each place
pr_place_table = pd.read_csv('./pr_place_table/{}.csv'.format(cell_id/100), index_col=0)
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = pd.DataFrame(clf.predict_proba(X_test), columns=clf.classes_)
y_pred = y_pred.apply(lambda c: pr_place_table[str(cell_id)][c.index]*c.values, axis=1)
# Reverse sorting and taking the top 3 place ids
pred_labels = []
for _, row in y_pred.iterrows():
pred_labels.append(row.sort_values(ascending=False).index[:3])
return pred_labels, row_ids
In [68]:
def grid_GNB_sub(cols, n_cells, fname):
if os.path.isfile(fname):
print 'File has been generated!'
else:
preds = np.zeros((8607230, 3), dtype=int)
for cell_id in tnrange(n_cells):
pred_labels, row_ids = one_cell_GNB_sub(cell_id, cols)
preds[row_ids] = pred_labels
df_aux = pd.DataFrame(preds, dtype=str, columns=['place1', 'place2', 'place3'])
generate_submission(df_aux, fname)
In [69]:
# Features considered in GNB model
gnb_cols = ['x', 'y', 'hour_sin', 'hour_cos', 'minute', 'weekday', 'accuracy']
grid_GNB_sub(gnb_cols, n_cells, 'sub_GNB.csv')
In [70]:
# Modified version of process_one_cell_KDE used to generate the submission file.
def one_cell_KDE_sub(cell_id, cols):
# Select the specific cell with cell_id for training
X_train = pd.read_csv('./X_train/{}.csv'.format(cell_id), index_col=0)
# Select the specific cell with cell_id for testing
X_test = pd.read_csv('./X_test/{}.csv'.format(cell_id), index_col=0)
row_ids = X_test.index
# All place ids in the cell
place_ids = X_train.place_id.unique()
# Building KDE of columns, kde_cols, for each place_id
kde_table = {}
groups = X_train.groupby('place_id')
for col in cols:
table = {}
bw = groups.apply(scott_rule, col=col)
for place, group in groups:
kde = KernelDensity(kernel='gaussian', bandwidth=bw[place])
kde.fit(group[col][:, np.newaxis])
table[place] = kde
kde_table[col] = table
# Probability of each place
pr_place_table = pd.read_csv('./pr_place_table/{}.csv'.format(cell_id/100), index_col=0)
# Predicting
prob = pd.DataFrame(np.ones((place_ids.shape[0], X_test.shape[0])), columns=X_test.index, index=place_ids)
for col in cols:
samples = X_test[col].values[:,np.newaxis]
for place in prob.index:
kde = kde_table[col][place]
temp = np.exp(kde.score_samples(samples))
prob.loc[place] = prob.loc[place].values * temp
probs = prob.apply(lambda p: (p.values ** 0.2) * pr_place_table[str(cell_id)][p.index])
probs = probs.transpose()
# Sorting probabilities and choosing top 3 row by row
pred_labels = []
for _, row in probs.iterrows():
pred_labels.append(row.sort_values(ascending=False).index[:3])
return np.array(pred_labels), row_ids
In [71]:
def grid_KDE_sub(cols, num_cells, fname):
if os.path.isfile(fname):
print 'File has been generated!'
else:
preds = np.zeros((8607230, 3), dtype=int)
for cell_id in tnrange(num_cells):
pred_labels, row_ids = one_cell_KDE_sub(cell_id, cols)
preds[row_ids] = pred_labels
df_aux = pd.DataFrame(preds, dtype=str, columns=['place1', 'place2', 'place3'])
generate_submission(df_aux, fname)
In [72]:
# Features considered in KDE model
kde_cols = ['x', 'y', 'hour', 'weekday', 'accuracy']
grid_KDE_sub(kde_cols, n_cells, 'sub_KDE.csv')
In [73]:
if not os.path.isfile('sub_ensemble.csv'):
del df_train
df_KDE = pd.read_csv('sub_KDE.csv')
df_KNN = pd.read_csv('sub_KNN.csv')
df_GNB = pd.read_csv('sub_GNB.csv')
weights = [68, 71, 19]
models = [df_KDE, df_KNN, df_GNB]
df_ensemble = pd.DataFrame(columns=df_KDE.columns)
df_ensemble.row_id = df_KDE.row_id
for row in tnrange(8607230):
place_ids = {}
# Ensemble
for i, model in enumerate(models):
weight = weights[i]
for j, place_id in enumerate(model.loc[row].place_id.split()):
if place_id in place_ids:
place_ids[place_id] += weight/(j+1.0)
else:
place_ids[place_id] = weight/(j+1.0)
new_preds = ' '.join([x[0] for x in Counter(place_ids).most_common(3)])
# Select top 3 place_id
df_ensemble.set_value(row, 'place_id', new_preds)
df_ensemble.to_csv('sub_ensemble.csv', index=False, header=True)
print "File has been generated!"
Model | Score | Ranking |
---|---|---|
Benchmark | 0.56736 | 585/1212 |
My Ensemble Model | 0.59250 | 63/1212 |
The most important insight into this data is that the relative popularity of places, P(place), varies substantially over time, so it should really be written as P(place, time). For instance, if the following six places were in the same cell and I wanted to predict the most likely check-in place in December, 9586338177 would seem the least likely one because it had the lowest frequency in December compared to the others. This is why time plays one of the most important roles in this problem. Moreover, this drift is hard to forecast from the training data because there is no clear pattern, as the figures below show.
In [10]:
# Month runs past 12 because the second year's months are numbered 13 and up, so the two years can be compared side by side
_, ax = plt.subplots(nrows=3, ncols=2, sharey=True,figsize=(10,7))
plt.tight_layout(h_pad=3)
for i, place_id in enumerate(place_ranking[:6].index):
place = df_train[df_train.place_id == place_id]
month = 12*place.year - 12
col = i/3
row = i%3
ax[row][col].set_title('Histogram of Month, Place: {}'.format(place_id))
ax[row][col].set_xlabel('Month')
ax[row][col].set_ylabel('Frequency')
ax[row][col].set_xlim((0,18))
month.hist(bins=30, ax=ax[row][col])
Based on methods shared on the Kaggle forum, there are two kinds of ideas that could improve the KDE result and hence the ensemble:
In [ ]: