In [2]:
import math
import os
import pandas as pd
import numpy as np
import time
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
# For checking progress
from tqdm import tqdm_notebook, tnrange
# For plotting
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # For 3D scatter
import matplotlib.cm as cm # For colored labels
import seaborn as sns
%matplotlib inline
# Notebook-wide plotting defaults: figure size and base font size
plt.rc('figure', figsize=(8, 6))
plt.rc('font', size=16)
In [3]:
# Load the training and testing tables from the local data directory
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)
In [5]:
# Number of check-ins per place_id; value_counts() orders the result from the
# most frequent place to the least frequent one
place_ranking = df_train['place_id'].value_counts()
In [4]:
# Report the (rows, columns) shape of both tables
print('Size of training data: {}'.format(df_train.shape))
print('Size of testing data: {}'.format(df_test.shape))
In [5]:
# Per-column summary statistics of the training table
print("Description of training data: \n")
print(df_train.describe())
In [6]:
# Per-column summary statistics of the testing table
print("Description of testing data: \n")
print(df_test.describe())
# Drop the reference to the testing table once it has been summarised
del df_test
In [7]:
# Hexbin joint distribution of x and y on a 1% random sample of the training data.
# random_state pins the sample so re-running the cell reproduces the same figure.
sns.jointplot(data=df_train.sample(frac=0.01, random_state=0), x='x', y='y', kind='hex',
              stat_func=None, xlim=(0,10), ylim=(0,10));
# The title is placed with explicit x/y offsets relative to the current axes
# (presumably to centre it over the joint panel -- verify visually).
plt.title('Distribution of 1% random samples',x=-2.5,y=1.2,fontsize=18);
In [8]:
# Spatial spread of the check-ins recorded at the most popular place
data = df_train.loc[df_train['place_id'] == place_ranking.index[0]]
ax = data.plot.scatter(x='x', y='y')
ax.set_title('Distribution at 1st popular place', fontsize=18)
ax.set_xlabel('X')
ax.set_ylabel('Y')
print('Standard deviation of x: {}'.format(data['x'].std()))
print('Standard deviation of y: {}'.format(data['y'].std()))
del data
In [9]:
# Spatial spread of the check-ins recorded at the second most popular place
data = df_train.loc[df_train['place_id'] == place_ranking.index[1]]
ax = data.plot.scatter(x='x', y='y')
ax.set_title('Distribution at 2nd popular place', fontsize=18)
ax.set_xlabel('X')
ax.set_ylabel('Y')
print('Standard deviation of x: {}'.format(data['x'].std()))
print('Standard deviation of y: {}'.format(data['y'].std()))
del data
In [10]:
# Spatial spread of the check-ins recorded at the third most popular place
data = df_train.loc[df_train['place_id'] == place_ranking.index[2]]
ax = data.plot.scatter(x='x', y='y')
ax.set_title('Distribution at 3rd popular place', fontsize=18)
ax.set_xlabel('X')
ax.set_ylabel('Y')
print('Standard deviation of x: {}'.format(data['x'].std()))
print('Standard deviation of y: {}'.format(data['y'].std()))
del data
In [11]:
# Two stacked accuracy histograms sharing the y scale: the full range on top,
# and the same histogram with the x axis limited to 0-200 underneath
_, AX = plt.subplots(nrows=2, sharey=True)

AX[0].set(title='Histogram of accuracy', xlabel='Accuracy', ylabel='Frequency')
df_train['accuracy'].hist(bins=100, ax=AX[0])

# The x limits are fixed before drawing, as in the top panel's label setup
AX[1].set_title('Histogram of accuracy under 200')
AX[1].set_xlim((0, 200))
AX[1].set(xlabel='Accuracy', ylabel='Frequency')
df_train['accuracy'].hist(bins=100, ax=AX[1])

plt.tight_layout(h_pad=1.5)
In [12]:
# Number of distinct place ids, and that number as a percentage of the training rows
unique_id = df_train.place_id.unique()
print("Number of unique place id: {}, roughly {:.3f} % of training data.".format(
    len(unique_id), len(unique_id) * 100.0 / df_train.shape[0]))
del unique_id
In [13]:
# Horizontal bars for the 5 most frequent places; the y axis is inverted so the
# most popular place is drawn at the top.
ax = place_ranking.head(5).plot.barh(xlim=(0,2000))
ax.invert_yaxis()
# Labels are applied to the returned axes AFTER plotting, so that any axis label
# pandas derives from the Series cannot replace them.
ax.set_xlabel('Frequency')
ax.set_ylabel('Place id')
ax.set_title('Top 5 most popular place id', fontsize=18);
In [14]:
# Horizontal bars for the 5 least frequent places (last rows of the ranking).
ax = place_ranking.tail(5).plot.barh(xlim=(0,5))
# Labels are applied to the returned axes AFTER plotting, so that any axis label
# pandas derives from the Series cannot replace them.
ax.set_xlabel('Frequency')
ax.set_ylabel('Place id')
ax.set_title('Bottom 5 least popular place id', fontsize=18);
In [15]:
# 1st popular place : 8772469670
# Stored as checkin_time rather than `time`, so the `time` module imported at the
# top of the notebook is not overwritten by a Series.
checkin_time = df_train[df_train.place_id==place_ranking.index[0]].time
# Fold the timestamps onto one week: offset within a 24*60*7-minute cycle
checkin_time = checkin_time % (24*60*7)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.title('Weekly cycle of 1st popular place', fontsize=18)
checkin_time.hist(bins=100);
del checkin_time
In [16]:
# 2nd popular place : 1623394281
# Stored as checkin_time rather than `time`, so the `time` module imported at the
# top of the notebook is not overwritten by a Series.
checkin_time = df_train[df_train.place_id==place_ranking.index[1]].time
# Fold the timestamps onto one week: offset within a 24*60*7-minute cycle
checkin_time = checkin_time % (24*60*7)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.title('Weekly cycle of 2nd place', fontsize=18)
checkin_time.hist(bins=100);
del checkin_time
In [17]:
# 3rd popular place : 1308450003
# Stored as checkin_time rather than `time`, so the `time` module imported at the
# top of the notebook is not overwritten by a Series.
checkin_time = df_train[df_train.place_id==place_ranking.index[2]].time
# Fold the timestamps onto one week: offset within a 24*60*7-minute cycle
checkin_time = checkin_time % (24*60*7)
plt.xlabel('Time')
plt.ylabel('Frequency')
# Ordinal corrected from "3nd" to "3rd"
plt.title('Weekly cycle of 3rd place', fontsize=18)
checkin_time.hist(bins=100);
del checkin_time
In [18]:
# 1st popular place : 8772469670
# Stored as checkin_time rather than `time`, so the imported `time` module survives.
checkin_time = df_train[df_train.place_id==place_ranking.index[0]].time
# Histogram of time: 5000 equal-width bins between the smallest and largest timestamp
hist = np.histogram(checkin_time,5000)
# FFT of Histogram (magnitude spectrum of the bin counts)
hist_fft = np.absolute(np.fft.fft(hist[0]))
plt.plot(hist_fft)
plt.xlim([0,1000])
plt.title('FFT of Histogram at 1st place', fontsize=18)
plt.xlabel('Frequency')
# FFT bin k means k cycles over the time range covered by the histogram, i.e. from
# the first to the last bin edge (min..max of the timestamps), so period = span / k.
# Dividing time.max() by k instead overestimates the period whenever the first
# check-in is not at time 0.
span = hist[1][-1] - hist[1][0]
# The strongest peak is searched in a fixed window of bins; the window offset is
# added back to turn the argmax into an absolute bin index.
print("Period 1: {}, close to 10080 minutes a week.".format(span / (hist_fft[2:200].argmax()+2.0)))
print("Period 2: {}, close to 1440 minutes a day.".format(span / (hist_fft[400:600].argmax()+400.0)))
del checkin_time, span
In [19]:
# 2nd popular place : 1623394281
# Stored as checkin_time rather than `time`, so the imported `time` module survives.
checkin_time = df_train[df_train.place_id==place_ranking.index[1]].time
# Histogram of time: 5000 equal-width bins between the smallest and largest timestamp
hist = np.histogram(checkin_time,5000)
# FFT of Histogram (magnitude spectrum of the bin counts)
hist_fft = np.absolute(np.fft.fft(hist[0]))
plt.plot(hist_fft)
plt.xlim([0,1000])
plt.title('FFT of Histogram at 2nd place', fontsize=18)
plt.xlabel('Frequency')
# FFT bin k means k cycles over the time range covered by the histogram, i.e. from
# the first to the last bin edge (min..max of the timestamps), so period = span / k.
# Dividing time.max() by k instead overestimates the period whenever the first
# check-in is not at time 0.
span = hist[1][-1] - hist[1][0]
# The strongest peak is searched in a fixed window of bins; the window offset is
# added back to turn the argmax into an absolute bin index.
print("Period 1: {}, close to 10080 minutes a week.".format(span / (hist_fft[2:200].argmax()+2.0)))
print("Period 2: {}, close to 1440 minutes a day.".format(span / (hist_fft[400:600].argmax()+400.0)))
del checkin_time, span
In [20]:
# 3rd popular place : 1308450003
# Stored as checkin_time rather than `time`, so the imported `time` module survives.
checkin_time = df_train[df_train.place_id==place_ranking.index[2]].time
# Histogram of time: 5000 equal-width bins between the smallest and largest timestamp
hist = np.histogram(checkin_time,5000)
# FFT of Histogram (magnitude spectrum of the bin counts)
hist_fft = np.absolute(np.fft.fft(hist[0]))
plt.plot(hist_fft)
plt.xlim([0,1000])
plt.title('FFT of Histogram at 3rd place', fontsize=18)
plt.xlabel('Frequency')
# FFT bin k means k cycles over the time range covered by the histogram, i.e. from
# the first to the last bin edge (min..max of the timestamps), so period = span / k.
# Dividing time.max() by k instead overestimates the period whenever the first
# check-in is not at time 0.
span = hist[1][-1] - hist[1][0]
# The daily peak is searched in bins 300-500 here (400-600 for the other places);
# the window offset is added back to turn the argmax into an absolute bin index.
print("Period 1: {}, close to 10080 minutes a week.".format(span / (hist_fft[2:200].argmax()+2.0)))
print("Period 2: {}, close to 1440 minutes a day.".format(span / (hist_fft[300:500].argmax()+300.0)))
del checkin_time, span
In [21]:
# Length of the data-collection window: latest minus earliest training timestamp,
# interpreted as a number of minutes
T = df_train.time.max() - df_train.time.min()
T = pd.Timedelta(minutes=T)
# T.seconds is the sub-day remainder in seconds. Floor division (//) yields whole
# hours and minutes regardless of whether `/` performs true division.
print("Time period of collecting data is {} days {} hours {} minutes".format(T.days, T.seconds//3600, T.seconds//60%60))
del T
In [22]:
# Histograms of hour, weekday, minute and month at the 1st popular place
place = place_ranking.index[0]
_, AX = plt.subplots(ncols=2, nrows=2, figsize=(15,6))
data = df_train[df_train.place_id==place]
# Each feature is derived from the minute-based time column and plotted directly.
# Assigning it as an attribute (data.hour = ...) does not create a DataFrame column,
# and recent pandas versions warn about it.

# Hour of day: minutes -> hours, wrapped at 24
((data.time/60)%24).astype(int).hist(bins=100,ax=AX[0][0])
AX[0][0].set_title('Histogram of Hour at place {}'.format(place), fontsize=18)
AX[0][0].set_xlabel('Hour')
AX[0][0].set_ylabel('Frequency')

# Day of week: minutes -> days, wrapped at 7
((data.time/(60*24))%7).astype(int).hist(bins=100,ax=AX[0][1])
AX[0][1].set_title('Histogram of Weekday at place {}'.format(place), fontsize=18)
AX[0][1].set_xlabel('Weekday')

# Minute within the hour
(data.time%60).astype(int).hist(bins=100,ax=AX[1][0])
AX[1][0].set_title('Histogram of Minute at place {}'.format(place), fontsize=18)
AX[1][0].set_xlabel('Minute')

# Month, approximated as consecutive 30-day blocks wrapped at 12
(data.time/(60*24*30)%12).astype(int).hist(bins=100,ax=AX[1][1])
AX[1][1].set_title('Histogram of Month at place {}'.format(place), fontsize=18)
AX[1][1].set_xlabel('Month')
plt.tight_layout(h_pad=1.5)
In [23]:
# Histograms of hour, weekday, minute and month at the 2nd popular place
place = place_ranking.index[1]
_, AX = plt.subplots(ncols=2, nrows=2, figsize=(15,6))
data = df_train[df_train.place_id==place]
# Each feature is derived from the minute-based time column and plotted directly.
# Assigning it as an attribute (data.hour = ...) does not create a DataFrame column,
# and recent pandas versions warn about it.

# Hour of day: minutes -> hours, wrapped at 24
((data.time/60)%24).astype(int).hist(bins=100,ax=AX[0][0])
AX[0][0].set_title('Histogram of Hour at place {}'.format(place), fontsize=18)
AX[0][0].set_xlabel('Hour')
AX[0][0].set_ylabel('Frequency')

# Day of week: minutes -> days, wrapped at 7
((data.time/(60*24))%7).astype(int).hist(bins=100,ax=AX[0][1])
AX[0][1].set_title('Histogram of Weekday at place {}'.format(place), fontsize=18)
AX[0][1].set_xlabel('Weekday')

# Minute within the hour
(data.time%60).astype(int).hist(bins=100,ax=AX[1][0])
AX[1][0].set_title('Histogram of Minute at place {}'.format(place), fontsize=18)
AX[1][0].set_xlabel('Minute')

# Month, approximated as consecutive 30-day blocks wrapped at 12
(data.time/(60*24*30)%12).astype(int).hist(bins=100,ax=AX[1][1])
AX[1][1].set_title('Histogram of Month at place {}'.format(place), fontsize=18)
AX[1][1].set_xlabel('Month')
plt.tight_layout(h_pad=1.5)
del data,place
In [24]:
# Raw accuracy next to its base-10 logarithm, with the coefficient of variation
# (standard deviation divided by mean) printed for both
log_accuracy = np.log10(df_train['accuracy'])
_, AX = plt.subplots(ncols=2, figsize=(15,6))

df_train['accuracy'].hist(bins=100, ax=AX[0])
AX[0].set_title('Histogram of Accuracy', fontsize=18)
AX[0].set_xlabel('Accuracy')
AX[0].set_ylabel('Frequency')

log_accuracy.hist(bins=100, ax=AX[1])
AX[1].set_title('Histogram of Log of Accuracy', fontsize=18)
AX[1].set_xlabel('Log of Accuracy')

print("CV of accuracy: {:.3f}".format(np.std(df_train['accuracy']) / np.mean(df_train['accuracy'])))
print("CV of log of accuracy: {:.3f}".format(np.std(log_accuracy) / np.mean(log_accuracy)))
del log_accuracy
In [25]:
# Coefficient of variation (std / mean) of accuracy and of log10(accuracy),
# reported separately for each of the three most popular places
for i, order in enumerate(['1st', '2nd', '3rd']):
    place = place_ranking.index[i]
    data = df_train.loc[df_train['place_id'] == place]
    print("The {} place:".format(order))
    print("CV of accuracy: {:.3f}".format(
        np.std(data.accuracy) / np.mean(data.accuracy)))
    print("CV of log of accuracy: {:.3f}".format(
        np.std(np.log10(data.accuracy)) / np.mean(np.log10(data.accuracy))))
    print("-"*20)
In [26]:
# Keep only the check-ins inside the corner cell x < 0.1, y < 0.1
small_grid = df_train.loc[(df_train['x'] < 0.1) & (df_train['y'] < 0.1)]
# Mapping each place id with one color: evenly spaced samples of the rainbow
# colormap, one per distinct place id in the cell
grid_places = small_grid['place_id'].unique()
color = dict(zip(grid_places, cm.rainbow(np.linspace(0, 1, grid_places.shape[0]))))
del grid_places
In [27]:
# One scatter layer per place on a shared axes, each drawn in that place's color
f, ax = plt.subplots()
for place, group in small_grid.groupby('place_id'):
    group.plot.scatter(x='x', y='y', color=color[place], ax=ax)
ax.set_title('Check-ins colored by place_id', fontsize=18);
The clusters turn out to be easier to separate at specific hours, because check-ins at some places occur more often at particular hours of the day.
In [29]:
# Converted minutes into hours (hour of day, 0-23)
# assign() returns a new frame with the extra column, so nothing is written into a
# slice of df_train -- the in-place .loc assignment raised SettingWithCopyWarning.
small_grid = small_grid.assign(hour=((small_grid.time / 60) % 24).astype(int))
In [30]:
# 3D scatter of the small grid with hour of day as the third axis.
# The 3D axes are created with add_subplot(projection='3d'); passing the projection
# keyword to Figure.gca() is deprecated and rejected by newer Matplotlib releases.
ax = plt.figure(figsize=(15,8)).add_subplot(111, projection='3d')
# One scatter call per place so every place keeps its own color
for place, group in small_grid.groupby('place_id'):
    ax.scatter(group.x,group.y,group.hour,c=color[place])
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Hour')
ax.set_title('3D Scatter of x, y and hour', fontsize=18);
From the histograms below, we can see that the check-ins at these two places are almost complementary along the time dimension: one is popular in the morning and the other is popular at night. This supports the idea of adding time as an extra dimension.
In [31]:
# List the three place ids with the most check-ins inside the small grid
print('Top 3 popular places in the small grid')
for order, place in zip(('1st', '2nd', '3rd'),
                        small_grid['place_id'].value_counts().head(3).index):
    print('{} place: {}'.format(order, place))
In [32]:
# Hour-of-day histograms for two hard-coded place ids of the small grid, stacked
# with a shared y scale so their daily profiles can be compared directly
f, (ax1, ax2) = plt.subplots(nrows=2, sharey=True, figsize=(15,10))

small_grid.loc[small_grid['place_id'] == 1006316884, 'hour'].hist(bins=100, ax=ax1)
ax1.set_title('Histogram of check-ins at the 1st popular place', fontsize=18)
ax1.set_xlabel('Hour')
ax1.set_ylabel('Frequency')

small_grid.loc[small_grid['place_id'] == 8378301865, 'hour'].hist(bins=100, ax=ax2)
ax2.set_title('Histogram of check-ins at the 3rd popular place', fontsize=18)
ax2.set_xlabel('Hour')
ax2.set_ylabel('Frequency')

# The small grid is dropped once both histograms are drawn
del small_grid
For this supervised classification problem, these are the algorithms I am trying to use: