In [1]:
import numpy as np
import pandas as pd
Load the data, in this case sample data. Working with a sample allows us to debug more easily and quickly!
In [2]:
train_users = pd.read_csv('../data/train_users_sample.csv')
test_users = pd.read_csv('../data/test_users_sample.csv')
sessions = pd.read_csv('../data/sessions_sample.csv')
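In case you need to regenerate the samples, a minimal sketch could look like the cell below; the full-file paths and sample size are assumptions, not the competition's actual names.
In [ ]:
# Hypothetical: build a reproducible sample from the full competition file
full_train = pd.read_csv('../data/train_users.csv')
full_train.sample(n=10000, random_state=42).to_csv(
    '../data/train_users_sample.csv', index=False)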
Make a single DataFrame containing all the users:
In [3]:
users = pd.concat([train_users, test_users], axis=0, ignore_index=True)
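A toy illustration (made-up rows) of why this works even though test_users lacks country_destination: concat aligns columns and fills the gap with NaN.
In [ ]:
pd.concat([pd.DataFrame({'id': ['u1'], 'country_destination': ['NDF']}),
           pd.DataFrame({'id': ['u2']})], axis=0, ignore_index=True)
#    id country_destination
# 0  u1                 NDF
# 1  u2                 NaN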
Drop a useless column (test_users doesn't have it):
In [4]:
users.drop('date_first_booking', axis=1, inplace=True)
Some users entered their birth year instead of their age; convert those years into ages (the data is from 2015):
In [5]:
user_with_year_age_mask = users['age'] > 1000
users.loc[user_with_year_age_mask, 'age'] = 2015 - users.loc[user_with_year_age_mask, 'age']
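A quick check of the conversion on made-up values: a user who typed their birth year, 1985, becomes 30.
In [ ]:
toy_age = pd.Series([29.0, 1985.0])
mask = toy_age > 1000
toy_age.loc[mask] = 2015 - toy_age.loc[mask]
toy_age
# 0    29.0
# 1    30.0
# dtype: float64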
Set limits on age; anything outside [18, 100] becomes -1:
In [6]:
users.loc[(users['age'] > 100) | (users['age'] < 18), 'age'] = -1
Fill NaNs with -1 to make them more noticeable:
In [7]:
users['age'].fillna(-1, inplace=True)
Age is really fine-grained. We are going to make bins and fit each user into the proper age group:
In [8]:
bins = [-1, 20, 25, 30, 40, 50, 60, 75, 100]
users['age_group'] = np.digitize(users['age'], bins, right=True)
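A quick check of how np.digitize assigns groups with right=True (each age x goes to group i where bins[i-1] < x <= bins[i]); note that the -1 placeholder lands in its own group 0:
In [ ]:
np.digitize([-1, 18, 20, 33, 99], bins, right=True)
# array([0, 1, 1, 4, 8])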
Count how many unknown or missing values each user has; summing the boolean masks adds one for each condition that holds:
In [9]:
users['nans'] = np.sum([
    (users['age'] == -1),
    (users['gender'] == '-unknown-'),
    (users['language'] == '-unknown-'),
    (users['first_affiliate_tracked'] == 'untracked'),
    (users['first_browser'] == '-unknown-')
], axis=0)
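The trick here is that booleans behave as 0/1 under np.sum, so stacking the masks and summing along axis=0 counts, per user, how many conditions hold:
In [ ]:
np.sum([pd.Series([True, False]), pd.Series([True, True])], axis=0)
# array([2, 1])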
Parse the dates; timestamp_first_active is a 14-digit integer formatted as %Y%m%d%H%M%S:
In [10]:
users['date_account_created'] = pd.to_datetime(users['date_account_created'])
users['date_first_active'] = pd.to_datetime(users['timestamp_first_active'], format='%Y%m%d%H%M%S')
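A one-off check (the value is made up) that the format string matches the 14-digit timestamps:
In [ ]:
pd.to_datetime('20150312065533', format='%Y%m%d%H%M%S')
# Timestamp('2015-03-12 06:55:33')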
Convert to DatetimeIndex:
In [11]:
date_account_created = pd.DatetimeIndex(users['date_account_created'])
date_first_active = pd.DatetimeIndex(users['date_first_active'])
Split dates into day, week, month, year:
In [12]:
users['day_account_created'] = date_account_created.day
users['weekday_account_created'] = date_account_created.weekday
# DatetimeIndex.week was removed in pandas 2.0; isocalendar() is the replacement
users['week_account_created'] = date_account_created.isocalendar().week.astype(int).to_numpy()
users['month_account_created'] = date_account_created.month
users['year_account_created'] = date_account_created.year
users['day_first_active'] = date_first_active.day
users['weekday_first_active'] = date_first_active.weekday
users['week_first_active'] = date_first_active.isocalendar().week.astype(int).to_numpy()
users['month_first_active'] = date_first_active.month
users['year_first_active'] = date_first_active.year
Get the difference (time lag) between the date the account was created and the date the user was first active:
In [13]:
# Note: subtracting datetime64[ns] arrays gives the lag in nanoseconds
users['time_lag'] = (date_account_created.values - date_first_active.values).astype(int)
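To see the unit: one day of lag comes out as 86400 * 10**9, i.e. nanoseconds.
In [ ]:
(np.datetime64('2015-01-02') - np.datetime64('2015-01-01')).astype('timedelta64[ns]').astype(int)
# 86400000000000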
Drop the columns that are now redundant:
In [14]:
drop_list = [
    'date_account_created',
    'date_first_active',
    'timestamp_first_active'
]
users.drop(drop_list, axis=1, inplace=True)
Rename user_id so the sessions can be joined with the users DataFrame:
In [15]:
sessions.rename(columns={'user_id': 'id'}, inplace=True)
This simply counts the number of times each user performed an action. One thing I learned from this competition is to really understand what you want to do and not start coding an over-engineered solution (KISS).
In [16]:
action_count = sessions.groupby(['id', 'action'])['secs_elapsed'].agg(len).unstack()
action_type_count = sessions.groupby(['id', 'action_type'])['secs_elapsed'].agg(len).unstack()
action_detail_count = sessions.groupby(['id', 'action_detail'])['secs_elapsed'].agg(len).unstack()
device_type_sum = sessions.groupby(['id', 'device_type'])['secs_elapsed'].agg(sum).unstack()
sessions_data = pd.concat([action_count, action_type_count,
                           action_detail_count, device_type_sum], axis=1)
sessions_data.columns = sessions_data.columns.map(lambda x: str(x) + '_count')
# Most used device: the device type that appears most often per user
# (max() would only pick the alphabetically last one)
sessions_data['most_used_device'] = sessions.groupby('id')['device_type'].agg(
    lambda s: s.value_counts().idxmax())
users = users.join(sessions_data, on='id')
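What the count-then-unstack pattern produces, on made-up session rows: one row per user, one column per action, with the per-user counts as values.
In [ ]:
toy = pd.DataFrame({'id': ['a', 'a', 'b'],
                    'action': ['search', 'book', 'search'],
                    'secs_elapsed': [10, 20, 30]})
toy.groupby(['id', 'action'])['secs_elapsed'].agg(len).unstack()
# action  book  search
# id
# a        1.0     1.0
# b        NaN     1.0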
Compute summary statistics of secs_elapsed for each user:
In [17]:
secs_elapsed = sessions.groupby('id')['secs_elapsed']
# dict-based renaming in .agg() was removed in pandas 1.0;
# named aggregation produces the same columns
secs_elapsed = secs_elapsed.agg(
    secs_elapsed_sum='sum',
    secs_elapsed_mean='mean',
    secs_elapsed_min='min',
    secs_elapsed_max='max',
    secs_elapsed_median='median',
    secs_elapsed_std='std',
    secs_elapsed_var='var',
    day_pauses=lambda x: (x > 86400).sum(),    # pauses longer than a day
    long_pauses=lambda x: (x > 300000).sum(),  # longer than ~3.5 days
    short_pauses=lambda x: (x < 3600).sum(),   # shorter than an hour
    session_length=np.count_nonzero
)
users = users.join(secs_elapsed, on='id')
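A sanity check of the pause thresholds on made-up durations (86400 s is one day, 3600 s is one hour):
In [ ]:
toy_secs = pd.Series([120, 100000, 400000])
((toy_secs > 86400).sum(), (toy_secs > 300000).sum(), (toy_secs < 3600).sum())
# (2, 1, 1)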
Encode the categorical features as one-hot (dummy) columns:
In [18]:
categorical_features = [
    'gender', 'signup_method', 'signup_flow', 'language',
    'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked',
    'signup_app', 'first_device_type', 'first_browser', 'most_used_device'
]
users = pd.get_dummies(users, columns=categorical_features)
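What get_dummies does, shown on a toy column: each category becomes its own indicator column (booleans in recent pandas, 0/1 in older versions).
In [ ]:
pd.get_dummies(pd.DataFrame({'gender': ['MALE', 'FEMALE', '-unknown-']}),
               columns=['gender'])
# columns: gender_-unknown-, gender_FEMALE, gender_MALE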
Finally, set id as the index, split users back into train and test sets, and save them (the test set drops the empty country_destination column):
In [19]:
users.set_index('id', inplace=True)
users.loc[train_users['id']].to_csv('../cache/train_users.csv')
users.loc[test_users['id']].drop('country_destination', axis=1).to_csv('../cache/test_users.csv')