In [1]:
# Render matplotlib figures inline in the notebook.
%matplotlib inline
import numpy as np
import pandas as pd
from datetime import timedelta
import datetime as dt
import matplotlib.pyplot as plt
# Default figure size for all plots in this notebook (wide EDA heatmaps/charts).
plt.rcParams['figure.figsize'] = [16, 10]
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
import warnings
# NOTE(review): this silences ALL warnings, including deprecation notices from
# pandas/sklearn — consider filtering specific categories instead.
# timedelta/dt/xgb/train_test_split/PCA/MiniBatchKMeans are unused in this
# chunk — presumably used in later cells; verify before removing.
warnings.filterwarnings('ignore')
In [2]:
# Load the preprocessed train/test frames produced by an earlier pipeline step.
# NOTE(review): unpickling executes arbitrary code — safe only because these
# files are generated locally under ../data/generated; do not point this at
# untrusted input.
train = pd.read_pickle('../data/generated/preprocessed_train.pkl')
test = pd.read_pickle('../data/generated/preprocessed_test.pkl')
In [3]:
# Quick sanity check on dataset sizes, then a peek at the first rows.
print(f'We have {train.shape[0]} training rows and {test.shape[0]} test rows.')
print(f'We have {train.shape[1]} training columns and {test.shape[1]} test columns.')
train.head(5)
Out[3]:
In [4]:
# Pairwise correlation matrix of the training columns, visualised as a heatmap.
corr = train.corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
Out[4]:
In [5]:
# Flatten the correlation matrix into (col_a, col_b, corr) rows, ranked by
# correlation strength.
so = (corr
      .unstack()
      .sort_values(kind='quicksort', ascending=False)
      .reset_index())
# Drop the self-correlations (always 1.0) and show the strongest pairs.
so[so['level_0'] != so['level_1']].head(20)
Out[5]:
In [6]:
# Columns most correlated with the ORDERED target.
so.loc[so['level_0'] == 'ORDERED'].head(20)
Out[6]:
In [7]:
# Report the distinct values each indicator column takes across train and test
# combined.  Replaces 31 copy-pasted print statements with a generated column
# list + loop, and fixes the hard-coded "only two values" wording, which was
# wrong for the single-valued columns (TAG_1/TAG_2 said "only one values").
indicator_cols = (
    ['PLATFORM_{}'.format(c) for c in 'ABCD']
    + ['TAG', 'COMPLETED', 'PUBLISHED']
    + ['SCHEDULE_{}'.format(i) for i in range(1, 11)]
    + ['GENRE_{}'.format(i) for i in range(1, 9)]
    + ['TAG_{}'.format(i) for i in range(1, 7)]
)
for col in indicator_cols:
    # Union over both splits so a value appearing only in test is not missed.
    values = set(train[col].unique()) | set(test[col].unique())
    print('The {} has {} distinct value(s): {}.'.format(col, len(values), values))
In [8]:
# Per-user (3-part key) session-count totals and averages.
usr1 = train.groupby(['USER_ID_1', 'USER_ID_2', 'USER_ID_3']).agg(
    {'SESSION_CNT': ['sum', 'mean']})
# Flatten the (column, stat) MultiIndex into USR_SESSION_CNT_<STAT> names.
usr1.columns = ['USR_SESSION_CNT_{}'.format(stat.upper()) for _, stat in usr1.columns]
usr1 = usr1.reset_index()
usr1.head(10)
Out[8]:
In [9]:
# Per-(USER_ID_1, USER_ID_2) sums and means of the 16 TENDENCY_* columns.
# The agg spec and the 32 output column names are generated programmatically,
# replacing the 16x copy-pasted dict entries and hand-typed name list (which
# were error-prone to keep in sync).
tendency_cols = ['TENDENCY_{}'.format(i) for i in range(1, 17)]
usr2 = train.groupby(['USER_ID_1', 'USER_ID_2']).agg(
    {col: ['sum', 'mean'] for col in tendency_cols})
# Flatten the (column, stat) MultiIndex into USR_<COL>_<STAT>; dict insertion
# order is preserved by agg, so names line up with the original layout.
usr2.columns = ['USR_{}_{}'.format(col, stat.upper()) for col, stat in usr2.columns]
usr2 = usr2.reset_index()
usr2.head(10)
Out[9]:
In [10]:
# Combine the session-count and tendency aggregates on the shared 2-part key.
# NOTE(review): usr1 is keyed by (USER_ID_1, USER_ID_2, USER_ID_3) but usr2 by
# only the first two, so usr2 rows fan out across USER_ID_3 — confirm intended.
usr = pd.merge(usr1, usr2, on=['USER_ID_1', 'USER_ID_2'])
usr.head(10)
Out[10]:
In [11]:
# Per-product sums and means of episode/date features.
# NOTE(review): summing/averaging START_DATE only makes sense if it is numeric
# (e.g. an ordinal/epoch encoding from preprocessing) — verify upstream.
prd_cols = ['LAST_EPISODE', 'START_DATE', 'TOTAL_EPISODE_CNT']
prd = train.groupby('PRODUCT_ID').agg({col: ['sum', 'mean'] for col in prd_cols})
# Flatten the (column, stat) MultiIndex into PRD_<COL>_<STAT> names.
prd.columns = ['PRD_{}_{}'.format(col, stat.upper()) for col, stat in prd.columns]
prd = prd.reset_index()
prd.head(10)
Out[11]:
In [12]:
# Per (user, product) pair: number of rows seen, number of orders, and the
# order-through ratio.
by_user_product = train.groupby(
    [train.USER_ID_1, train.USER_ID_2, train.USER_ID_3, train.PRODUCT_ID])
usr_prd = by_user_product.agg({'USER_ID_1': 'size', 'ORDERED': 'sum'})
usr_prd.columns = ['UP_VIEW_CNT', 'UP_ORDERED_SUM']
# Fraction of views that converted to orders, stored compactly as float32.
usr_prd['UP_ORDERED_RATIO'] = (
    usr_prd['UP_ORDERED_SUM'] / usr_prd['UP_VIEW_CNT']).astype(np.float32)
usr_prd = usr_prd.reset_index()
usr_prd.head(10)
Out[12]:
In [ ]: