In [1]:
"""
Data preparation for the model-based task:
1. extract the data with the selected features;
2. set rare categorical values to 'other';
3. fit a label encoder and a one-hot encoder for the new data set.
"""
##==================== Package ====================##
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from dummyPy import OneHotEncoder
import random
import pickle  # to store intermediate objects (counts, encoders)
In [7]:
##==================== File-Path (fp) ====================##
## raw data (input, read only)
fp_train = "../../Datasets/ctr/train.csv"
fp_test = "../../Datasets/ctr/test.csv"
## sub-sampled training set
fp_sub_train_f = "../../Datasets/ctr/sub_train_f.csv"
fp_col_counts = "../../Datasets/ctr/col_counts"
## data after selecting the features (needed by LR_fun)
## and setting rare category values to 'other' (feature filtering)
fp_train_f = "../../Datasets/ctr/train_f.csv"
fp_test_f = "../../Datasets/ctr/test_f.csv"
## stored encoders for the label / one-hot encoding step
fp_lb_enc = "../../Datasets/ctr/lb_enc"
fp_oh_enc = "../../Datasets/ctr/oh_enc"
In [8]:
##==================== pre-Processing ====================##
## a few simple original features are selected for the data set
'''features used:
C1:               int, 1001, 1002, ...
banner_pos:       int, 0, 1, 2, 3, ...
site_domain:      object, large set of categorical values
site_id:          object, large set of categorical values
site_category:    object, large set of categorical values
app_id:           object, large set of categorical values
app_category:     object, small set of categorical values
device_type:      int, 0, 1, 2, 3, 4
device_conn_type: int, 0, 1, 2, 3
C14:              int, small set of int values
C15:              int, ...
C16:              int, ...
'''
## feature names
cols = ['C1',
'banner_pos',
'site_domain',
'site_id',
'site_category',
'app_id',
'app_category',
'device_type',
'device_conn_type',
'C14',
'C15',
'C16']
cols_train = ['id', 'click']
cols_test = ['id']
cols_train.extend(cols)
cols_test.extend(cols)
In [9]:
## data reading
df_train_ini = pd.read_csv(fp_train, nrows = 10)
df_train_org = pd.read_csv(fp_train, chunksize = 1000000, iterator = True)
df_test_org = pd.read_csv(fp_test, chunksize = 1000000, iterator = True)
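## optional sanity check (sketch): confirm the selected columns exist in the
## 10-row preview and inspect their dtypes and cardinality
print(df_train_ini[cols].dtypes)
print(df_train_ini[cols].nunique())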
In [10]:
#----- count the number of categories for each feature -----#
## 1. initialise the counting dict
cols_counts = {}  # value counts per feature
for col in cols:
    cols_counts[col] = df_train_ini[col].value_counts()
In [11]:
cols_counts
Out[11]: (per-feature value counts on the 10-row preview; output omitted)
In [12]:
## 2. count through the train set
for chunk in df_train_org:
    for col in cols:
        cols_counts[col] = pd.concat([cols_counts[col], chunk[col].value_counts()])
## 3. count through the test set
for chunk in df_test_org:
    for col in cols:
        cols_counts[col] = pd.concat([cols_counts[col], chunk[col].value_counts()])
## 4. merge duplicated indices in the counting vectors
for col in cols:
    cols_counts[col] = cols_counts[col].groupby(cols_counts[col].index).sum()
    # sort the counts in descending order
    cols_counts[col] = cols_counts[col].sort_values(ascending=False)
## 5. store the value counts
pickle.dump(cols_counts, open(fp_col_counts, 'wb'))
## 6. plot the distribution of the value counts (one panel per feature)
fig = plt.figure(1)
for i, col in enumerate(cols):
    ax = fig.add_subplot(4, 3, i + 1)
    ax.fill_between(np.arange(len(cols_counts[col])), cols_counts[col].values)
    # ax.set_title(col)
plt.show()
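## optional sanity check (sketch): reload the stored counts and glance at the head
## of each (sorted) counting vector to confirm the long-tailed distributions
cols_counts_chk = pickle.load(open(fp_col_counts, 'rb'))
for col in cols:
    print(col, len(cols_counts_chk[col]), cols_counts_chk[col].head(3).to_dict())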
In [18]:
#----- set rare categories to 'other' -----#
# cols_counts = pickle.load(open(fp_col_counts, 'rb'))
## keep at most the k most frequent values of each categorical variable
## and map the rest to 'other'
k = 99
col_index = {}
for col in cols:
    col_index[col] = cols_counts[col][0: k].index
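## optional check (sketch): fraction of rows (train + test) covered by the k most
## frequent values of each feature; a high ratio justifies mapping the tail to 'other'
for col in cols:
    print(col, cols_counts[col][0: k].sum() / cols_counts[col].sum())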
df_train_org = pd.read_csv(fp_train, dtype = {'id': str}, chunksize = 1000000, iterator = True)
df_test_org = pd.read_csv(fp_test, dtype = {'id': str}, chunksize = 1000000, iterator = True)
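## note (sketch): fp_train_f / fp_test_f are opened in append mode below,
## so remove any output of a previous run before re-executing this cell
import os
for fp in (fp_train_f, fp_test_f):
    if os.path.exists(fp):
        os.remove(fp)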
## train set
hd_flag = True  # write the column names in the first chunk only
for chunk in df_train_org:
    df = chunk.copy()
    for col in cols:
        df[col] = df[col].astype('object')
        # map all rare values to 'other'
        df.loc[~df[col].isin(col_index[col]), col] = 'other'
    with open(fp_train_f, 'a') as f:
        df.to_csv(f, columns=cols_train, header=hd_flag, index=False)
    hd_flag = False
## test set
hd_flag = True  # write the column names in the first chunk only
for chunk in df_test_org:
    df = chunk.copy()
    for col in cols:
        df[col] = df[col].astype('object')
        # map all rare values to 'other'
        df.loc[~df[col].isin(col_index[col]), col] = 'other'
    with open(fp_test_f, 'a') as f:
        df.to_csv(f, columns=cols_test, header=hd_flag, index=False)
    hd_flag = False
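## optional sanity check (sketch): after filtering, each selected column should
## contain at most k + 1 distinct values (the k kept categories plus 'other');
## nrows=100000 is an arbitrary sample size chosen only for this check
chk = pd.read_csv(fp_train_f, nrows=100000)
print(chk[cols].nunique())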
In [19]:
#----- generate encoder for label encoding -----#
#----- generate encoder for one-hot encoding -----#
'''
note: the label / one-hot transform is not applied here;
it is applied later, chunk by chunk, during model training
'''
## 1. label encoding
lb_enc = {}
for col in cols:
    # add 'other' as a valid class for each feature
    col_index[col] = np.append(col_index[col], 'other')
for col in cols:
    lb_enc[col] = LabelEncoder()
    lb_enc[col].fit(col_index[col])
## store the label encoder
pickle.dump(lb_enc, open(fp_lb_enc, 'wb'))
## 2. one-hot encoding
oh_enc = OneHotEncoder(cols)
df_train_f = pd.read_csv(fp_train_f, index_col=None, chunksize=500000, iterator=True)
df_test_f = pd.read_csv(fp_test_f, index_col=None, chunksize=500000, iterator=True)
for chunk in df_train_f:
    oh_enc.fit(chunk)
for chunk in df_test_f:
    oh_enc.fit(chunk)
## store the one-hot encoder
pickle.dump(oh_enc, open(fp_oh_enc, 'wb'))
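## usage sketch (the dummyPy transform API is assumed here): the stored encoders
## are applied per chunk later, at model-training time
lb_enc_chk = pickle.load(open(fp_lb_enc, 'rb'))
oh_enc_chk = pickle.load(open(fp_oh_enc, 'rb'))
print(lb_enc_chk['app_category'].classes_[:5])   # label-encoder classes for one feature
sample = pd.read_csv(fp_train_f, nrows=5)
print(oh_enc_chk.transform(sample).shape)        # one-hot expanded frame (assumed return type)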
In [22]:
#----- construct the training set (random sub-sampling) -----#
n = sum(1 for line in open(fp_train_f)) - 1  # total number of rows in the train data (about 46M)
s = 2000000  # desired train set size (2M)
## row 0 is the header, so it is never included in the skip list
skip = sorted(random.sample(range(1, n + 1), n - s))
df_train = pd.read_csv(fp_train_f, skiprows = skip)
df_train.columns = cols_train
## store the sub-sampled train set as .csv
df_train.to_csv(fp_sub_train_f, index=False)
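## optional check (sketch): size and click-through rate of the sub-sample
print(len(df_train), df_train['click'].mean())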
print(' - end - ')
In [33]:
for col in cols:
    print(col, lb_enc[col].classes_)
In [ ]: