In [1]:
"""
data preparation for model-based task:
    
    1. extract the data with selected features;
    2. set the rare categorical values to 'other';
    3. fit a label encoder and a one-hot encoder for new data set
"""

##==================== Package ====================##
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from dummyPy import OneHotEncoder
import random

import pickle  # to store temporary variable

In [7]:
##==================== File-Path (fp) ====================##
## raw data (for read)
fp_train = "../../Datasets/ctr/train.csv"
fp_test  = "../../Datasets/ctr/test.csv"

## subsample training set
fp_sub_train_f = "../../Datasets/ctr/sub_train_f.csv"

## pickled dict of per-feature value counts (built in the counting cells below)
fp_col_counts = "../../Datasets/ctr/col_counts"

## data after selecting features (LR_fun needed)
## and setting rare categories' value to 'other' (feature filtering)
fp_train_f = "../../Datasets/ctr/train_f.csv"
fp_test_f  = "../../Datasets/ctr/test_f.csv"

## storing encoder for labeling / one-hot encoding task
fp_lb_enc = "../../Datasets/ctr/lb_enc"
fp_oh_enc = "../../Datasets/ctr/oh_enc"

In [8]:
##==================== pre-Processing ====================##
## a small set of simple raw features is selected for the dataset
'''features used
    C1:           int,     1001, 1002, ...
    banner_pos:   int,     0,1,2,3,...
    site_domain:  object,  large set of object variables
    site_id:      object,  large set of object variables
    site_category:object,  large set of object variables
    app_id:       object,  large set of object variables
    app_category: object,  small set of object variables
    device_type:  int,     0,1,2,3,4
    device_conn_type:int,  0,1,2,3
    C14:          int,     small set of int variables
    C15:          int,     ...
    C16:          int,     ...
'''
## selected feature names
cols = [
    'C1',
    'banner_pos',
    'site_domain',
    'site_id',
    'site_category',
    'app_id',
    'app_category',
    'device_type',
    'device_conn_type',
    'C14',
    'C15',
    'C16',
]

## full column lists: the train set carries the click label, the test set does not
cols_train = ['id', 'click'] + cols
cols_test  = ['id'] + cols

In [9]:
## data reading
## df_train_ini: tiny 10-row sample, used only to initialise the
##               per-feature value_counts dict in the next cell
df_train_ini = pd.read_csv(fp_train, nrows = 10)
## chunked iterators over the full train / test files (1M rows per chunk)
df_train_org = pd.read_csv(fp_train, chunksize = 1000000, iterator = True)
df_test_org  = pd.read_csv(fp_test,  chunksize = 1000000, iterator = True)

In [10]:
#----- counting features' categories numbers -----#
## 1.initialise the per-feature category counters from the 10-row sample
cols_counts = {col: df_train_ini[col].value_counts() for col in cols}

In [11]:
cols_counts


Out[11]:
{'C1': 1005    9
 1002    1
 Name: C1, dtype: int64, 'C14': 15706    2
 15704    2
 20632    1
 20362    1
 16920    1
 21689    1
 15707    1
 18993    1
 Name: C14, dtype: int64, 'C15': 320    10
 Name: C15, dtype: int64, 'C16': 50    10
 Name: C16, dtype: int64, 'app_category': 07d7df22    10
 Name: app_category, dtype: int64, 'app_id': ecad2386    10
 Name: app_id, dtype: int64, 'banner_pos': 0    8
 1    2
 Name: banner_pos, dtype: int64, 'device_conn_type': 0    8
 2    2
 Name: device_conn_type, dtype: int64, 'device_type': 1    9
 0    1
 Name: device_type, dtype: int64, 'site_category': 28905ebd    5
 f028772b    3
 50e219e0    1
 0569f928    1
 Name: site_category, dtype: int64, 'site_domain': f3845767    5
 7e091613    1
 c4e18dd6    1
 25d4cfcd    1
 bb1ef334    1
 9166c161    1
 Name: site_domain, dtype: int64, 'site_id': 1fbe01fe    5
 8fda644b    1
 fe8cc448    1
 d6137915    1
 e151e245    1
 84c7ba46    1
 Name: site_id, dtype: int64}

In [12]:
## 2.counting through the train-set
## Collect each chunk's value_counts in a list and concatenate ONCE at the
## end: Series.append was deprecated (removed in pandas 2.0) and repeated
## appends are quadratic in the number of chunks.
chunk_counts = {col: [cols_counts[col]] for col in cols}
for chunk in df_train_org:
    for col in cols:
        chunk_counts[col].append(chunk[col].value_counts())

## 3.counting through the test-set
for chunk in df_test_org:
    for col in cols:
        chunk_counts[col].append(chunk[col].value_counts())

## 4.merge the duplicated index entries in the counting vectors
for col in cols:
    merged = pd.concat(chunk_counts[col])
    # sum the counts that share a category value, then sort descending
    cols_counts[col] = (merged.groupby(merged.index).sum()
                              .sort_values(ascending=False))

## 5.store the value counts
pickle.dump(cols_counts, open(fp_col_counts, 'wb'))

## 6.show the distribution of value_counts for each feature
fig = plt.figure(1)
for i, col in enumerate(cols):
    ax = fig.add_subplot(4, 3, i + 1)
    # .values replaces the removed Series.get_values() API
    ax.fill_between(np.arange(len(cols_counts[col])), cols_counts[col].values)
    # ax.set_title(col)
plt.show()



In [18]:
#----- set rare to 'other' -----#
# cols_counts = pickle.load(open(fp_col_counts, 'rb'))

## keep at most k most-frequent categories per feature;
## every other (rare) category is mapped to the single value 'other'
k = 99
col_index = {}
for col in cols:
    # .iloc makes the top-k slice explicitly positional: plain [0:k] is
    # ambiguous (label vs position) on the integer-indexed count Series
    col_index[col] = cols_counts[col].iloc[0:k].index

df_train_org = pd.read_csv(fp_train, dtype = {'id': str}, chunksize = 1000000, iterator = True)
df_test_org  = pd.read_csv(fp_test,  dtype = {'id': str}, chunksize = 1000000, iterator = True)

def _filter_rare_and_write(reader, out_path, out_cols):
    """Map rare categories to 'other' chunk-wise and stream the result to CSV.

    reader:   chunked pd.read_csv iterator over the raw file
    out_path: destination csv path (truncated on the first chunk)
    out_cols: column order to write
    """
    first = True  # header (and file truncation) only for the first chunk
    for chunk in reader:
        df = chunk.copy()
        for col in cols:
            df[col] = df[col].astype('object')
            # assign all the rare variables as 'other'
            df.loc[~df[col].isin(col_index[col]), col] = 'other'
        # 'w' on the first chunk: the original always appended ('a'), so
        # re-running the cell silently duplicated every row in the output
        with open(out_path, 'w' if first else 'a') as f:
            df.to_csv(f, columns = out_cols, header = first, index = False)
        first = False

## train set, then test set (identical processing, different columns)
_filter_rare_and_write(df_train_org, fp_train_f, cols_train)
_filter_rare_and_write(df_test_org,  fp_test_f,  cols_test)

In [19]:
#----- generate encoder for label encoding -----#
#----- generate encoder for one-hot encoding -----#
'''
notes: the encoders are only FITTED here; the actual label / one-hot
       transforms are applied later, inside the chunked model-training loop
'''
## 1.label encoding: extend each feature's category index with the
##   'other' bucket, then fit one LabelEncoder per feature
lb_enc = {}
for col in cols:
    col_index[col] = np.append(col_index[col], 'other')
    lb_enc[col] = LabelEncoder().fit(col_index[col])

## persist the fitted label encoders
pickle.dump(lb_enc, open(fp_lb_enc, 'wb'))

## 2.one-hot encoding: fit incrementally over train and test chunks
oh_enc = OneHotEncoder(cols)

df_train_f = pd.read_csv(fp_train_f, index_col=None, chunksize=500000, iterator=True)
df_test_f  = pd.read_csv(fp_test_f, index_col=None, chunksize=500000, iterator=True)

for reader in (df_train_f, df_test_f):
    for chunk in reader:
        oh_enc.fit(chunk)

## persist the fitted one-hot encoder
pickle.dump(oh_enc, open(fp_oh_enc, 'wb'))

In [22]:
#----- construct of original train set (sub-sampling randomly) -----#
## fixed seed so the subsample is reproducible across runs
## (the original drew a fresh, untracked sample every time)
random.seed(0)

n = sum(1 for line in open(fp_train_f)) - 1  # total size of train data (about 46M)
s = 2000000  # desired train set size (2M)

## rows to skip: everything except s randomly kept data rows.
## The 0-indexed header row is never in the skip list.
## max(n - s, 0) guards against a file smaller than s, where
## random.sample(..., n - s) would raise ValueError.
skip = sorted(random.sample(range(1, n + 1), max(n - s, 0)))
df_train = pd.read_csv(fp_train_f, skiprows = skip)
df_train.columns = cols_train  # enforce canonical column names/order

## store the sub-sampling train set as .csv
df_train.to_csv(fp_sub_train_f, index=False)

print(' - end - ')


 - end - 

In [33]:
## sanity check: show the classes each fitted LabelEncoder learned
for col in cols:
    # print() function form keeps this cell Python 3 compatible;
    # the Python 2 `print col, ...` statement is a SyntaxError under Python 3
    # (the rest of the notebook already uses the function form, e.g. print(' - end - '))
    print(col, lb_enc[col].classes_)


C1 ['1001' '1002' '1005' '1007' '1008' '1010' '1012' 'other']
banner_pos ['0' '1' '2' '3' '4' '5' '7' 'other']
site_domain ['08facbad' '0a4015b2' '0d3cb7be' '0dde25ec' '16a36ef3' '17d996e6'
 '1b32ed33' '1e0acfb4' '1e334bd3' '2296b5b3' '250469f8' '272c5d9d'
 '27e3c518' '28f93029' '2c5a2165' '2d235ae0' '30518255' '3f2f3819'
 '422b8953' '449497bc' '4c26e9ba' '510bd839' '5449fde7' '58a89a43'
 '5b626596' '5c9ae867' '5c9c05b7' '5dddf09e' '61eac15d' '64778742'
 '6b560cc1' '6b59f079' '6bdbd889' '6dbd48f0' '6e882918' '7256c623'
 '759d1c56' '75f9ddc3' '7687a86e' '77c8e77d' '7804dea6' '7d05db75'
 '7e091613' '7e1e7bdf' '7e3be6ba' '8025317b' '863fa89d' '8a429b45'
 '91cdcccd' '945bfb4b' '948ff336' '964a3bd1' '968765cd' '9690165f'
 '98572c79' '98acf46c' '98e6755b' '9b421927' '9b851bd8' '9cf7de2f'
 '9d54950b' '9eaf88fa' '9f448720' 'a10eb148' 'a17bde68' 'a434fa42'
 'a9bba545' 'ac5abf20' 'ace5b8fd' 'b12b9f85' 'b4598159' 'b9c4ab81'
 'bb1ef334' 'bd6d812f' 'bea33b9a' 'c1aa3c04' 'c4342784' 'c4e18dd6'
 'c7ca3108' 'c7dcd9d1' 'c8223ee3' 'cc962a1f' 'ce307e01' 'd00b8121'
 'd262cf1e' 'd2f72222' 'd733bbc3' 'd7e2f29b' 'db11867b' 'de0f0f82'
 'e16ceb4b' 'e2a5dc06' 'f3845767' 'f3ca2e42' 'f415c8a8' 'f6ebf28e'
 'f7570339' 'fba7221d' 'fd13bc73' 'other']
site_id ['0a742914' '0c2fe9d6' '0eb72673' '12fb4121' '16c73019' '178a7b89'
 '17caea14' '17d1b03f' '1a5c1d83' '1b171d6d' '1fbe01fe' '222acd54'
 '2328ee8e' '235ba823' '26fa1946' '29229f8e' '2a68aa20' '2b1ddb24'
 '2fdff0f2' '3281baa7' '38217daf' '3dc2d6de' '43d6df75' '4bf5bbe2'
 '4e7614cf' '5114c672' '517b8671' '543a539e' '57ef2c87' '57fe1b20'
 '5b08c53b' '5b4d2eda' '5b787406' '5bcf81a2' '5ee41ff2' '61a8c644'
 '6256f5b4' '6399eda6' '6c5b482c' '6ec06dbd' '6fcff501' '7294ea0f'
 '75f43c5a' '763a42b5' '7697ed3e' '78d60190' '791b5571' '7c6576be'
 '7dd19f44' '801dfefb' '83a0ad1a' '84c7ba46' '856e6d3f' '85f751fd'
 '88154ade' '887a4754' '89a490f5' '92c7cbe7' '93eaba74' '9a977531'
 '9e8cf15d' '9e8e8d09' '9ee9dd00' 'a7853007' 'ac696ed4' 'b7e9786d'
 'b8eae5f9' 'b99a2c43' 'ba9aee1b' 'bb4524e7' 'c1fcc9a1' 'c54454a2'
 'c63170c5' 'cbb01868' 'ce3307ec' 'cf3a1767' 'd1a51189' 'd6137915'
 'd7f3460b' 'd8bb8687' 'd9750ee7' 'dcc019de' 'df7971d4' 'e023ba3e'
 'e151e245' 'e3c09f3a' 'e4d8dd7b' 'e5c60a05' 'e7a92cc6' 'e8f79e60'
 'e9ca7d40' 'ec47f32a' 'ee8b8550' 'f0ee33c5' 'f61eaaae' 'f6cba1bd'
 'f9c69707' 'faf9c1c2' 'fec6e632' 'other']
site_category ['0569f928' '110ab22d' '28905ebd' '335d28a8' '3e814130' '42a36e14'
 '50e219e0' '5378d028' '6432c423' '70fb0e29' '72722551' '74073276'
 '75fa27f6' '76b2941d' '8fd0aea4' '9ccfa2ea' 'a72a0145' 'a818d37a'
 'bcf865d9' 'c0dd3be3' 'c706e647' 'da34532e' 'dedf689d' 'e787de0e'
 'f028772b' 'f66779e6' 'other']
app_id ['03528b27' '03a08c3f' '04f2be5f' '088b6a7b' '08a53ae7' '090d3a47'
 '0acbeaa3' '13684a79' '1779deee' '197b4f7f' '1d64ced5' '1dc72b4d'
 '255a58c8' '27550a3c' '2815f500' '28d3bd59' '2d869bee' '2f6efcf2'
 '33291962' '3692fd30' '396df801' '39947756' '3bfa19cb' '3c4b944d'
 '3e2bf98d' '3f2a6cbb' '442cfede' '45aff1a2' '495d447f' '4b08f369'
 '4e02fbd3' '51cedd4e' '53de0284' '54c5d545' '5adb10d9' '5d74d2a4'
 '5e3f096f' '66f5e02e' '680e6a92' '685d1c4c' '6fc85e22' '6fef5db2'
 '73206397' '7358e05e' '75076517' '7c7508aa' '7e7baafa' '85938df3'
 '8b89048f' '8c0dcd5a' '8dbc921a' '92f5800b' '93d786c6' '95b5e741'
 '98fed791' '9c13b419' '9f41cdad' 'a0a45985' 'a0fc55e5' 'a2b190d4'
 'a37bf1e4' 'a4869716' 'a4db51b9' 'a5184c22' 'a607e6a7' 'a97b17d0'
 'bdf46af7' 'be7c618d' 'c105f675' 'c3f39675' 'c51f82bc' 'c776ba2a'
 'c7f29df3' 'c8e3e3c1' 'ce183bbd' 'cf0327f9' 'd292c32f' 'd2bb6502'
 'd36838b1' 'd44c074c' 'd644e4e7' 'd8784af5' 'da90aa44' 'de97da65'
 'dfdd226c' 'e2a1ca37' 'e2fcccd2' 'e5f1bafe' 'e69a444c' 'e96773f0'
 'e9739828' 'ecad2386' 'f0d41ff1' 'f53417e1' 'f757bb8f' 'f888bf4c'
 'fb7c70a3' 'febd1138' 'ffc6ffd0' 'other']
app_category ['07d7df22' '09481d60' '0bfbc358' '0d82db25' '0f2161f8' '0f9a328c'
 '18b1e0be' '2281a340' '2fc4f2aa' '4681bb9d' '4b7ade46' '4ce2e9fc'
 '52de74cf' '5326cf99' '6fea3693' '7113d72a' '71af18ce' '75d80bbe'
 '79f0b860' '86c1a5a3' '879c24eb' '8ded1f7a' '8df2e842' 'a3c42688'
 'a7fd01ec' 'a86a3e89' 'bd41f328' 'bf8ac856' 'cba0e20d' 'cef3e649'
 'd1327cf5' 'dc97ec06' 'ef03ae90' 'f395a87f' 'f95efa07' 'fc6fa53d' 'other']
device_type ['0' '1' '2' '4' '5' 'other']
device_conn_type ['0' '2' '3' '5' 'other']
C14 ['15699' '15701' '15702' '15703' '15704' '15705' '15706' '15707' '15708'
 '16208' '16615' '16688' '16920' '17014' '17016' '17017' '17037' '17239'
 '17262' '17264' '17614' '17653' '17654' '17747' '17753' '17875' '17877'
 '17893' '17894' '19015' '19016' '19251' '19665' '19743' '19771' '19772'
 '19950' '20093' '20108' '20251' '20277' '20312' '20345' '20346' '20352'
 '20362' '20366' '20508' '20632' '20633' '20634' '21153' '21189' '21191'
 '21611' '21647' '21665' '21767' '21768' '21769' '21770' '21789' '21790'
 '21882' '21893' '22104' '22254' '22257' '22261' '22267' '22268' '22288'
 '22676' '22680' '22681' '22682' '22683' '22813' '23137' '23143' '23144'
 '23160' '23161' '23221' '23222' '23224' '23438' '23441' '23454' '23626'
 '23642' '23722' '23723' '23804' '23866' '4687' '6563' '8330' '9478'
 'other']
C15 ['1024' '120' '216' '300' '320' '480' '728' '768' 'other']
C16 ['1024' '20' '250' '320' '36' '480' '50' '768' '90' 'other']

In [ ]: