Machine Learning Notes: Feature Encoding I


Author: Diego Marinho de Oliveira (GitHub)

1. Introduction

This notebook is a study to observe the potential of feature encoding. Although a considerable number of studies claim the benefits of feature encoding, this notebook tries to verify it empirically. Also, those studies usually don't show how to do it, so this notebook gives Data Scientists and Machine Learning Practitioners a chance to see some smart tricks on data. Last but not least, the whole notebook was made in Julia! This is an awesome language that is winning me over every day! You can see more notebooks like this one through my GitHub page. Everyone is coding in Julia now.


In [274]:
using MLBase
using DataFrames
using RDatasets
using Lasso
using GLM
using GLMNet
using LIBSVM
using DecisionTree
using XGBoost
using NeuralNets
using Gadfly

2. Method Definitions

Encode Data Method


In [7]:
function encode_data(data, features)
    
    # Build one label encoder per categorical feature and replace the
    # original string values with their integer labels.
    encoded_data = copy(data)
    encoders     = Dict([f => labelmap(data[:, f]) for f in features])
    
    for f in features
        encoded_data[:, f] = labelencode(encoders[f], encoded_data[:, f])
    end
    
    return encoded_data
end


Out[7]:
encode_data (generic function with 1 method)
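
For intuition, here is a small usage sketch of encode_data (the toy DataFrame below is made up and this cell is not part of the original run): each distinct categorical value is mapped to an integer label through MLBase's labelmap/labelencode, while the remaining columns are left untouched.

toy         = DataFrame(Country=["US", "UK", "US", "BR"], Sales=[1.0, 2.0, 3.0, 4.0])
encoded_toy = encode_data(toy, [:Country])
# encoded_toy[:Country] now holds one integer label per distinct country,
# e.g. something like [1, 2, 1, 3]; encoded_toy[:Sales] is unchanged.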

Train, Predict and Eval Methods


In [317]:
abstract Model
type LM <: Model end
type SVM <: Model end
type ANN <: Model end
type DecisionTrees <: Model end
type RandomForest <: Model end
type XGB <: Model end

train(::Type{LM}, X, y)            = glmnet(X, y)
train(::Type{SVM}, X, y)           = 0   # placeholder, SVM is not used in the experiments below
train(::Type{ANN}, X, y)           = 0   # placeholder, ANN is not used in the experiments below
train(::Type{DecisionTrees}, X, y) = build_tree(y, X)
train(::Type{RandomForest}, X, y)  = build_forest(y, X, 4, 100, 5, 0.7)
train(::Type{XGB}, X, y)           = xgboost(X, 10, label=y, eta=.3, max_depth=4, silent=1)

predict(::Type{LM}, model, X)            = (yhat=GLMNet.predict(model, X); return [mean(yhat[i,:]) for i=1:size(yhat,1)])
predict(::Type{SVM}, model, X)           = 0   # placeholder, SVM is not used in the experiments below
predict(::Type{ANN}, model, X)           = 0   # placeholder, ANN is not used in the experiments below
predict(::Type{DecisionTrees}, model, X) = apply_tree(model, X)
predict(::Type{RandomForest}, model, X)  = apply_forest(model, X)
predict(::Type{XGB}, model, X)           = XGBoost.predict(model, X)

rmse(Y, YHat) = sqrt(sum((Y - YHat) .^ 2)/length(Y))
accuracy(Y, YHat) = sum(Array{Int8}(Y .== YHat))/length(Y)


Out[317]:
accuracy (generic function with 1 method)

Evaluate All Models


In [232]:
function analyze_models(X, y, Xval, metric, model_list)
    
    # Note: predictions are made on Xval, so y must be the labels that
    # correspond to Xval (in this notebook the training set is reused for evaluation).
    results = []
    
    for model_name in model_list
        model  = train(model_name, X, y)
        yhat   = predict(model_name, model, Xval)
        result = metric(y, yhat)
        push!(results, result)
    end
    
    dataframe = DataFrame(Model=model_list, RMSE=results)
    sort!(dataframe, cols=:RMSE)
    
    return dataframe
end


Out[232]:
analyze_models (generic function with 2 methods)

Binarize All Features


In [267]:
function binarize_feature!(data, feature)
    # Assumes the feature was already label-encoded to the integers 1..n, so the
    # encoded value can be used directly as a column index. Sorting the unique
    # values keeps the generated column names aligned with those indices.
    feature_values = sort(collect(Set(data[:, feature])))
    unique_values  = [ symbol(feature, "=", value) for value in feature_values ]
    nrows, ncols   = size(data, 1), length(unique_values)
    rows           = fill(0, nrows, ncols)
    for i=1:nrows
        rows[i, data[i, feature]] = 1
    end
    
    for j=1:ncols
        data[unique_values[j]] = rows[:, j]
    end
    
    return data
end

function binarize_features!(data, feature_list)
    
    for feature in feature_list
        binarize_feature!(data, feature)
    end
    
    return data
end


Out[267]:
binarize_features! (generic function with 1 method)

Categorize Continuous/Discrete Features


In [303]:
function generate_evaluation_function_list(interval_list)
    
    length(interval_list) == 0 && error("Empty list.")
    length(interval_list) == 1 && error("The list needs at least 2 elements.")
    
    eval_list = [x -> x <= interval_list[1]]
    
    for i=1:length(interval_list)-1
        push!(eval_list, x -> interval_list[i] < x <= interval_list[i+1])
    end
    
    push!(eval_list, x -> x > interval_list[end])
    
    return eval_list
    
end

function categorize_feature!(data, feature, evaluate_function_list)
    
    # Add one 0/1 indicator column per interval defined by evaluate_function_list.
    nrows, ncols = size(data, 1), length(evaluate_function_list)
    new_features = [ symbol(feature, "=", i) for i=1:ncols]
    rows         = fill(0, nrows, ncols)
    
    for i=1:nrows, j=1:ncols
        rows[i, j] = evaluate_function_list[j](data[i, feature]) ? 1 : 0
    end
    
    for j=1:ncols
        data[new_features[j]] = rows[:, j]
    end
    
    return data
end


Out[303]:
categorize_feature! (generic function with 1 method)
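
As a quick illustration of the binning logic (a hypothetical snippet, not part of the original run): generate_evaluation_function_list builds one indicator function per interval, which categorize_feature! then turns into 0/1 columns.

eval_list_demo = generate_evaluation_function_list([1, 5])
# Three indicator functions are produced:
# eval_list_demo[1](0.5)  # true, covers x <= 1
# eval_list_demo[2](3.0)  # true, covers 1 < x <= 5
# eval_list_demo[3](9.0)  # true, covers x > 5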

In [9]:
RDatasets.packages()


Out[9]:
     Package        Title
1    COUNT          Functions, data and code for count data.
2    Ecdat          Data sets for econometrics
3    HSAUR          A Handbook of Statistical Analyses Using R (1st Edition)
4    HistData       Data sets from the history of statistics and data visualization
5    ISLR           Data for An Introduction to Statistical Learning with Applications in R
6    KMsurv         Data sets from Klein and Moeschberger (1997), Survival Analysis
7    MASS           Support Functions and Datasets for Venables and Ripley's MASS
8    SASmixed       Data sets from "SAS System for Mixed Models"
9    Zelig          Everyone's Statistical Software
10   adehabitatLT   Analysis of Animal Movements
11   boot           Bootstrap Functions (Originally by Angelo Canty for S)
12   car            Companion to Applied Regression
13   cluster        Cluster Analysis Extended Rousseeuw et al.
14   datasets       The R Datasets Package
15   gap            Genetic analysis package
16   ggplot2        An Implementation of the Grammar of Graphics
17   lattice        Lattice Graphics
18   lme4           Linear mixed-effects models using Eigen and S4
19   mgcv           Mixed GAM Computation Vehicle with GCV/AIC/REML smoothness estimation
20   mlmRev         Examples from Multilevel Modelling Software Review
21   nlreg          Higher Order Inference for Nonlinear Heteroscedastic Models
22   plm            Linear Models for Panel Data
23   plyr           Tools for splitting, applying and combining data
24   pscl           Political Science Computational Laboratory, Stanford University
25   psych          Procedures for Psychological, Psychometric, and Personality Research
26   quantreg       Quantile Regression
27   reshape2       Flexibly Reshape Data: A Reboot of the Reshape Package.
28   robustbase     Basic Robust Statistics
29   rpart          Recursive Partitioning and Regression Trees
30   sandwich       Robust Covariance Matrix Estimators
⋮    ⋮              ⋮

In [10]:
RDatasets.datasets("HSAUR")


Out[10]:
     Package   Dataset           Title                                                                   Rows   Columns
1    HSAUR     BCG               BCG Vaccine Data                                                        13     7
2    HSAUR     BtheB             Beat the Blues Data                                                     100    8
3    HSAUR     CYGOB1            CYG OB1 Star Cluster Data                                               47     2
4    HSAUR     Forbes2000        The Forbes 2000 Ranking of the World's Biggest Companies (Year 2004)    2000   8
5    HSAUR     GHQ               General Health Questionnaire                                            22     4
6    HSAUR     Lanza             Prevention of Gastointestinal Damages                                   198    3
7    HSAUR     agefat            Total Body Composision Data                                             25     3
8    HSAUR     aspirin           Aspirin Data                                                            7      5
9    HSAUR     birthdeathrates   Birth and Death Rates Data                                              69     3
10   HSAUR     bladdercancer     Bladder Cancer Data                                                     31     3
11   HSAUR     clouds            Cloud Seeding Data                                                      24     7
12   HSAUR     epilepsy          Epilepsy Data                                                           236    7
13   HSAUR     foster            Foster Feeding Experiment                                               61     3
14   HSAUR     heptathlon        Olympic Heptathlon Seoul 1988                                           25     9
15   HSAUR     mastectomy        Survival Times after Mastectomy of Breast Cancer Patients               44     3
16   HSAUR     meteo             Meteorological Measurements for 11 Years                                11     6
17   HSAUR     orallesions       Oral Lesions in Rural India                                             8      4
18   HSAUR     phosphate         Phosphate Level Data                                                    33     9
19   HSAUR     pistonrings       Piston Rings Failures                                                   4      4
20   HSAUR     planets           Exoplanets Data                                                         101    3
21   HSAUR     plasma            Blood Screening Data                                                    32     4
22   HSAUR     polyps            Familial Andenomatous Polyposis                                         20     3
23   HSAUR     polyps3           Familial Andenomatous Polyposis                                         22     5
24   HSAUR     pottery           Romano-British Pottery Data                                             45     9
25   HSAUR     rearrests         Rearrests of Juvenile Felons                                            2      3
26   HSAUR     respiratory       Respiratory Illness Data                                                555    8
27   HSAUR     roomwidth         Students Estimates of Lecture Room Width                                113    2
28   HSAUR     schizophrenia     Age of Onset of Schizophrenia Data                                      251    2
29   HSAUR     schizophrenia2    Schizophrenia Data                                                      220    5
30   HSAUR     schooldays        Days not Spent at School                                                154    5
⋮    ⋮         ⋮                 ⋮                                                                       ⋮      ⋮

In [11]:
head(dataset("HSAUR", "Forbes2000"))


Out[11]:
     Rank   Name                  Country          Category               Sales    Profits   Assets    MarketValue
1    1      Citigroup             United States    Banking                94.71    17.85     1264.03   255.3
2    2      General Electric      United States    Conglomerates          134.19   15.59     626.93    328.54
3    3      American Intl Group   United States    Insurance              76.66    6.46      647.66    194.87
4    4      ExxonMobil            United States    Oil & gas operations   222.88   20.96     166.99    277.02
5    5      BP                    United Kingdom   Oil & gas operations   232.57   10.27     177.57    173.54
6    6      Bank of America       United States    Banking                49.01    10.81     736.45    117.55

3. Analyzing Forbes2000 Dataset (I)

3.1. Load Data


In [12]:
data = dataset("HSAUR", "Forbes2000");

3.2. Define Features and Output


In [13]:
features     = [:Country, :Category, :Sales, :Profits, :Assets]
cat_features = [:Country, :Category]
output        = :MarketValue;

In [186]:
model_list = [LM, DecisionTrees, RandomForest, XGB];

3.3. Data Sample


In [14]:
head(data)


Out[14]:
     Rank   Name                  Country          Category               Sales    Profits   Assets    MarketValue
1    1      Citigroup             United States    Banking                94.71    17.85     1264.03   255.3
2    2      General Electric      United States    Conglomerates          134.19   15.59     626.93    328.54
3    3      American Intl Group   United States    Insurance              76.66    6.46      647.66    194.87
4    4      ExxonMobil            United States    Oil & gas operations   222.88   20.96     166.99    277.02
5    5      BP                    United Kingdom   Oil & gas operations   232.57   10.27     177.57    173.54
6    6      Bank of America       United States    Banking                49.01    10.81     736.45    117.55

Round 1

3.4. Data Encode: Round 1 (Shallow Encoded)


In [135]:
encoded_data = encode_data(data[:, vcat(features, output)], cat_features)
# Impute NaN values in column 4 (:Profits) with the median of the non-NaN values.
encoded_data[isnan(encoded_data[:, 4]), 4] = median(encoded_data[!isnan(encoded_data[:, 4]), 4]);

3.5. Train And Evaluate: Round 1


In [264]:
X, y = Array{Float64, 2}(encoded_data[:, features]), Array{Float64, 1}(encoded_data[output]);

In [265]:
analyze_models(X, y, X, rmse, model_list)


Out[265]:
     Model           RMSE
1    DecisionTrees   4.297163057277828
2    RandomForest    6.896041041590683
3    XGB             6.993713117201217
4    LM              16.92750603781559

Round 2

3.6. Data Encode: Round 2 (Categorical Binarized Encoded)


In [309]:
encoded_data2 = binarize_features!(copy(encoded_data[:, vcat(features, output)]), [:Country, :Category]);

In [314]:
new_features = setdiff(names(encoded_data2), [:Country, :Category]);

3.7. Train And Evaluate: Round 2


In [315]:
X, y = Array{Float64, 2}(encoded_data2[:, new_features]), Array{Float64, 1}(encoded_data2[output]);

In [316]:
analyze_models(X, y, X, rmse, model_list)


Out[316]:
     Model           RMSE
1    DecisionTrees   0.9389780832550524
2    XGB             1.5828800058092913
3    LM              6.870691606186913
4    RandomForest    16.22804927254093

Round 3

3.8. Data Encode: Round 3 (Fully Encoded)


In [333]:
# Sales
eval_list = generate_evaluation_function_list([1, 2, 5, 10, 20, 30, 60, 90, 120, 150])
encoded_data3 = categorize_feature!(copy(encoded_data2), :Sales, eval_list)

# Profits
eval_list = generate_evaluation_function_list([-5, -2, 0, 2, 5])
categorize_feature!(encoded_data3, :Profits, eval_list)

# Assets
eval_list = generate_evaluation_function_list([1, 25, 50])
categorize_feature!(encoded_data3, :Assets, eval_list)

new_features = setdiff(names(encoded_data3), [:Sales, :Profits, :Assets]);

X, y = Array{Float64, 2}(encoded_data3[:, new_features]), Array{Float64, 1}(encoded_data3[output])
analyze_models(X, y, X, rmse, model_list)


Out[333]:
     Model           RMSE
1    DecisionTrees   0.9389780832550524
2    XGB             1.5828800058092913
3    LM              6.870691606186913
4    RandomForest    14.639966653736243

Results


In [350]:
rounds       = [fill(1, 4); fill(2, 4); fill(3, 4)]
methods_name = vcat(fill(["Decision Trees", "Random Forest", "XGB", "LM"], 3)...)
rmses        = [4.297163057277828, 6.896041041590683, 6.993713117201217, 16.92750603781559, 
                0.9389780832550524, 16.22804927254093, 1.5828800058092913, 6.870691606186913, 
                0.9389780832550524, 14.639966653736243, 1.5828800058092913, 6.870691606186913]

results = DataFrame(Round=rounds, Method=methods_name, RMSE=rmses)
set_default_plot_size(1000px, 300px)
plot(results, x=:Method, color=:Round, y=:RMSE, Geom.line)


Out[350]:
[Gadfly line plot: RMSE (y axis) per Method (x axis: Decision Trees, Random Forest, XGB, LM), colored by Round (1-3).]

Preliminary Conclusion: Using feature encoding produces better results; in general the RMSE improves by more than a factor of two. For some reason Random Forest did not benefit from the encoders, but all the other methods did. LM also did not improve as much as the tree-based models, which suggests the problem may not be a linear one. Surprisingly, the tree-based models, which already choose their own split points, still benefited from the manual feature segmentation.
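
Every round above evaluates on the same data that was used for training (Xval == X). A natural follow-up, sketched below purely as an illustration (it was not run in this notebook, and the split variables are hypothetical), would be to hold out part of the companies and measure RMSE on that unseen split, reusing the train/predict/rmse methods defined earlier.

# Hypothetical hold-out check: split the fully encoded data 80/20 and
# report the validation RMSE of each model.
n         = size(encoded_data3, 1)
perm      = randperm(n)
cut       = round(Int, 0.8 * n)
train_idx = perm[1:cut]
val_idx   = perm[cut+1:end]

Xtr = Array{Float64, 2}(encoded_data3[train_idx, new_features])
ytr = Array{Float64, 1}(encoded_data3[train_idx, output])
Xva = Array{Float64, 2}(encoded_data3[val_idx, new_features])
yva = Array{Float64, 1}(encoded_data3[val_idx, output])

for model_name in model_list
    model = train(model_name, Xtr, ytr)
    yhat  = predict(model_name, model, Xva)
    println(model_name, " validation RMSE: ", rmse(yva, yhat))
end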