# Machine Learning Notes: Feature Encoding I

Author: Diego Marinho de Oliveira (GitHub)

## 1. Introduction

This notebook is a study to observe the potential of feature encoding. Althought there are a considerable amount of studies that claims the benefit of feature encoding, this notebook try to verify it empirically. Also, the studies don't show how to do it, and this notebook give a chance to Data Scientists and Machine Learning Practioners see some smart tricks on data. At least but not less, all the notebook was made on Julia! This is an awesome language that are conquering me every day! You can see more notebooks like this one throught my GitHub page. Every one is now coding in Julia.

In [274]:
using MLBase
using DataFrames
using RDatasets
using Lasso
using GLM
using GLMNet
using LIBSVM
using DecisionTree
using XGBoost
using NeuralNets

## 2. Method Definitions

#### Encode Data Method

In [7]:
function encode_data(data, features)

encoded_data = copy(data)
encoders     = Dict([f => labelmap(data[:, f]) for f in features])

for f in features
encoded_data[:, f] = labelencode(encoders[f], encoded_data[:, f])
end

return encoded_data
end

Out[7]:
encode_data (generic function with 1 method)

#### Train, Predict and Eval Methods

In [317]:
abstract Model
type LM <: Model end
type SVM <: Model end
type ANN <: Model end
type DecisionTrees <: Model end
type RandomForest <: Model end
type XGB <: Model end

train(::Type{LM}, X, y)            = glmnet(X, y)
train(::Type{SVM}, X, y)           = 0
train(::Type{ANN}, X, y)           = 0
train(::Type{DecisionTrees}, X, y) = build_tree(y, X)
train(::Type{RandomForest}, X, y)  = build_forest(y, X, 4, 100, 5, 0.7)
train(::Type{XGB}, X, y)           = xgboost(X, 10, label=y, eta=.3, max_depth=4, silent=1)

predict(::Type{LM}, model, X)            = (yhat=GLMNet.predict(model, X); return [mean(yhat[i,:]) for i=1:size(yhat,1)])
predict(::Type{SVM}, model, X)           = 0
predict(::Type{ANN}, model, X)           = 0
predict(::Type{DecisionTrees}, model, X) = apply_tree(model, X)
predict(::Type{RandomForest}, model, X)  = apply_forest(model, X)
predict(::Type{XGB}, model, X)           = XGBoost.predict(model, X)

rmse(Y, YHat) = sqrt(sum((Y - YHat) .^ 2)/length(Y))
accuracy(Y, YHat) = sum(Array{Int8}(Y .== YHat))/length(Y)

Out[317]:
accuracy (generic function with 1 method)

#### Evaluate All Models

In [232]:
function analyze_models(X, y, Xval, metric, model_list)

results = []

for model_name in model_list
model  = train(model_name, X, y)
yhat   = predict(model_name, model, Xval)
result = metric(y, yhat)
push!(results, result)
end

dataframe = DataFrame(Model=model_list, RMSE=results)
sort!(dataframe, cols=:RMSE)

return dataframe
end

Out[232]:
analyze_models (generic function with 2 methods)

#### Binarize All Features

In [267]:
function binarize_feature!(data, feature)
unique_values = [ symbol(feature, "=", value) for value in Set(data[:, feature]) ]
nrows, ncols  = size(data, 1), length(unique_values)
rows          = fill(0, nrows, ncols)
for i=1:nrows
rows[i, data[i, feature]] = 1
end

for j=1:ncols
data[unique_values[j]] = rows[:, j]
end

return data
end

function binarize_features!(data, feature_list)

for feature in feature_list
binarize_feature!(data, feature)
end

return data
end

Out[267]:
binarize_features! (generic function with 1 method)

#### Categorize Continous/Discrete Features

In [303]:
function generate_evaluation_function_list(interval_list)

length(interval_list) == 0 && error("Empty list.")
length(interval_list) == 1 && error("List need to be size >= 2.")

eval_list = [x -> x <= interval_list[1]]

for i=1:length(interval_list)-1
push!(eval_list, x -> interval_list[i] < x <= interval_list[i+1])
end

push!(eval_list, x -> x > interval_list[end])

return eval_list

end

function categorize_feature!(data, feature, evaluate_function_list)

nrows, ncols = size(data, 1), length(evaluate_function_list)
new_features = [ symbol(feature, "=", i) for i=1:ncols]
rows         = fill(0, nrows, ncols)

for i=1:nrows, j=1:ncols
rows[i, j] = evaluate_function_list[j](data[i, feature])? 1 : 0
end

for j=1:ncols
data[new_features[j]] = rows[:, j]
end

return data
end

Out[303]:
categorize_feature! (generic function with 1 method)

In [9]:
RDatasets.packages()

Out[9]:
PackageTitle
1COUNTFunctions, data and code for count data.
2EcdatData sets for econometrics
3HSAURA Handbook of Statistical Analyses Using R (1st Edition)
4HistDataData sets from the history of statistics and data visualization
5ISLRData for An Introduction to Statistical Learning with Applications in R
6KMsurvData sets from Klein and Moeschberger (1997), Survival Analysis
7MASSSupport Functions and Datasets for Venables and Ripley's MASS
8SASmixedData sets from "SAS System for Mixed Models"
9ZeligEveryone's Statistical Software
11bootBootstrap Functions (Originally by Angelo Canty for S)
12carCompanion to Applied Regression
13clusterCluster Analysis Extended Rousseeuw et al.
14datasetsThe R Datasets Package
15gapGenetic analysis package
16ggplot2An Implementation of the Grammar of Graphics
17latticeLattice Graphics
18lme4Linear mixed-effects models using Eigen and S4
19mgcvMixed GAM Computation Vehicle with GCV/AIC/REML smoothness estimation
20mlmRevExamples from Multilevel Modelling Software Review
21nlregHigher Order Inference for Nonlinear Heteroscedastic Models
22plmLinear Models for Panel Data
23plyrTools for splitting, applying and combining data
24psclPolitical Science Computational Laboratory, Stanford University
25psychProcedures for Psychological, Psychometric, and Personality Research
26quantregQuantile Regression
27reshape2Flexibly Reshape Data: A Reboot of the Reshape Package.
28robustbaseBasic Robust Statistics
29rpartRecursive Partitioning and Regression Trees
30sandwichRobust Covariance Matrix Estimators
&vellip&vellip&vellip

In [10]:
RDatasets.datasets("HSAUR")

Out[10]:
PackageDatasetTitleRowsColumns
1HSAURBCGBCG Vaccine Data137
2HSAURBtheBBeat the Blues Data1008
3HSAURCYGOB1CYG OB1 Star Cluster Data472
4HSAURForbes2000The Forbes 2000 Ranking of the World's Biggest Companies (Year 2004)20008
5HSAURGHQGeneral Health Questionnaire224
6HSAURLanzaPrevention of Gastointestinal Damages1983
7HSAURagefatTotal Body Composision Data253
8HSAURaspirinAspirin Data75
9HSAURbirthdeathratesBirth and Death Rates Data693
11HSAURcloudsCloud Seeding Data247
12HSAURepilepsyEpilepsy Data2367
13HSAURfosterFoster Feeding Experiment613
14HSAURheptathlonOlympic Heptathlon Seoul 1988259
15HSAURmastectomySurvival Times after Mastectomy of Breast Cancer Patients443
16HSAURmeteoMeteorological Measurements for 11 Years116
17HSAURorallesionsOral Lesions in Rural India84
18HSAURphosphatePhosphate Level Data339
19HSAURpistonringsPiston Rings Failures44
20HSAURplanetsExoplanets Data1013
21HSAURplasmaBlood Screening Data324
22HSAURpolypsFamilial Andenomatous Polyposis203
23HSAURpolyps3Familial Andenomatous Polyposis225
24HSAURpotteryRomano-British Pottery Data459
25HSAURrearrestsRearrests of Juvenile Felons23
26HSAURrespiratoryRespiratory Illness Data5558
27HSAURroomwidthStudents Estimates of Lecture Room Width1132
28HSAURschizophreniaAge of Onset of Schizophrenia Data2512
29HSAURschizophrenia2Schizophrenia Data2205
30HSAURschooldaysDays not Spent at School1545
&vellip&vellip&vellip&vellip&vellip&vellip

In [11]:

Out[11]:
RankNameCountryCategorySalesProfitsAssetsMarketValue
11CitigroupUnited StatesBanking94.7117.851264.03255.3
22General ElectricUnited StatesConglomerates134.1915.59626.93328.54
33American Intl GroupUnited StatesInsurance76.666.46647.66194.87
44ExxonMobilUnited StatesOil & gas operations222.8820.96166.99277.02
55BPUnited KingdomOil & gas operations232.5710.27177.57173.54
66Bank of AmericaUnited StatesBanking49.0110.81736.45117.55

## 3. Analyzing Forbes2000 Dataset (I)

In [12]:
data = dataset("HSAUR", "Forbes2000");

### 3.2. Define Features and Output

In [13]:
features     = [:Country, :Category, :Sales, :Profits, :Assets]
cat_features = [:Country, :Category]
output        = :MarketValue;

In [186]:
model_list = [LM, DecisionTrees, RandomForest, XGB];

### 3.3. Data Sample

In [14]:

Out[14]:
RankNameCountryCategorySalesProfitsAssetsMarketValue
11CitigroupUnited StatesBanking94.7117.851264.03255.3
22General ElectricUnited StatesConglomerates134.1915.59626.93328.54
33American Intl GroupUnited StatesInsurance76.666.46647.66194.87
44ExxonMobilUnited StatesOil & gas operations222.8820.96166.99277.02
55BPUnited KingdomOil & gas operations232.5710.27177.57173.54
66Bank of AmericaUnited StatesBanking49.0110.81736.45117.55

## Round 1

### 3.4. Data Encode: Round 1 (Shallow Encoded)

In [135]:
encoded_data = encode_data(data[:, vcat(features, output)], cat_features)
encoded_data[isnan(encoded_data[:, 4]), 4] = median(encoded_data[!isnan(encoded_data[:, 4]), 4]);

### 3.5. Train And Evaluate: Round 1

In [264]:
X, y = Array{Float64, 2}(encoded_data[:, features]), Array{Float64, 1}(encoded_data[output]);

In [265]:
analyze_models(X, y, X, rmse, model_list)

Out[265]:
ModelRMSE
1DecisionTrees4.297163057277828
2RandomForest6.896041041590683
3XGB6.993713117201217
4LM16.92750603781559

## Round 2

### 3.6. Data Encode: Round 2 (Categorical Binarized Encoded)

In [309]:
encoded_data2 = binarize_features!(copy(encoded_data[:, vcat(features, output)]), [:Country, :Category]);

In [314]:
new_features = setdiff(names(encoded_data2), [:Country, :Category]);

### 3.7. Train And Evaluate: Round 2

In [315]:
X, y = Array{Float64, 2}(encoded_data2[:, new_features]), Array{Float64, 1}(encoded_data2[output]);

In [316]:
analyze_models(X, y, X, rmse, model_list)

Out[316]:
ModelRMSE
1DecisionTrees0.9389780832550524
2XGB1.5828800058092913
3LM6.870691606186913
4RandomForest16.22804927254093

## Round 3

### 3.8. Data Encode: Round 3 (Fully Encoded)

In [333]:
# Sales
eval_list = generate_evaluation_function_list([1, 2, 5, 10, 20, 30, 60, 90, 120, 150])
encoded_data3 = categorize_feature!(copy(encoded_data2), :Sales, eval_list)

eval_list = generate_evaluation_function_list([-5, -2, 0, 2, 5])
categorize_feature!(encoded_data3, :Profits, eval_list)

eval_list = generate_evaluation_function_list([1, 25, 50])
categorize_feature!(encoded_data3, :Assets, eval_list)

new_features = setdiff(names(encoded_data3), [:Sales, :Profits, :Assets]);

X, y = Array{Float64, 2}(encoded_data3[:, new_features]), Array{Float64, 1}(encoded_data3[output])
analyze_models(X, y, X, rmse, model_list)

Out[333]:
ModelRMSE
1DecisionTrees0.9389780832550524
2XGB1.5828800058092913
3LM6.870691606186913
4RandomForest14.639966653736243

## Results

In [350]:
rounds       = [fill(1, 4); fill(2, 4); fill(3, 4)]
methods_name = vcat(fill(["Decision Trees", "Random Forest", "XGB", "LM"], 3)...)
rmses        = [4.297163057277828, 6.896041041590683, 6.993713117201217, 16.92750603781559,
0.9389780832550524, 16.22804927254093, 1.5828800058092913, 6.870691606186913,
0.9389780832550524, 14.639966653736243, 1.5828800058092913, 6.870691606186913]

results = DataFrame(Round=rounds, Method=methods_name, RMSE=rmses)
set_default_plot_size(1000px, 300px)
plot(results, x=:Method, color=:Round, y=:RMSE, Geom.line)

Out[350]:

Preliminar Conclusion: Using encode produce better results. The results improves in general more than the double. For some reason Random Forest was not benefitiated by the encoders. But all other methods were. Also LM not improved much as trees based models. This indicates that the problem could not be a linear one. Also surprisly, trees that has its splits made by gini index were benefited by manual feature segmentation.