This notebook is a study of the potential of feature encoding. Although there is a considerable number of studies claiming the benefits of feature encoding, this notebook tries to verify them empirically. Moreover, those studies rarely show how to do it, so this notebook gives Data Scientists and Machine Learning practitioners a chance to see some smart tricks on data. Last but not least, the whole notebook was written in Julia! This is an awesome language that is winning me over more every day. You can find more notebooks like this one on my GitHub page. Everyone is coding in Julia these days!
In [2]:
using MLBase
using DataFrames
using RDatasets
using Lasso
using GLM
using GLMNet
using LIBSVM
using DecisionTree
using XGBoost
using NeuralNets
using Gadfly
In [3]:
function encode_data(data, features)
    # Label-encode each categorical feature using MLBase's labelmap/labelencode,
    # replacing string categories with integer codes.
    encoded_data = copy(data)
    encoders = Dict([f => labelmap(data[:, f]) for f in features])
    for f in features
        encoded_data[:, f] = labelencode(encoders[f], encoded_data[:, f])
    end
    return encoded_data
end
Out[3]:
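To make the encoder's effect concrete, here is a minimal sketch on a hypothetical toy DataFrame (not part of the analysis below); the categorical column is replaced by integer codes while the numeric column is left untouched:
toy = DataFrame(Country=["USA", "Brazil", "USA"], Sales=[1.0, 2.0, 3.0])
encode_data(toy, [:Country])
# :Country becomes integer codes, e.g. [1, 2, 1]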
In [4]:
abstract Model
type LM <: Model end
type SVM <: Model end
type ANN <: Model end
type DecisionTrees <: Model end
type RandomForest <: Model end
type XGB <: Model end
train(::Type{LM}, X, y) = glmnet(X, y)
train(::Type{SVM}, X, y) = 0
train(::Type{ANN}, X, y) = 0
train(::Type{DecisionTrees}, X, y) = build_tree(y, X)
train(::Type{RandomForest}, X, y) = build_forest(y, X, 4, 100, 5, 0.7)
train(::Type{XGB}, X, y) = xgboost(X, 10, label=y, eta=.3, max_depth=4, silent=1)
# GLMNet returns one prediction per lambda on the regularization path; average them per row.
predict(::Type{LM}, model, X) = (yhat = GLMNet.predict(model, X); return [mean(yhat[i, :]) for i=1:size(yhat, 1)])
predict(::Type{SVM}, model, X) = 0
predict(::Type{ANN}, model, X) = 0
predict(::Type{DecisionTrees}, model, X) = apply_tree(model, X)
predict(::Type{RandomForest}, model, X) = apply_forest(model, X)
predict(::Type{XGB}, model, X) = XGBoost.predict(model, X)
rmse(Y, YHat) = sqrt(sum((Y - YHat) .^ 2)/length(Y))
accuracy(Y, YHat) = sum(Array{Int8}(Y .== YHat))/length(Y)
Out[4]:
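These empty types exist only to drive dispatch: every learner is trained and queried through the same two generic functions, which is what lets analyze_models below loop over heterogeneous models (SVM and ANN are placeholders returning 0 and are not used). A minimal sketch of the calling pattern, assuming the X and y built further down:
model = train(RandomForest, X, y)
yhat = predict(RandomForest, model, X)
rmse(y, yhat)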
In [5]:
function analyze_models(X, y, Xval, metric, model_list)
    results = []
    for model_name in model_list
        model = train(model_name, X, y)
        yhat = predict(model_name, model, Xval)
        # NOTE: the metric is computed against the training targets y,
        # so with Xval == X (as used below) this is in-sample error.
        result = metric(y, yhat)
        push!(results, result)
    end
    dataframe = DataFrame(Model=model_list, RMSE=results)
    sort!(dataframe, cols=:RMSE)
    return dataframe
end
Out[5]:
In [6]:
function binarize_feature!(data, feature)
    # One-hot encode an integer-coded feature: one indicator column per distinct code.
    # sort(unique(...)) keeps the new column names aligned with the integer codes.
    unique_values = [ symbol(feature, "=", value) for value in sort(unique(data[:, feature])) ]
    nrows, ncols = size(data, 1), length(unique_values)
    rows = fill(0, nrows, ncols)
    for i=1:nrows
        rows[i, data[i, feature]] = 1
    end
    for j=1:ncols
        data[unique_values[j]] = rows[:, j]
    end
    return data
end

function binarize_features!(data, feature_list)
    for feature in feature_list
        binarize_feature!(data, feature)
    end
    return data
end
Out[6]:
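As a quick illustration, here is a hypothetical, already label-encoded column being one-hot encoded; binarize_feature! appends one indicator column per distinct code:
toy = DataFrame(Country=[1, 2, 1, 3], MarketValue=[4.2, 1.0, 3.3, 0.5])
binarize_feature!(toy, :Country)
# toy gains indicator columns :Country=1, :Country=2 and :Country=3,
# e.g. toy[symbol("Country=1")] == [1, 0, 1, 0]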
In [7]:
function generate_evaluation_function_list(interval_list)
    length(interval_list) == 0 && error("Empty list.")
    length(interval_list) == 1 && error("List needs to have at least 2 elements.")
    # Build one predicate per bucket: (-Inf, i1], (i1, i2], ..., (iN, Inf)
    eval_list = [x -> x <= interval_list[1]]
    for i=1:length(interval_list)-1
        push!(eval_list, x -> interval_list[i] < x <= interval_list[i+1])
    end
    push!(eval_list, x -> x > interval_list[end])
    return eval_list
end

function categorize_feature!(data, feature, evaluate_function_list)
    # Turn a numeric feature into one indicator column per bucket.
    nrows, ncols = size(data, 1), length(evaluate_function_list)
    new_features = [ symbol(feature, "=", i) for i=1:ncols ]
    rows = fill(0, nrows, ncols)
    for i=1:nrows, j=1:ncols
        rows[i, j] = evaluate_function_list[j](data[i, feature]) ? 1 : 0
    end
    for j=1:ncols
        data[new_features[j]] = rows[:, j]
    end
    return data
end
Out[7]:
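As a worked example, generate_evaluation_function_list([1, 5]) yields three predicates covering (-Inf, 1], (1, 5] and (5, Inf), and categorize_feature! then adds one indicator column per bucket. A hypothetical toy run:
eval_list = generate_evaluation_function_list([1, 5])
toy = DataFrame(Sales=[0.3, 2.0, 7.5])
categorize_feature!(toy, :Sales, eval_list)
# adds columns :Sales=1, :Sales=2, :Sales=3 with rows [1 0 0], [0 1 0], [0 0 1]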
In [9]:
RDatasets.packages()
Out[9]:
In [10]:
RDatasets.datasets("HSAUR")
Out[10]:
In [11]:
head(dataset("HSAUR", "Forbes2000"))
Out[11]:
In [12]:
data = dataset("HSAUR", "Forbes2000");
In [13]:
features = [:Country, :Category, :Sales, :Profits, :Assets]
cat_features = [:Country, :Category]
output = :MarketValue;
In [186]:
model_list = [LM, DecisionTrees, RandomForest, XGB];
In [14]:
head(data)
Out[14]:
In [135]:
encoded_data = encode_data(data[:, vcat(features, output)], cat_features)
# Impute missing (NaN) Profits values (column 4) with the median of the observed ones.
encoded_data[isnan(encoded_data[:, 4]), 4] = median(encoded_data[!isnan(encoded_data[:, 4]), 4]);
In [264]:
X, y = Array{Float64, 2}(encoded_data[:, features]), Array{Float64, 1}(encoded_data[output]);
In [265]:
analyze_models(X, y, X, rmse, model_list)
Out[265]:
In [309]:
encoded_data2 = binarize_features!(copy(encoded_data[:, vcat(features, output)]), [:Country, :Category]);
In [314]:
new_features = setdiff(names(encoded_data2), [:Country, :Category]);
In [315]:
X, y = Array{Float64, 2}(encoded_data2[:, new_features]), Array{Float64, 1}(encoded_data2[output]);
In [316]:
analyze_models(X, y, X, rmse, model_list)
Out[316]:
In [333]:
# Sales
eval_list = generate_evaluation_function_list([1, 2, 5, 10, 20, 30, 60, 90, 120, 150])
encoded_data3 = categorize_feature!(copy(encoded_data2), :Sales, eval_list)
# Profits
eval_list = generate_evaluation_function_list([-5, -2, 0, 2, 5])
categorize_feature!(encoded_data3, :Profits, eval_list)
# Assets
eval_list = generate_evaluation_function_list([1, 25, 50])
categorize_feature!(encoded_data3, :Assets, eval_list)
new_features = setdiff(names(encoded_data3), [:Sales, :Profits, :Assets]);
X, y = Array{Float64, 2}(encoded_data3[:, new_features]), Array{Float64, 1}(encoded_data3[output])
analyze_models(X, y, X, rmse, model_list)
Out[333]:
In [350]:
# RMSE values collected from the three analyze_models runs above (Rounds 1-3).
rounds = [fill(1, 4); fill(2, 4); fill(3, 4)]
methods_name = vcat(fill(["Decision Trees", "Random Forest", "XGB", "LM"], 3)...)
rmses = [4.297163057277828, 6.896041041590683, 6.993713117201217, 16.92750603781559,
         0.9389780832550524, 16.22804927254093, 1.5828800058092913, 6.870691606186913,
         0.9389780832550524, 14.639966653736243, 1.5828800058092913, 6.870691606186913]
results = DataFrame(Round=rounds, Method=methods_name, RMSE=rmses)
set_default_plot_size(1000px, 300px)
plot(results, x=:Method, color=:Round, y=:RMSE, Geom.line)
Out[350]:
Preliminary conclusion: using encoding produces better results; in general the error improves by more than a factor of two. For some reason Random Forest did not benefit from the encoders, but all the other methods did. LM also did not improve as much as the tree-based models, which indicates that the problem may not be a linear one. Surprisingly, the trees, whose splits are made by the Gini index, also benefited from manual feature segmentation.