Optimization Models for Stats Nerds

Data


In [64]:
using RDatasets
using Gadfly

hw = dataset("car", "Davis")
#hw = sub(hw, :(Height .>= 100)) # broken
hw = hw[hw[:Height] .>= 100, :]

set_default_plot_size(22cm, 12cm)
plot(hw, x=:Height, y=:Weight, color=:Sex, Guide.title("Height & Weight by Gender"))


Out[64]:
Height 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214 216 218 220 222 224 226 228 230 232 234 236 238 240 242 244 246 248 250 252 254 256 258 260 0 100 200 300 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 F M Sex Weight -200 -150 -100 -50 0 50 100 150 200 250 300 350 -150 -145 -140 -135 -130 -125 -120 -115 -110 -105 -100 -95 -90 -85 -80 -75 -70 -65 -60 -55 -50 -45 -40 -35 -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 205 210 215 220 225 230 235 240 245 250 255 260 265 270 275 280 285 290 295 300 -200 0 200 400 -150 -140 -130 -120 -110 -100 -90 -80 -70 -60 -50 -40 -30 -20 -10 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 260 270 280 290 300 Height & Weight by Gender

In [67]:
#males = sub(hw, :(Sex .== "M"))
males = hw[hw[:Sex] .== "M", :]
plot(males, x=:Height, y=:Weight, Guide.title("Heights & Weights of Males"))


Out[67]:
Height 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214 216 218 220 222 224 226 228 230 232 234 236 238 240 100 150 200 250 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 205 210 215 220 225 230 235 240 Weight -60 -40 -20 0 20 40 60 80 100 120 140 160 180 200 220 -40 -35 -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 -100 0 100 200 -40 -35 -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 Heights & Weights of Males

In [68]:
#females = subset(hw, :(Sex .== "F"))
females = hw[hw[:Sex] .== "F", :]
plot(females, x=:Height, y=:Weight, Guide.title("Heights & Weights of Females"))


Out[68]:
Height 90 100 110 120 130 140 150 160 170 180 190 200 210 220 230 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214 216 218 220 100 150 200 250 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 205 210 215 220 Weight -30 -20 -10 0 10 20 30 40 50 60 70 80 90 100 110 120 130 140 -20 -18 -16 -14 -12 -10 -8 -6 -4 -2 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96 98 100 102 104 106 108 110 112 114 116 118 120 122 124 126 128 130 -50 0 50 100 150 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 Heights & Weights of Females

Least Absolute Deviations (LAD) Regression


In [69]:
using JuMP
using GLPKMathProgInterface

function LAD(data)
    m = Model(solver=GLPKSolverLP())
    
    @defVar(m, x[1:3])
    @defVar(m, residuals[1:nrow(data)] >= 0)
    for (i, (height, weight)) in enumerate(zip(data[:Height], data[:Weight]))
        fitted = dot([1, height, height^2], x)

        # Linear way of representing residual >= |observed - fitted|
        @addConstraint(m, residuals[i] >= fitted - weight)
        @addConstraint(m, residuals[i] >= weight - fitted)
     end

    # Minimize the total sum of these residuals.
    @setObjective(m, Min, sum(residuals))
    solve(m)
    
    coefficients = getValue(x)[:]
    return coefficients, height -> dot([1, height, height^2], coefficients)
end

ladCoefficients, ladPredictor = LAD(males)
println("Coefficients = $ladCoefficients")
println(ladPredictor(175))


Coefficients = [-465.17662337665223,4.890909090909426,-0.010389610389611361]
72.55064935064951

In [70]:
ladFitted = DataFrame(
    Height = males[:Height],
    Weight = males[:Weight], 
    FittedWeight = map(ladPredictor, males[:Height])
)
plot(
    ladFitted, 
    layer(x=:Height, y=:FittedWeight, Geom.line), 
    layer(x=:Height, y=:Weight, Geom.point)
)


Out[70]:
Height 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214 216 218 220 222 224 226 228 230 232 234 236 238 240 100 150 200 250 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 205 210 215 220 225 230 235 240 FittedWeight -60 -40 -20 0 20 40 60 80 100 120 140 160 180 200 220 -40 -35 -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 -100 0 100 200 -40 -35 -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200

Epsilon-Insensitive LAD Regression


In [71]:
using JuMP
using GLPKMathProgInterface

function EpsilonLAD(data, epsilon)
    m = Model(solver=GLPKSolverLP())
    
    @defVar(m, x[1:3])
    @defVar(m, residuals[1:nrow(data)] >= 0)
    for (i, (height, weight)) in enumerate(zip(data[:Height], data[:Weight]))
        fitted = dot([1, height, height^2], x)

        # Linear way of representing residual >= |observed - fitted| - epsilon
        @addConstraint(m, residuals[i] >= fitted - weight - epsilon)
        @addConstraint(m, residuals[i] >= weight - fitted - epsilon)
     end

    # Minimize the total sum of these residuals.
    @setObjective(m, Min, sum(residuals))
    solve(m)
    
    coefficients = getValue(x)[:]
    return coefficients, height -> dot([1, height, height^2], coefficients)
end

epsLadCoefficients, epsLadPredictor = EpsilonLAD(males, 1.5)
println("Coefficients = $epsLadCoefficients")
println(epsLadPredictor(175))


Coefficients = [41.50000000000726,-0.8690476190477003,0.005952380952381179]
71.70833333333333

In [72]:
epsLadFitted = DataFrame(
    Height = males[:Height],
    Weight = males[:Weight], 
    LADWeight = map(ladPredictor, males[:Height]),
    EpsLADWeight = map(epsLadPredictor, males[:Height])
)
plot(
    epsLadFitted, 
    layer(x=:Height, y=:LADWeight, Theme(default_color=color("green")), Geom.line), 
    layer(x=:Height, y=:EpsLADWeight, Theme(default_color=color("violet")), Geom.line), 
    layer(x=:Height, y=:Weight, Geom.point)
)


Out[72]:
Height 110 120 130 140 150 160 170 180 190 200 210 220 230 240 250 120 122 124 126 128 130 132 134 136 138 140 142 144 146 148 150 152 154 156 158 160 162 164 166 168 170 172 174 176 178 180 182 184 186 188 190 192 194 196 198 200 202 204 206 208 210 212 214 216 218 220 222 224 226 228 230 232 234 236 238 240 100 150 200 250 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 205 210 215 220 225 230 235 240 LADWeight -60 -40 -20 0 20 40 60 80 100 120 140 160 180 200 220 -40 -35 -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200 -100 0 100 200 -40 -35 -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 105 110 115 120 125 130 135 140 145 150 155 160 165 170 175 180 185 190 195 200

In [73]:
plot([abs, x -> max(abs(x) - 1, 0)], -5, 5)


Out[73]:
x -20 -15 -10 -5 0 5 10 15 20 -15.0 -14.5 -14.0 -13.5 -13.0 -12.5 -12.0 -11.5 -11.0 -10.5 -10.0 -9.5 -9.0 -8.5 -8.0 -7.5 -7.0 -6.5 -6.0 -5.5 -5.0 -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 8.5 9.0 9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 -20 -10 0 10 20 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 f2 f1 Color f(x) -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 -5.0 -4.8 -4.6 -4.4 -4.2 -4.0 -3.8 -3.6 -3.4 -3.2 -3.0 -2.8 -2.6 -2.4 -2.2 -2.0 -1.8 -1.6 -1.4 -1.2 -1.0 -0.8 -0.6 -0.4 -0.2 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0 4.2 4.4 4.6 4.8 5.0 5.2 5.4 5.6 5.8 6.0 6.2 6.4 6.6 6.8 7.0 7.2 7.4 7.6 7.8 8.0 8.2 8.4 8.6 8.8 9.0 9.2 9.4 9.6 9.8 10.0 -5 0 5 10 -5.0 -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 8.5 9.0 9.5 10.0

In [74]:
plot([x -> x^2, x -> max(abs(x) - 1, 0)^2], -5, 5)


Out[74]:
x -20 -15 -10 -5 0 5 10 15 20 -15.0 -14.5 -14.0 -13.5 -13.0 -12.5 -12.0 -11.5 -11.0 -10.5 -10.0 -9.5 -9.0 -8.5 -8.0 -7.5 -7.0 -6.5 -6.0 -5.5 -5.0 -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5 8.0 8.5 9.0 9.5 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 -20 -10 0 10 20 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 f2 f1 Color f(x) -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 35 40 45 50 55 -25 -24 -23 -22 -21 -20 -19 -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 -25 0 25 50 -26 -24 -22 -20 -18 -16 -14 -12 -10 -8 -6 -4 -2 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50

In [54]: