In [1]:
import Base: convert, ==, +, one, /, typemin

bitstype 8 ThreeWay
#0xff is the sentinel value and is created with a 0-argument constructor
ThreeWay()        = reinterpret(ThreeWay, 0xff)
ThreeWay(x::Bool) = reinterpret(ThreeWay, x)

const true3  = ThreeWay(true) #by default, defined to be = convert(ThreeWay, true)
const false3 = ThreeWay(false)
const na3    = ThreeWay()

#We abuse typemin here to specify the sentinel values
is_na{T}(x::T) = x == typemin(T)

typemin(::Type{ThreeWay}) = na3
typemin{T<:Complex}(::Type{T}) = T(-NaN)

#If NA detected, return sentinel value
convert{T<:Real}(::Type{T}, x::ThreeWay) = ifelse(is_na(x), typemin(T),
    T(reinterpret(UInt8, x)))

one(x::Type{ThreeWay}) = true3
==(x::ThreeWay, y::Bool) = ifelse(x == na3, false, Bool(x) == y)
+(x::Union{Int,ThreeWay}, y::ThreeWay) = Int(x) + Int(y)
/(x::ThreeWay, y::Int) = Bool(x)/y


Out[1]:
/ (generic function with 49 methods)

In [2]:
extract(x,v) = x[ setdiff( [1:length(x);] ,v) ]
product(x) = Int32(reducedim(*,collect(x),1)[1])

function run(x,t)
    for i = 1:t-1  colMeans(x, true, 1)  end
    colMeans(x,true,1)
end

z(x::AbstractFloat) = 0.0
z(x::Complex) = complex(0.0,0.0)
z(x) = 0


Out[2]:
z (generic function with 3 methods)

In [3]:
## colMeans implements part of the functionality of the do_colsum R
## internal funtion.
## dims specifies how many dimensions we want to reduce
function colMeans(x, na_rm=true, dims=1)
    dn = size(x)
    id = [1:dims;]
    n  = product(dn[id])
    dn = extract(dn,id)
    pdn = product(dn)
    res = zeros(typeof(x[1]/1), pdn)
    init = z(x[1])
    sentinel = typemin(typeof(init))
    for j = 0:pdn-1
        sum = init
        cnt = 0
        off = (j * n)
        for i = 1:n
         v = x[i+off]
            if (!is_na(v))
                cnt += 1
                sum += v
            elseif na_rm
                sum = sentinel
            end
        end
        res[j+1] = is_na(sum) ?  sum :  sum/cnt
    end
    res
end


Out[3]:
colMeans (generic function with 3 methods)

In [4]:
l = 100000
x1 = Int32[1:l;]
x2 = Float32[1:l;]
x3 = fill(true3, l)
x4 = complex(x2)
d = 2
iter = 1000
using Base.Profile
for x in (x1, x2, x3, x4)
    info(eltype(x))
    x = reshape(x, l÷d, d)
    run(x,1)#Precompile
    @time run(x,iter)
    #Analyze code for type instabilities
   # @code_warntype colMeans(x,true,1)
    #Run profiler - using Base.Profile
  #  @profile colMeans(x,true,1)
    #Profile.print()
end


INFO: Int32
  0.087937 seconds (51.00 k allocations: 2.548 MB)
INFO: Float32
  0.141269 seconds (49.00 k allocations: 2.518 MB)
INFO: ThreeWay
  0.095119 seconds (51.00 k allocations: 2.548 MB)
INFO: Complex{Float32}
  0.165120 seconds (49.00 k allocations: 2.518 MB, 3.12% gc time)

In [ ]: