In [1]:
# load the JSeqArray package
using JSeqArray
Open an existing SeqArray file and display its structure. The dimensions of sample.id and variant.id tell you the total numbers of samples and variants, i.e., 1092 samples and 19,773 variants.
In [2]:
# get the file name of example data
fn = seqExample(:kg)
f = seqOpen(fn)
Out[2]:
Genotypic data and annotations are stored in an array-oriented manner, providing efficient data access using the Julia programming language. seqFilterSet()
and seqGetData()
can be used together to retrieve data for a selected set of samples from a defined genomic region. seqApply()
applies a user-defined function to array margins of genotypes and annotations.
In [3]:
# launches the built-in ClusterManager with 4 worker processes and return worker IDs
addprocs(4)
Out[3]:
In [4]:
seqFilterReset(f) # reset the filter
In [5]:
af = seqApply(f, "genotype", asis=:unlist) do geno::Array{UInt8,3}
N = size(geno, 3)
rv = Vector{Float64}(N)
for k in 1:N
sum = 0; n = 0
for g in geno[:,:,k]
if g != 0xFF
sum += g == 0 # 0 is the reference allele
n += 1
end
end
rv[k] = sum / n
end
return rv
end
Out[5]:
In [6]:
af = seqParallel(f) do gdsfile
v = seqApply(gdsfile, "genotype", asis=:unlist) do geno::Array{UInt8,3}
N = size(geno, 3)
rv = Vector{Float64}(N)
for k in 1:N
sum = 0; n = 0
for g in geno[:,:,k]
if g != 0xFF
sum += g == 0 # 0 is the reference allele
n += 1
end
end
rv[k] = sum / n
end
return rv
end
return v
end
Out[6]:
In [7]:
# initialize the covariance matrix
ss = Any[0.0]
seqApply(f, "#dosage", ss) do geno::Matrix{UInt8}, ss::Vector{Any}
# calculate allele frequencies
N = size(geno, 2); af = Vector{Float64}(N)
for i in 1:N
g = geno[:,i]; af[i] = mean(g[g .!= 0xFF])
end
af *= 0.5
# normalized by allele frequencies
g = Matrix{Float64}(size(geno))
for i in 1:N
g[:,i] = (geno[:,i] - 2*af[i]) / sqrt(af[i] * (1 - af[i]))
end
# correct missing genotypes
g[isnan(g)] = 0.0; g[geno .== 0xFF] = 0.0
# update the cov matrix
ss[1] += g * g'
end
# scaled
ss[1] *= size(ss[1], 1) / sum(diag(ss[1]))
ss[1]
Out[7]:
In [8]:
# initialize the covariance matrix
cov2 = seqParallel(f, combine=+) do gdsfile
ss = Any[0.0]
seqApply(gdsfile, "#dosage", ss) do geno::Matrix{UInt8}, ss::Vector{Any}
# calculate allele frequencies
N = size(geno, 2); af = Vector{Float64}(N)
for i in 1:N
g = geno[:,i]; af[i] = mean(g[g .!= 0xFF])
end
af *= 0.5
# normalized by allele frequencies
g = Matrix{Float64}(size(geno))
for i in 1:N
g[:,i] = (geno[:,i] - 2*af[i]) / sqrt(af[i] * (1 - af[i]))
end
# correct missing genotypes
g[isnan(g)] = 0.0; g[geno .== 0xFF] = 0.0
# update the cov matrix
ss[1] += g * transpose(g)
end
return ss[1]
end
cov2 *= size(cov2, 1) / sum(diag(cov2))
cov2
Out[8]:
In [9]:
# eigen-decomposition
(w, v) = eig(-cov2)
-w # eigenvalues
Out[9]:
In [10]:
v[:,1] # the first eigenvector
Out[10]: