`anhima.pca` - Principal components analysis



In [1]:

    
import numpy as np
np.random.seed(42)
import scipy.stats
import itertools
import random
random.seed(1)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.gt
%aimport anhima.pca
%aimport anhima.dist



In [2]:

    
# simulate non-uniform variant positions
n_variants = 1000
p = 0
pos = []
for i in range(n_variants):
    gap = int(np.abs(np.cos(i/100))*100)
    p += gap
    pos.append(p)
pos = np.array(pos)



In [3]:

    
# simulate genotypes with no relatedness
n_samples = 100
labels = [''.join(s) for s in itertools.product('ABCDEFGHIJ', repeat=2)]
ploidy = 2
af_dist = scipy.stats.beta(a=.4, b=.6)
p_missing = 0
genotypes = anhima.sim.simulate_biallelic_genotypes(n_variants, n_samples, af_dist, p_missing, ploidy)
gn = anhima.gt.as_n_alt(genotypes)



In [4]:

    
# simulate three sub-populations with relatedness
genotypes_subpop1 = anhima.sim.simulate_relatedness(genotypes[:, :n_samples//2], relatedness=.5, n_iter=1000)
genotypes_subpop2 = anhima.sim.simulate_relatedness(genotypes[:, n_samples//2:], relatedness=.5, n_iter=1000)
genotypes_subpop2a = anhima.sim.simulate_relatedness(genotypes_subpop2[:, n_samples//4:], relatedness=.5, n_iter=500)
genotypes_subpop2b = anhima.sim.simulate_relatedness(genotypes_subpop2[:, :n_samples//4], relatedness=.5, n_iter=500)
genotypes_related = np.concatenate([genotypes_subpop1, genotypes_subpop2a, genotypes_subpop2b], axis=1)
gnr = anhima.gt.as_n_alt(genotypes_related)



In [5]:

    
sample_colors = (['red']*50) + (['blue']*25) + (['green']*25)



In [6]:

    
# run PCA
p, t = anhima.pca.pca(gn)
pr, tr = anhima.pca.pca(gnr)



In [7]:

    
anhima.pca.plot_coords(p, t, labels=labels, colors=sample_colors, sizes=40);



In [8]:

    
anhima.pca.plot_coords(pr, tr, colors=sample_colors, sizes=40);



In [9]:

    
anhima.pca.plot_variance_explained(pr);



In [10]:

    
anhima.pca.plot_loadings(pr, pc=1, pos=pos);



In [11]:

anhima.pca - Principal components analysis

`anhima.pca` - Principal components analysis