In [1]:
import ipyrad.analysis as ipa
import numpy as np
import pandas as pd
import itertools
In [2]:
# get a sequence alignment from the HDF5
In [3]:
database = "/home/deren/Downloads/spp30.seqs.hdf5"
idx=0
wmin=0
wmax=100000
In [ ]:
parse_phystring
In [12]:
# init a popgen analysis object
pop = ipa.popgen(
name="ped",
workdir="analysis-popgen",
data="./pedicularis/clust85_min12_outfiles/clust85_min12.loci",
)
In [11]:
pop._fst()
In [ ]:
def _loci_to_arr(self):
"""
return a frequency array from a loci file for all loci with taxa from
taxdict and min coverage from mindict.
"""
#
loci = infile.read().strip().split("|\n")
## make the array (4 or 5) and a mask array to remove loci without cov
nloci = len(loci)
keep = np.zeros(nloci, dtype=np.bool_)
arr = np.zeros((nloci, 4, 300), dtype=np.float64)
## six rows b/c one for each p3, and for the fused p3 ancestor
if len(taxdict) == 5:
arr = np.zeros((nloci, 6, 300), dtype=np.float64)
## if not mindict, make one that requires 1 in each taxon
if isinstance(mindict, int):
mindict = {i: mindict for i in taxdict}
elif isinstance(mindict, dict):
mindict = {i: mindict[i] for i in taxdict}
else:
mindict = {i: 1 for i in taxdict}
## raise error if names are not 'p[int]'
allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
if any([i not in allowed_names for i in taxdict]):
raise IPyradError(\
"keys in taxdict must be named 'p1' through 'p4' or 'p5'")
## parse key names
keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])
outg = keys[-1]
## grab seqs just for the good guys
for loc in xrange(nloci):
## parse the locus
lines = loci[loc].split("\n")[:-1]
names = [i.split()[0] for i in lines]
seqs = np.array([list(i.split()[1]) for i in lines])
## check that names cover the taxdict (still need to check by site)
covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \
for tax in taxdict]
## keep locus
if all(covs):
keep[loc] = True
## get the refseq
refidx = np.where([i in taxdict[outg] for i in names])[0]
refseq = seqs[refidx].view(np.uint8)
ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])
## freq of ref in outgroup
iseq = _reffreq2(ancestral, refseq, GETCONS2)
arr[loc, -1, :iseq.shape[1]] = iseq
## enter 4-taxon freqs
if len(taxdict) == 4:
for tidx, key in enumerate(keys[:-1]):
## get idx of names in test tax
nidx = np.where([i in taxdict[key] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
## get freq of sidx
iseq = _reffreq2(ancestral, sidx, GETCONS2)
## fill it in
arr[loc, tidx, :iseq.shape[1]] = iseq
else:
## entere p5; and fill it in
iseq = _reffreq2(ancestral, refseq, GETCONS2)
arr[loc, -1, :iseq.shape[1]] = iseq
## enter p1
nidx = np.where([i in taxdict['p1'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 0, :iseq.shape[1]] = iseq
## enter p2
nidx = np.where([i in taxdict['p2'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 1, :iseq.shape[1]] = iseq
## enter p3 with p4 masked, and p4 with p3 masked
nidx = np.where([i in taxdict['p3'] for i in names])[0]
nidy = np.where([i in taxdict['p4'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
sidy = seqs[nidy].view(np.uint8)
xseq = _reffreq2(ancestral, sidx, GETCONS2)
yseq = _reffreq2(ancestral, sidy, GETCONS2)
mask3 = xseq != 0
mask4 = yseq != 0
xseq[mask4] = 0
yseq[mask3] = 0
arr[loc, 2, :xseq.shape[1]] = xseq
arr[loc, 3, :yseq.shape[1]] = yseq
## enter p34
nidx = nidx.tolist() + nidy.tolist()
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 4, :iseq.shape[1]] = iseq
## size-down array to the number of loci that have taxa for the test
arr = arr[keep, :, :]
## size-down sites to
arr = masknulls(arr)
return arr, keep
In [ ]:
In [309]:
pop.results
Out[309]:
In [527]:
popdict = {'i': [2, 3], 'j': [2, 4]}
mindict = {}
(mindict if mindict else {j:1 for j in popdict})
Out[527]:
In [528]:
testdata1 = np.array([
[0, 0, 0, 0, 0],
[1, 0, 0, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 1],
])
testdata2 = np.random.binomial(1, 0.01, (6, 100))
In [532]:
class Self:
def __init__(self, data, popdict={}, mindict={}, mapfile=False):
self.data = data
self.popdict = popdict
self.mindict = mindict
self._parsedata()
self._checkdicts()
self._filterdata()
def _parsedata(self):
"""
parse data as an ndarray, vcfile, or locifile and store
in ... arrays, one with all sites, and mapfile...
"""
pass
def _checkdicts(self):
#assert len(popdict) > 2, "must enter at least two populations"
self.mindict = (mindict if mindict else {j:1 for j in popdict})
def _filterdata(self):
pass
def fst_H(self, pop1idx, pop2idx):
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]
diff = [self.data[i] != self.data[j] for (i, j) in between]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]
return abs(1 - (a / b))
def theta_W(self):
"return Watternson's theta for each population"
df = pd.DataFrame(
data=np.zeros(len(self.popdict)),
columns=["theta"],
index=list(self.popdict.keys())
)
for row in self.popdict:
idat = self.data[self.popdict[row], :]
segregating = np.invert(np.all(idat == idat[0], axis=0)).sum()
denom = np.sum((1 / i for i in range(1, idat.shape[1] - 1)))
theta = segregating / denom
df.loc[row] = theta
return df
def pi(self, pop1idx, pop2idx):
pass
In [533]:
popdict = {'A': [0, 1], 'B': [2, 3, 4, 5]}
mindict = {'A': 1, 'B': 2}
self = Self(testdata2, popdict, mindict)
In [535]:
self.theta_W()
Out[535]:
In [537]:
print(self.data)
print("\nFst per SNP:", self.fst_H([0, 1], [2, 3, 4, 5]))
print("\nFst mean:", np.mean(self.fst_H([0, 1], [2, 3, 4, 5])))
print("\ntheta per individual:", self.theta_W([0, 1], [2, 3, 4, 5]))
print("\ntheta per population:", self.theta_W([0, 1], [2, 3, 4, 5]))
In [555]:
pd.DataFrame(
data=np.zeros((2, 2)),
index=['aaaaaa', 'bbbbbbbbbbb'],
)
Out[555]:
In [242]:
self.fst([0, 1], [2, 3, 4, 5])
Out[242]:
In [235]:
pop1idx = [0, 1]
pop2idx = [2, 3, 4, 5]
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
In [74]:
ilist = itertools.combinations(range(4), 2)
list(ilist)
Out[74]:
In [75]:
ilist = itertools.combinations(range(pop2.shape[0]), 2)
list(ilist)
Out[75]:
In [80]:
def diffs(pop):
ilist = itertools.combinations(range(pop.shape[0]), 2)
diff = [pop[i] == pop[j] for (i, j) in ilist]
sums = np.sum(diff, axis=0)
return sums/sums.shape[0]
In [231]:
def fst(pop1idx, pop2idx):
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]
return abs(1 - (a / b))
In [175]:
#fst([0, 1], [2, 3, 4, 5])
In [245]:
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
within = ii + jj
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)
In [246]:
ii + jj
Out[246]:
In [247]:
between
Out[247]:
In [244]:
self.data
Out[244]:
In [209]:
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]
a
Out[209]:
In [243]:
abs(1 - (a / b))
Out[243]:
In [210]:
diff = [self.data[i] != self.data[j] for (i, j) in between]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]
In [195]:
1 - (a / b)
Out[195]:
In [196]:
self.data
Out[196]:
In [171]:
between
Out[171]:
In [79]:
diffs(pop2) / diffs(popa)
Out[79]:
In [14]:
for i in range(pop1.shape[0]):
for j in range(pop1.shape[0]):
print(i,j)
In [123]:
def _fstH(self, pop1idx, pop2idx):
"Hudson's Fst estimator"
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
a = (pop1freq - pop2freq) ** 2
b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)
c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)
return pd.Series(a - b - c).round(5)
In [141]:
def _fstWC(self, pop1idx, pop2idx):
"Wier and Cockerham (1984) Fst estimator"
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)
nbar = pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.
s2a = (
(1. / ((popa.shape[0] - 1) * nbar))
* ((pop1.shape[0] * 1))
)
nc = (
(1. / (npops - 1.))
* (popa.shape[0]
- ((pop1.shape[0] ** 2 + pop2.shape[0] ** 2) / popa.shape[0])
)
)
#t1 =
#t2 =
return t1, t2
In [ ]:
def fstWC(self, pop1idx, pop2idx):
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)
# frequecy of allele A in the sample of size ni from population i
p = ...
# observed proportion of individuals heterozygous for allele A
hbar =
# the average sample size
nbar =
nc =
In [138]:
pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.
Out[138]:
In [140]:
pop1.shape[0] / 2. + pop2.shape[0] / 2.
Out[140]:
In [ ]:
npops= 2.0
nsamples = float(NpopA + NpopB)
n_bar= (NpopA / npops) + (NpopB / npops)
samplefreq = ( (popAcount+popBcount) / (NpopA + NpopB) )
pop1freq = popAcount / float(NpopA )
pop2freq = popBcount / float(NpopB )
Npop1 = NpopA
Npop2 = NpopB
S2A= (1/ ( (npops-1.0) * n_bar) ) *
( ( (Npop1)* ((pop1freq-samplefreq)**2) ) +
( (Npop2)*((pop2freq-samplefreq)**2) ) )
nc = 1.0/(npops-1.0) * ( (Npop1+Npop2) - (((Npop1**2)+(Npop2**2)) / (Npop1+Npop2)) )
T_1 = S2A -( ( 1/(n_bar-1) ) * ( (samplefreq * (1-samplefreq)) - ((npops-1)/npops)* S2A ) )
T_2 = (( (nc-1) / (n_bar-1) ) * samplefreq *(1-samplefreq) ) + (1.0 + (((npops-1)*(n_bar-nc)) / (n_bar-1))) * (S2A/npops)
In [119]:
pop1idx = [0, 1]
pop2idx = [2, 3, 4]
_fst(self, pop1idx, pop2idx)
Out[119]:
In [120]:
pop1idx = [0, 4]
pop2idx = [1, 2, 3]
_fst(self, pop1idx, pop2idx)
Out[120]:
In [22]:
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
print(pop1)
print(pop2)
In [104]:
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)
In [105]:
a = (pop1freq - pop2freq) ** 2
a
Out[105]:
In [106]:
b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)
b
Out[106]:
In [107]:
c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)
c
Out[107]:
In [108]:
pd.Series(a - b - c).round(3)
Out[108]:
In [98]:
pd.Series(a - b - c).round(3)
Out[98]:
In [114]:
d = (pop1freq * (1.0 - pop2freq)) + (pop1freq * (1.0 - pop2freq))
d
Out[114]:
In [151]:
(np.mean(popa, axis=0) - np.mean(pop2, axis=0)) / (np.mean(popa, axis=0))
Out[151]:
In [152]:
popa
Out[152]:
In [153]:
np.count_nonzero(pop2, axis=0)
Out[153]:
In [169]:
a = np.mean(popa, axis=0)
mask = a > 0.5
a[a > 0.5] = abs(a[a > 0.5] - 1)
a
Out[169]:
In [172]:
b = np.mean(pop2, axis=0)
mask = b > 0.5
b[b > 0.5] = abs(b[b > 0.5] - 1)
b
Out[172]:
In [171]:
(a - b) / a
Out[171]:
In [ ]: