In [1]:
%run ../../shared_setup.ipynb
In [2]:
# load PASS variants for all three crosses
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE,
variant_filter='FILTER_PASS')
In [3]:
for cross in CROSSES:
samples = callsets[cross]['calldata'].dtype.names
progeny = samples[2:]
progeny_clones = set([p.split('/')[0] for p in progeny])
print(cross, len(progeny_clones))
In [4]:
tbl_samples = (etl
.fromtsv(os.path.join(PUBLIC_DIR, 'samples.txt'))
.convert('coverage', lambda v: int(v[:-1]))
)
tbl_samples
Out[4]:
In [5]:
tbl_samples.valuecounts('cross')
Out[5]:
In [6]:
df_samples = tbl_samples.todataframe()
df_samples.groupby('cross').coverage.median()
Out[6]:
In [7]:
df_samples.groupby('cross').coverage.min()
Out[7]:
In [8]:
df_samples.groupby('cross').coverage.max()
Out[8]:
In [9]:
tbl_samples.aggregate('cross', [('median', 'coverage', lambda g: np.median(list(g))),
('min', 'coverage', min),
('max', 'coverage', max)])
Out[9]:
In [17]:
def count_variants(query):
def f(row):
callset = filter_variants(callsets[row.cross], query=query)
return callset['variants'].size
return f
tbl_variation = (etl
.wrap([['cross']] + [[cross] for cross in CROSSES])
.addfield('n_snps', count_variants('is_snp'))
.addfield('n_indels', count_variants('~is_snp'))
.addfield('n_snps_multiallelic', count_variants('is_snp & (num_alleles > 2)'))
.addfield('n_indels_multiallelic', count_variants('~is_snp & (num_alleles > 2)'))
.addfield('n_snps_multiallelic_gatk', count_variants('is_snp & (num_alleles > 2) & ((set == b"GATK") | (set == b"Intersection"))'))
.addfield('n_indels_multiallelic_gatk', count_variants('~is_snp & (num_alleles > 2) & ((set == b"GATK") | (set == b"Intersection"))'))
.addfield('n_snps_coding', count_variants('is_snp & (CDSAnnotationID != b".")'))
.addfield('n_snps_noncoding', count_variants('is_snp & (CDSAnnotationID == b".")'))
.addfield('n_indels_coding', count_variants('~is_snp & (CDSAnnotationID != b".")'))
.addfield('n_indels_noncoding', count_variants('~is_snp & (CDSAnnotationID == b".")'))
.addfield('ratio_snp_indel_coding', lambda row: row.n_snps_coding / row.n_indels_coding)
.addfield('ratio_snp_indel_noncoding', lambda row: row.n_snps_noncoding / row.n_indels_noncoding)
.melt(key='cross')
.recast(variablefield='cross', valuefield='value')
)
tbl_variation.displayall()
In [15]:
callsets['3d7_hb3']['variants']['set']
Out[15]: