In [1]:
%run ../../../shared_setup.ipynb
In [2]:
# Feature IDs of the genes used for the HB3 evaluation (per the variable name).
# Trailing notes are per-gene observations made while inspecting the data.
evaluation_genes_hb3 = [
    'PF3D7_0106300',
    'PF3D7_0207300',
    'PF3D7_0207400',
    'PF3D7_0207500',
    'PF3D7_0207600',
    'PF3D7_0207700',
    'PF3D7_0207800',
    'PF3D7_0208000',
    # Excluded for now:
    # 'PF3D7_0220800',  # exons only, need to manually edit bwamem alignment to handle introns properly
    'PF3D7_0304600',
    'PF3D7_0402300',  # expect some discordance between genbank and birren
    'PF3D7_0417200',  # looks like birren has a few errors
    'PF3D7_0424200',
    'PF3D7_0508000',
    'PF3D7_0620400',
    'PF3D7_0708400',
    'PF3D7_0709100',  # nice, has dense cluster of SNPs with good concordance
    'PF3D7_0709300',
    'PF3D7_0804800',  # genbank has errors?
    'PF3D7_0831600',  # exons only, nice as has dense cluster of SNPs with good concordance
    'PF3D7_0902800',
    'PF3D7_0905400',  # exons only
    'PF3D7_0929400',  # exons only
    'PF3D7_0935800',  # exons only
    'PF3D7_1115700',  # nice, dense SNP clusters
    'PF3D7_1133400',  # nice, lots of SNPs
    'PF3D7_1246100',
    'PF3D7_1323500',
    'PF3D7_1335000',
    'PF3D7_1335100',  # almost no agreement between genbank and birren, genbank looks wrong
    'PF3D7_1337200',
    'PF3D7_1434200',
    'PF3D7_1447900',
]
# Display the number of genes selected as the cell output.
len(evaluation_genes_hb3)
Out[2]:
In [27]:
# Directory containing the truth data used for the evaluation.
truth_dir = '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/truth'

# Path of the BAM file under the 'bwamem_intractg/alignment' sub-directory.
# NOTE(review): judging by the file name, this holds GenBank HB3 coding
# sequences aligned with bwa mem -- confirm against the pipeline that wrote it.
bam_fn = os.path.join(
    truth_dir,
    'bwamem_intractg',
    'alignment',
    'genbank_hb3_coding_sequences.bam',
)

# Open the alignment for reading in binary (BAM) mode. The handle is left open
# at module level because lookup_genbank_id() queries it later.
bam = pysam.AlignmentFile(bam_fn, mode='rb')
def lookup_genbank_id(gene_id):
    """Return the query name of the first alignment overlapping a gene.

    The gene is looked up in the module-level ``lkp_feature`` mapping and the
    module-level ``bam`` alignment file is queried over the gene's
    chromosome/start/stop region. The ``query_name`` of the first read
    yielded by ``bam.fetch`` is returned.

    NOTE(review): the function name implies the query name is the GenBank
    accession of the aligned sequence -- confirm against how the BAM was
    built. Only the first overlapping read is considered, so a neighbouring
    sequence that overlaps the region could be reported instead.

    Parameters
    ----------
    gene_id
        Key into ``lkp_feature`` identifying the gene.

    Returns
    -------
    The ``query_name`` attribute of the first overlapping read.

    Raises
    ------
    LookupError
        If no alignment overlaps the gene region. A missing ``gene_id``
        presumably surfaces as ``KeyError`` (itself a ``LookupError``),
        assuming ``lkp_feature`` is a dict-like mapping.
    """
    gene = lkp_feature[gene_id]
    overlapping = bam.fetch(gene.feature_chrom, gene.feature_start, gene.feature_stop)
    # Pass a default to next(): a bare StopIteration escaping from here would
    # either silently end an enclosing generator (older Pythons) or be turned
    # into an opaque RuntimeError (PEP 479) when this function is called from
    # a lazily evaluated table row function.
    read = next(overlapping, None)
    if read is None:
        raise LookupError(
            'no alignment overlaps gene {!r} ({}:{}-{})'.format(
                gene_id, gene.feature_chrom, gene.feature_start, gene.feature_stop
            )
        )
    return read.query_name
In [28]:
# Build the summary table of evaluation genes one transformation at a time.

# Keep only the rows whose feature_id is one of the evaluation genes.
tbl_eval = tbl_genes.selectin('feature_id', evaluation_genes_hb3)

# Show missing values as empty strings rather than None.
tbl_eval = tbl_eval.replaceall(None, '')

# Drop columns by position.
# NOTE(review): which fields sit at positions 1, 4, 5, 9 and 10 depends on the
# layout of tbl_genes, which is defined elsewhere -- confirm before editing.
tbl_eval = tbl_eval.cutout(1, 4, 5, 9, 10)

# Give the remaining columns human-readable headers.
tbl_eval = tbl_eval.rename({
    'feature_chrom': 'Chromosome',
    'feature_start': 'Start',
    'feature_stop': 'Stop',
    'feature_id': 'ID',
    'feature_name': 'Name',
    'feature_previous_id': 'Previous ID',
})

# Append a column computed per row from the (renamed) ID field.
tbl_eval = tbl_eval.addfield(
    'Genbank Accession',
    lambda row: lookup_genbank_id(row.ID),
)

tbl_eval.displayall(index_header=False)
In [ ]: