In [1]:
%run ../../../shared_setup.ipynb


The Cython magic has been move to the Cython package, hence 
`%load_ext cythonmagic` is deprecated; Please use `%load_ext Cython` instead.

Though, because I am nice, I'll still try to load it for you this time.

In [2]:
# Gene IDs making up the HB3 evaluation set, listed in ascending ID order and
# grouped below by chromosome. Trailing notes are the author's observations
# from comparing the genbank and birren sequences for each gene.
evaluation_genes_hb3 = [
    # --- chromosome 1 ---
    'PF3D7_0106300',

    # --- chromosome 2 ---
    'PF3D7_0207300',
    'PF3D7_0207400',
    'PF3D7_0207500',
    'PF3D7_0207600',
    'PF3D7_0207700',
    'PF3D7_0207800',
    'PF3D7_0208000',
    # 'PF3D7_0220800' is left out for now: exons only, need to manually edit
    # bwamem alignment to handle introns properly

    # --- chromosome 3 ---
    'PF3D7_0304600',

    # --- chromosome 4 ---
    'PF3D7_0402300',  # expect some discordance between genbank and birren
    'PF3D7_0417200',  # looks like birren has a few errors
    'PF3D7_0424200',

    # --- chromosome 5 ---
    'PF3D7_0508000',

    # --- chromosome 6 ---
    'PF3D7_0620400',

    # --- chromosome 7 ---
    'PF3D7_0708400',
    'PF3D7_0709100',  # nice, has dense cluster of SNPs with good concordance
    'PF3D7_0709300',

    # --- chromosome 8 ---
    'PF3D7_0804800',  # genbank has errors?
    'PF3D7_0831600',  # exons only, nice as has dense cluster of SNPs with good concordance

    # --- chromosome 9 ---
    'PF3D7_0902800',
    'PF3D7_0905400',  # exons only
    'PF3D7_0929400',  # exons only
    'PF3D7_0935800',  # exons only

    # --- chromosome 11 ---
    'PF3D7_1115700',  # nice, dense SNP clusters
    'PF3D7_1133400',  # nice, lots of SNPs

    # --- chromosome 12 ---
    'PF3D7_1246100',

    # --- chromosome 13 ---
    'PF3D7_1323500',
    'PF3D7_1335000',
    'PF3D7_1335100',  # almost no agreement between genbank and birren, genbank looks wrong
    'PF3D7_1337200',

    # --- chromosome 14 ---
    'PF3D7_1434200',
    'PF3D7_1447900',
]
# Last expression of the cell: shows how many genes are in the set.
len(evaluation_genes_hb3)


Out[2]:
32

In [27]:
# Root directory holding the evaluation "truth" data.
truth_dir = '/data/plasmodium/pfalciparum/pf-crosses/data/evaluation/truth'

# Path of the BAM file containing the genbank HB3 coding-sequence alignment.
bam_fn = os.path.join(
    truth_dir,
    'bwamem_intractg',
    'alignment',
    'genbank_hb3_coding_sequences.bam',
)

# Opened read-only in binary (BAM) mode; kept as a module-level handle because
# lookup_genbank_id() queries it for every gene.
bam = pysam.AlignmentFile(bam_fn, mode='rb')

def lookup_genbank_id(gene_id):
    """Return the query name of the first aligned read overlapping a gene.

    The gene's coordinates are taken from ``lkp_feature[gene_id]`` and used to
    query the module-level ``bam`` alignment; the name of the first read
    yielded for that region is returned.

    Parameters
    ----------
    gene_id
        Key into ``lkp_feature``. The record found there must expose
        ``feature_chrom``, ``feature_start`` and ``feature_stop``.

    Returns
    -------
    The ``query_name`` attribute of the first read returned by ``bam.fetch``.

    Raises
    ------
    LookupError
        If no read in ``bam`` overlaps the gene's region. (A missing
        ``gene_id`` propagates whatever ``lkp_feature`` raises.)
    """
    gene = lkp_feature[gene_id]
    # NOTE(review): pysam's fetch() takes 0-based, half-open coordinates --
    # confirm which convention feature_start/feature_stop follow.
    # NOTE(review): only the first overlapping read is used; if reads from a
    # neighbouring gene can overlap this region, the first hit may not be the
    # intended sequence -- verify against the alignment.
    overlapping = bam.fetch(gene.feature_chrom, gene.feature_start, gene.feature_stop)

    # Supply a default so an empty region does not leak a bare StopIteration.
    # This function is called from inside a lazily evaluated table pipeline,
    # where a stray StopIteration can silently end iteration (or turn into an
    # opaque RuntimeError on newer Pythons) instead of reporting the problem.
    first_read = next(overlapping, None)
    if first_read is None:
        raise LookupError(
            'no aligned read overlaps gene %r (%s:%s-%s)'
            % (gene_id, gene.feature_chrom, gene.feature_start, gene.feature_stop)
        )
    return first_read.query_name

In [28]:
# Build the summary table of evaluation genes: the rows of tbl_genes that belong
# to evaluation_genes_hb3, with display-friendly column names and one extra
# column holding the read name returned by lookup_genbank_id().
# NOTE(review): the chained methods look like the petl table API -- confirm
# against shared_setup.ipynb, where tbl_genes is presumably defined.
tbl_eval = (
    tbl_genes
    # keep only rows whose feature_id is one of the evaluation genes
    .selectin('feature_id', evaluation_genes_hb3)
    # replace None values with empty strings
    .replaceall(None, '')
    # drop columns by position; NOTE(review): which fields these indices refer
    # to depends on tbl_genes' column order, which is not visible here -- verify
    .cutout(1, 4, 5, 9, 10)
    .rename({'feature_chrom': 'Chromosome', 
             'feature_start': 'Start', 
             'feature_stop': 'Stop', 
             'feature_id': 'ID',
             'feature_name': 'Name',
             'feature_previous_id': 'Previous ID'})
    # must stay after the rename: the lambda reads the renamed 'ID' field
    .addfield('Genbank Accession', lambda row: lookup_genbank_id(row.ID))
)
# display the resulting table as the cell output
tbl_eval.displayall(index_header=False)


Chromosome Start Stop ID Name Previous ID Genbank Accession
Pf3D7_01_v3 265208 269173 PF3D7_0106300 ATP6 PFA0310c gi|56342158|dbj|AB121052.1|
Pf3D7_02_v3 290168 292703 PF3D7_0207300 SERA8 PFB0325c gi|803375251|dbj|AB733715.1|
Pf3D7_02_v3 294273 297616 PF3D7_0207400 SERA7 PFB0330c gi|803375249|dbj|AB733714.1|
Pf3D7_02_v3 298897 302564 PF3D7_0207500 SERA6 PFB0335c gi|803375247|dbj|AB733713.1|
Pf3D7_02_v3 303593 307027 PF3D7_0207600 SERA5 PFB0340c gi|803375245|dbj|AB733712.1|
Pf3D7_02_v3 308847 312155 PF3D7_0207700 SERA4 PFB0345c gi|803375243|dbj|AB733711.1|
Pf3D7_02_v3 313449 316741 PF3D7_0207800 SERA3 PFB0350c gi|803375241|dbj|AB733710.1|
Pf3D7_02_v3 322338 325723 PF3D7_0208000 SERA1 PFB0360c gi|803375237|dbj|AB733708.1|
Pf3D7_03_v3 221323 222516 PF3D7_0304600 CSP PFC0210c gi|56342142|dbj|AB121018.1|
Pf3D7_04_v3 137640 146653 PF3D7_0402300 RH1 PFD0110w gi|33414602|gb|AF411930.2|
Pf3D7_04_v3 748088 749914 PF3D7_0417200 DHFR-TS PFD0830w gi|340507|gb|J03772.1|PFADHFRTSE
Pf3D7_04_v3 1085979 1091277 PF3D7_0424200 RH4 PFD1150c gi|21321386|gb|AF420310.1|
Pf3D7_05_v3 328666 329715 PF3D7_0508000 P38 PFE0395c gi|133900606|gb|EF137222.1|
Pf3D7_06_v3 851378 852955 PF3D7_0620400 MSP10 PFF0995c gi|237664869|gb|FJ406615.1|
Pf3D7_07_v3 381592 384614 PF3D7_0708400 HSP90 PF07_0029 gi|505339|gb|L34028.1|PFAHSP86B
Pf3D7_07_v3 408215 411961 PF3D7_0709100 PF07_0035 gi|2642510|gb|AF030690.1|
Pf3D7_07_v3 413560 421749 PF3D7_0709300 PF07_0037 gi|2642515|gb|AF030693.1|
Pf3D7_08_v3 278381 279034 PF3D7_0804800 CYP24 PF08_0121 gi|1000520|gb|U10322.1|PFU10322
Pf3D7_08_v3 1358314 1363618 PF3D7_0831600 CLAG8 MAL7P1.229 gi|167962700|dbj|AB250802.1|
Pf3D7_09_v3 121621 125006 PF3D7_0902800 SERA9 PFI0135c gi|803375253|dbj|AB733716.1|
Pf3D7_09_v3 270740 274789 PF3D7_0905400 RhopH3 PFI0265c gi|167962547|dbj|AB250806.1|
Pf3D7_09_v3 1175203 1180762 PF3D7_0929400 RhopH2 PFI1445w gi|167963178|dbj|AB250805.1|
Pf3D7_09_v3 1413840 1419754 PF3D7_0935800 CLAG9 PFI1730w gi|167962308|dbj|AB250804.1|
Pf3D7_11_v3 592130 593584 PF3D7_1115700 PF11_0165 gi|9719453|gb|AF282979.1|
Pf3D7_11_v3 1293856 1295724 PF3D7_1133400 AMA1 PF11_0344 gi|182407599|gb|EU586393.1|
Pf3D7_12_v3 1915749 1917798 PF3D7_1246100 ALAS PFL2210w gi|1220442|gb|L46348.1|PFADAAS
Pf3D7_13_v3 975403 977175 PF3D7_1323500 PMV PF13_0133 gi|58372444|gb|AY878742.1|
Pf3D7_13_v3 1416316 1417458 PF3D7_1335000 MSRP1 PF13_0196 gi|237665051|gb|FJ406706.1|
Pf3D7_13_v3 1419086 1420141 PF3D7_1335100 MSP7 PF13_0197 gi|116109338|gb|DQ987539.1|
Pf3D7_13_v3 1497877 1501494 PF3D7_1337200 MAL13P1.186 gi|6690111|gb|AF111814.2|
Pf3D7_14_v3 1368815 1369796 PF3D7_1434200 CAM PF14_0323 gi|160125|gb|M59349.1|PFACALMOD
Pf3D7_14_v3 1954601 1957675 PF3D7_1447900 MDR2 PF14_0455 gi|294166|gb|L13381.1|PFAMDR2X

In [ ]: