In [1]:
from __future__ import division
import sys
import petl.interactive as etl
import petlx.all
sys.path.insert(0, '/home/aliman/src/github/alimanfoo/veff')
%reload_ext autoreload
%autoreload 1
%aimport veff

In [2]:
# Reference sequence (FASTA) and annotation (GFF3) for the Pf3D7 v3 genome.
fasta_fn = '../../../data/genome/sanger/version3/September_2012/Pf3D7_v3.fa'
gff3_fn = '../../../data/genome/sanger/version3/September_2012/Pf3D7_v3.gff'

# Build the veff genome object from both files; seqid is left as None.
genome = veff.Genome(
    fasta_fn,
    gff3_fn,
    seqid=None,
)

In [12]:
vcf_fn = '../../../data/public/20141022/3d7_hb3.combined.final.vcf.gz'


def _veff_attr(name):
    """Build a row function returning ``row.veff.<name>``.

    The returned function yields None when the row has no veff effect
    (``row.veff is None``), so it is safe to use on every row.
    """
    def getter(row):
        return getattr(row.veff, name) if row.veff is not None else None
    return getter


def _snpeff_part(sep, index):
    """Build a row function returning piece ``index`` of ``row.EFF`` split on ``sep``.

    The returned function yields None when the row has no snpEff
    annotation (``row.EFF is None``).
    """
    def getter(row):
        return row.EFF.split(sep)[index] if row.EFF is not None else None
    return getter


# Side-by-side comparison of the snpEff annotation stored in the VCF (INFO/EFF)
# with the effect predicted by veff for the same variant.
tbl = (etl
    .fromvcf(vcf_fn, samples=None)
    # Uncomment to restrict the comparison to a single chromosome:
#     .eq('CHROM', 'Pf3D7_01_v3')
    .unpackinfo('EFF')
    # veff is evaluated against the first ALT allele only.
    .addfield('veff', lambda row: list(veff.get_effects(genome, row.CHROM, row.POS, row.REF, row.ALT[0])))
    # Keep only the first annotation from each tool; empty/missing becomes None.
    .convert(['EFF', 'veff'], lambda v: v[0] if v else None)
    # Guarded like every other veff-derived field: veff may be None after the
    # convert step above, in which case strand is None instead of raising.
    .addfield('strand', _veff_attr('gene_strand'))
    .addfield('snpeff_effect', _snpeff_part('(', 0))
    .addfield('veff_effect', _veff_attr('effect'))
    .addfield('snpeff_codon_change', _snpeff_part('|', 2))
    .addfield('veff_codon_change', _veff_attr('codon_change'))
    .addfield('snpeff_aa_change', _snpeff_part('|', 3))
    .addfield('veff_aa_change', _veff_attr('aa_change'))
    .addfield('veff_ref_aa', _veff_attr('ref_aa'))
    .addfield('veff_alt_aa', _veff_attr('alt_aa'))
    .addfield('vlen', _veff_attr('vlen'))
    # Drop the raw inputs once the derived comparison fields exist.
    .cutout('REF', 'ALT')
    .cutout('veff', 'EFF')
    .cutout('ID', 'FILTER', 'QUAL')
)
# Show the first 40 variants on which the two tools disagree about the effect.
tbl.select(lambda row: row.snpeff_effect != row.veff_effect).display(40)


CHROM POS strand snpeff_effect veff_effect snpeff_codon_change veff_codon_change snpeff_aa_change veff_aa_change veff_ref_aa veff_alt_aa vlen
Pf3D7_01_v3 127692 - CODON_CHANGE_PLUS_CODON_INSERTION CODON_INSERTION aac/aTAATAAac aaC/aaTAATAAC N150IIN N150NNN N NNN 6
Pf3D7_01_v3 135874 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aaggaaaaggaaaat/aat aaGGAAAAGGAAAAT/aaT KEKEN1124N KEKEN1124N KEKEN N -12
Pf3D7_01_v3 137201 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aatata/ata aATATa/aTa NI685I NI685I NI I -3
Pf3D7_01_v3 146124 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aataaa/aaa aaTAAA/aaA NK90K NK90K NK K -3
Pf3D7_01_v3 148768 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION ggtgataatttccaccgtgat/gat gGTGATAATTTCCACCGTGAt/gAt GDNFHRD1331D GDNFHRD1331D GDNFHRD D -18
Pf3D7_01_v3 233811 - NON_SYNONYMOUS_CODING SYNONYMOUS_CODING tCa/tTa atC/atT S134L I245I I I 0
Pf3D7_01_v3 257826 - CODON_CHANGE_PLUS_CODON_INSERTION CODON_INSERTION aat/aaAATAATAATAATAATt Caa/AATAATAATAATAATCaa N239KIIIII Q240NNNNNQ Q NNNNNQ 15
Pf3D7_01_v3 263684 - NON_SYNONYMOUS_CODING STOP_GAINED Tat/Gat tTa/tGa Y102D L223* L * 0
Pf3D7_01_v3 266640 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION ggagat/gat gGAGAt/gAt GD844D GD844D GD D -3
Pf3D7_01_v3 288303 + None INTRON None None None None None None -2
Pf3D7_01_v3 329507 - INTERGENIC INTRON None None None None -2
Pf3D7_01_v3 335111 None FRAME_SHIFT INTERGENIC - None -83 None None None -14
Pf3D7_01_v3 335390 None FRAME_SHIFT INTERGENIC - None -176 None None None -2
Pf3D7_01_v3 345060 - INTRAGENIC INTRON None None None None 2
Pf3D7_01_v3 345066 - INTRAGENIC INTRON None None None None 0
Pf3D7_01_v3 345730 - NON_SYNONYMOUS_CODING SYNONYMOUS_CODING Tca/Cca agT/agC S130P S268S S S 0
Pf3D7_01_v3 353959 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION gataatatgaacatggaa/gaa gaTAATATGAACATGGAA/gaA DNMNME746E DNMNME746E DNMNME E -15
Pf3D7_01_v3 354004 - CODON_CHANGE_PLUS_CODON_INSERTION CODON_INSERTION gaa/gTAGTTATAATATGGAaa gaA/gaTAGTTATAATATGGAA E736VVIIWK E736DSYNME E DSYNME 15
Pf3D7_01_v3 355334 - None CODON_INSERTION None aAt/aGTGGTAGTAGTAATAAt None N293SGSSNN N SGSSNN 15
Pf3D7_01_v3 362255 - FRAME_SHIFT INTRAGENIC - None -106 None None None -16
Pf3D7_01_v3 362551 - FRAME_SHIFT INTRAGENIC - None -12 None None None -2
Pf3D7_01_v3 378006 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aataataaa/aaa aTAATAAAc/aAc NNK105K IIN167N IIN N -6
Pf3D7_01_v3 387715 + INTRAGENIC TODO None None None None 0
Pf3D7_01_v3 387722 + INTRAGENIC TODO None None None None 0
Pf3D7_01_v3 387790 + INTRAGENIC TODO None None None None 12
Pf3D7_01_v3 393224 - FRAME_SHIFT INTRAGENIC - None -87 None None None -7
Pf3D7_01_v3 425557 - STOP_GAINED CODON_INSERTION aac/aGATAGATGTGGATAATATTAATAACAAac aaC/aaGATAGATGTGGATAATATTAATAACAAC N673R*MWIILITN N673KIDVDNINNN N KIDVDNINNN 27
Pf3D7_01_v3 425752 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aagaataagatggatgtggataatattaat/aat aaGAATAAGATGGATGTGGATAATATTAAT/aaT KNKMDVDNIN599N KNKMDVDNIN599N KNKMDVDNIN N -27
Pf3D7_01_v3 429234 None FRAME_SHIFT INTERGENIC -/TATA None -65Y? None None None 4
Pf3D7_01_v3 429530 None FRAME_SHIFT INTERGENIC - None -164 None None None -10
Pf3D7_01_v3 434091 - CODON_CHANGE_PLUS_CODON_INSERTION CODON_INSERTION gat/gaCATAATGATCATAATGATt Gat/CATAATGATCATAATGATGat D317DIMIIMI D318HNDHNDD D HNDHNDD 18
Pf3D7_01_v3 443938 - INTERGENIC TODO None None None None -4
Pf3D7_01_v3 487967 - NON_SYNONYMOUS_CODING STOP_GAINED gaA/gaT Aga/Tga E649D R746* R * 0
Pf3D7_01_v3 487970 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION gatgaagaa/gaa TGAAGAAga/Aga DEE646E *RR743R *RR R -6
Pf3D7_01_v3 487988 - CODON_CHANGE_PLUS_CODON_INSERTION CODON_INSERTION gat/gAGATGATGAat Tga/AGATGATGATga D642EMMN *739R*** * R*** 9
Pf3D7_01_v3 488006 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION gatgctgaa/gaa TGCTGAAga/Aga DAE634E C*R731R C*R R -6
Pf3D7_01_v3 488135 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION gaagaagaagaagaagag/gag AGAAGAAGAAGAAGAGga/Gga EEEEEE588E RRRRRG685G RRRRRG G -15
Pf3D7_01_v3 488776 - CODON_CHANGE_PLUS_CODON_INSERTION CODON_INSERTION cac/caGAAGAAGGAGAACACc cAa/cGAAGAAGGAGAACACAa H379QKKENT Q476RRRRTQ Q RRRRTQ 15
Pf3D7_01_v3 488912 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aagaaaaaa/aaa GAAAAAAca/Aca KKK332K EKT429T EKT T -6
Pf3D7_01_v3 489431 - CODON_CHANGE_PLUS_CODON_INSERTION CODON_INSERTION gat/gAAAAAAACATAAAAAAGAat Taa/AAAAAAACATAAAAAAGATaa D161EKNIKKN *258KKT*KR* * KKT*KR* 18

...


In [13]:
# Show the first 40 variants whose veff effect is neither intergenic nor intronic.
(tbl
    .selectnotin('veff_effect', {'INTERGENIC', 'INTRON'})
    .display(40))


CHROM POS strand snpeff_effect veff_effect snpeff_codon_change veff_codon_change snpeff_aa_change veff_aa_change veff_ref_aa veff_alt_aa vlen
Pf3D7_01_v3 95518 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING aGa/aTa aGa/aTa R156I R156I R I 0
Pf3D7_01_v3 95621 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING caT/caA caT/caA H190Q H190Q H Q 0
Pf3D7_01_v3 95632 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING tGt/tAt tGt/tAt C194Y C194Y C Y 0
Pf3D7_01_v3 95641 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING tGt/tAt tGt/tAt C197Y C197Y C Y 0
Pf3D7_01_v3 95680 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING aAt/aGt aAt/aGt N210S N210S N S 0
Pf3D7_01_v3 95685 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Gat/Tat Gat/Tat D212Y D212Y D Y 0
Pf3D7_01_v3 95686 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING gAt/gCt gAt/gCt D212A D212A D A 0
Pf3D7_01_v3 95710 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING tGt/tCt tGt/tCt C220S C220S C S 0
Pf3D7_01_v3 95715 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Agt/Ggt Agt/Ggt S222G S222G S G 0
Pf3D7_01_v3 95716 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING aGt/aTt aGt/aTt S222I S222I S I 0
Pf3D7_01_v3 95742 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Gct/Act Gct/Act A231T A231T A T 0
Pf3D7_01_v3 95754 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Gct/Tct Gct/Tct A235S A235S A S 0
Pf3D7_01_v3 98868 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING gAt/gGt gAt/gGt D17G D17G D G 0
Pf3D7_01_v3 101269 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Gta/Tta Gta/Tta V749L V749L V L 0
Pf3D7_01_v3 101790 + CODON_DELETION CODON_DELETION aat/- gaAAAT/gaA N923- EN922E EN E -3
Pf3D7_01_v3 107756 - STOP_LOST STOP_LOST tAa/tCa tAa/tCa *198S *198S * S 0
Pf3D7_01_v3 107823 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Atg/Gtg Atg/Gtg M176V M176V M V 0
Pf3D7_01_v3 114473 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Aaa/Caa Aaa/Caa K443Q K443Q K Q 0
Pf3D7_01_v3 120736 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING aaA/aaT Att/Ttt K116N I206F I F 0
Pf3D7_01_v3 127256 - CODON_DELETION CODON_DELETION aat/- AATTgt/Tgt N295- NC295C NC C -3
Pf3D7_01_v3 127692 - CODON_CHANGE_PLUS_CODON_INSERTION CODON_INSERTION aac/aTAATAAac aaC/aaTAATAAC N150IIN N150NNN N NNN 6
Pf3D7_01_v3 127725 - SYNONYMOUS_CODING SYNONYMOUS_CODING aaC/aaT aaC/aaT N139 N139N N N 0
Pf3D7_01_v3 130339 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Gca/Aca Gca/Aca A168T A168T A T 0
Pf3D7_01_v3 135874 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aaggaaaaggaaaat/aat aaGGAAAAGGAAAAT/aaT KEKEN1124N KEKEN1124N KEKEN N -12
Pf3D7_01_v3 136422 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Caa/Aaa Caa/Aaa Q946K Q946K Q K 0
Pf3D7_01_v3 137201 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aatata/ata aATATa/aTa NI685I NI685I NI I -3
Pf3D7_01_v3 137258 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING aCg/aTg aCg/aTg T667M T667M T M 0
Pf3D7_01_v3 138966 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Caa/Gaa Caa/Gaa Q98E Q98E Q E 0
Pf3D7_01_v3 139191 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Gca/Aca Gca/Aca A23T A23T A T 0
Pf3D7_01_v3 145560 - SYNONYMOUS_CODING SYNONYMOUS_CODING tcT/tcA tcT/tcA S279 S279S S S 0
Pf3D7_01_v3 146124 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION aataaa/aaa aaTAAA/aaA NK90K NK90K NK K -3
Pf3D7_01_v3 147952 - CODON_INSERTION CODON_INSERTION -/GTAAAA aAt/aGTAAAAAt -1609VK N1609SKN N SKN 6
Pf3D7_01_v3 148490 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Tca/Gca Tca/Gca S1430A S1430A S A 0
Pf3D7_01_v3 148768 - CODON_CHANGE_PLUS_CODON_DELETION CODON_DELETION ggtgataatttccaccgtgat/gat gGTGATAATTTCCACCGTGAt/gAt GDNFHRD1331D GDNFHRD1331D GDNFHRD D -18
Pf3D7_01_v3 148917 - NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING aaT/aaA aaT/aaA N1287K N1287K N K 0
Pf3D7_01_v3 155281 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING tAt/tTt tAt/tTt Y369F Y369F Y F 0
Pf3D7_01_v3 155819 + CODON_INSERTION CODON_INSERTION -/AATAATAAT ctA/ctAAATAATAAT -549NNN L548LNNN L LNNN 9
Pf3D7_01_v3 155877 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING Gac/Aac Gac/Aac D568N D568N D N 0
Pf3D7_01_v3 155977 + NON_SYNONYMOUS_CODING NON_SYNONYMOUS_CODING gGc/gAc gGc/gAc G601D G601D G D 0
Pf3D7_01_v3 155978 + SYNONYMOUS_CODING SYNONYMOUS_CODING ggC/ggT ggC/ggT G601 G601G G G 0

...


In [14]:
from collections import Counter

In [15]:
# Length-changing variants for which veff reports a codon-level effect,
# reduced to the veff columns and annotated with the amino acids gained and
# lost (as multisets) between the reference and alternate peptides.
tbl_indels = (
    tbl
    .selectnotin(
        'veff_effect',
        {'INTERGENIC', 'INTRON', 'INTRAGENIC', 'TODO', 'FRAME_SHIFT'},
    )
    .selectne('vlen', 0)
    # The snpEff columns and the veff codon change are not needed from here on.
    .cutout('snpeff_effect', 'snpeff_codon_change', 'snpeff_aa_change',
            'veff_codon_change')
    # Only veff columns remain, so drop the 'veff_' prefix.
    .rename({
        'veff_effect': 'effect',
        'veff_ref_aa': 'ref_aa',
        'veff_alt_aa': 'alt_aa',
        'veff_aa_change': 'aa_change',
    })
    # Counter subtraction keeps positive counts only.
    .addfield('aa_added',
              lambda row: Counter(row.alt_aa) - Counter(row.ref_aa))
    .addfield('aa_subtracted',
              lambda row: Counter(row.ref_aa) - Counter(row.alt_aa))
)
tbl_indels.display(100)


CHROM POS strand effect aa_change ref_aa alt_aa vlen aa_added aa_subtracted
Pf3D7_01_v3 101790 + CODON_DELETION EN922E EN E -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 127256 - CODON_DELETION NC295C NC C -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 127692 - CODON_INSERTION N150NNN N NNN 6 Counter({'N': 2}) Counter()
Pf3D7_01_v3 135874 - CODON_DELETION KEKEN1124N KEKEN N -12 Counter() Counter({'K': 2, 'E': 2})
Pf3D7_01_v3 137201 - CODON_DELETION NI685I NI I -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 146124 - CODON_DELETION NK90K NK K -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 147952 - CODON_INSERTION N1609SKN N SKN 6 Counter({'S': 1, 'K': 1}) Counter()
Pf3D7_01_v3 148768 - CODON_DELETION GDNFHRD1331D GDNFHRD D -18 Counter() Counter({'D': 1, 'G': 1, 'F': 1, 'H': 1, 'N': 1, 'R': 1})
Pf3D7_01_v3 155819 + CODON_INSERTION L548LNNN L LNNN 9 Counter({'N': 3}) Counter()
Pf3D7_01_v3 161439 + CODON_INSERTION D103DN D DN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 162124 + CODON_INSERTION H332HDQVKNKHDQVKNKDDKIKNKD H HDQVKNKHDQVKNKDDKIKNKD 63 Counter({'K': 7, 'D': 5, 'N': 3, 'Q': 2, 'V': 2, 'I': 1, 'H': 1}) Counter()
Pf3D7_01_v3 162967 + CODON_INSERTION H613HNN H HNN 6 Counter({'N': 2}) Counter()
Pf3D7_01_v3 164206 + CODON_INSERTION V1026VNKNINI V VNKNINI 18 Counter({'N': 3, 'I': 2, 'K': 1}) Counter()
Pf3D7_01_v3 173982 + CODON_INSERTION Y373YN Y YN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 190317 + CODON_DELETION HNN17H HNN H -6 Counter() Counter({'N': 2})
Pf3D7_01_v3 190862 + CODON_DELETION NNN198N NNN N -6 Counter() Counter({'N': 2})
Pf3D7_01_v3 190944 + CODON_INSERTION D226DSINNSINN D DSINNSINN 24 Counter({'N': 4, 'I': 2, 'S': 2}) Counter()
Pf3D7_01_v3 191685 + CODON_DELETION YN473Y YN Y -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 192608 + CODON_DELETION KG780K KG K -3 Counter() Counter({'G': 1})
Pf3D7_01_v3 192623 + CODON_INSERTION K785KDDDD K KDDDD 12 Counter({'D': 4}) Counter()
Pf3D7_01_v3 194843 + CODON_DELETION DKYE1525D DKYE D -9 Counter() Counter({'Y': 1, 'K': 1, 'E': 1})
Pf3D7_01_v3 195383 + CODON_INSERTION K1705KN K KN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 206406 + CODON_INSERTION E79EKDD E EKDD 9 Counter({'D': 2, 'K': 1}) Counter()
Pf3D7_01_v3 207421 + CODON_INSERTION N418NDNYDNNNYDNYDNNNYDNY N NDNYDNNNYDNYDNNNYDNY 57 Counter({'N': 9, 'Y': 5, 'D': 5}) Counter()
Pf3D7_01_v3 216756 + CODON_INSERTION Y327YNNNNN Y YNNNNN 15 Counter({'N': 5}) Counter()
Pf3D7_01_v3 239206 + CODON_INSERTION Y181YNNN Y YNNN 9 Counter({'N': 3}) Counter()
Pf3D7_01_v3 239511 + CODON_INSERTION K282KTNDVK K KTNDVK 15 Counter({'K': 1, 'N': 1, 'T': 1, 'D': 1, 'V': 1}) Counter()
Pf3D7_01_v3 242535 + CODON_DELETION HNIQ1290H HNIQ H -9 Counter() Counter({'I': 1, 'Q': 1, 'N': 1})
Pf3D7_01_v3 244758 + CODON_INSERTION D2031DNNNNNNNN D DNNNNNNNN 24 Counter({'N': 8}) Counter()
Pf3D7_01_v3 246384 + CODON_DELETION NN2573N NN N -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 247606 + CODON_INSERTION Y2981YNN Y YNN 6 Counter({'N': 2}) Counter()
Pf3D7_01_v3 247675 + CODON_DELETION YTNH3004Y YTNH Y -9 Counter() Counter({'H': 1, 'T': 1, 'N': 1})
Pf3D7_01_v3 257826 - CODON_INSERTION Q240NNNNNQ Q NNNNNQ 15 Counter({'N': 5}) Counter()
Pf3D7_01_v3 266640 - CODON_DELETION GD844D GD D -3 Counter() Counter({'G': 1})
Pf3D7_01_v3 282010 + CODON_DELETION YDN286Y YDN Y -6 Counter() Counter({'D': 1, 'N': 1})
Pf3D7_01_v3 282425 + CODON_DELETION GDGDDDDDDVGDDNVD424G GDGDDDDDDVGDDNVD G -45 Counter() Counter({'D': 10, 'V': 2, 'G': 2, 'N': 1})
Pf3D7_01_v3 282546 + CODON_DELETION DDN464D DDN D -6 Counter() Counter({'D': 1, 'N': 1})
Pf3D7_01_v3 282549 + CODON_DELETION DN465D DN D -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 282555 + CODON_DELETION DD467D DD D -3 Counter() Counter({'D': 1})
Pf3D7_01_v3 282597 + CODON_DELETION NDDDD481N NDDDD N -12 Counter() Counter({'D': 4})
Pf3D7_01_v3 282609 + CODON_INSERTION D485DN D DN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 283273 + CODON_INSERTION N707NDDDDDDD N NDDDDDDD 21 Counter({'D': 7}) Counter()
Pf3D7_01_v3 283356 + CODON_INSERTION G734GDDDEND G GDDDEND 18 Counter({'D': 4, 'E': 1, 'N': 1}) Counter()
Pf3D7_01_v3 283406 + CODON_INSERTION I751ID I ID 3 Counter({'D': 1}) Counter()
Pf3D7_01_v3 315518 + CODON_DELETION YN301Y YN Y -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 316004 + CODON_DELETION ENVK463E ENVK E -9 Counter() Counter({'K': 1, 'N': 1, 'V': 1})
Pf3D7_01_v3 318006 + CODON_DELETION NIMF1130N NIMF N -9 Counter() Counter({'I': 1, 'M': 1, 'F': 1})
Pf3D7_01_v3 320629 + CODON_DELETION NNN154N NNN N -6 Counter() Counter({'N': 2})
Pf3D7_01_v3 322875 + CODON_INSERTION N169NNIYH N NNIYH 12 Counter({'I': 1, 'Y': 1, 'N': 1, 'H': 1}) Counter()
Pf3D7_01_v3 323333 + CODON_DELETION NE321N NE N -3 Counter() Counter({'E': 1})
Pf3D7_01_v3 324213 + CODON_INSERTION D615DN D DN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 338214 + CODON_DELETION DDEDDDDDE349D DDEDDDDDE D -24 Counter() Counter({'D': 6, 'E': 2})
Pf3D7_01_v3 338340 + CODON_INSERTION E391EDDEDYDDDD E EDDEDYDDDD 27 Counter({'D': 7, 'Y': 1, 'E': 1}) Counter()
Pf3D7_01_v3 338598 + CODON_INSERTION I477IN I IN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 339060 + CODON_INSERTION N631NN N NN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 340513 + CODON_CHANGE_PLUS_CODON_DELETION IH1116N IH N -3 Counter({'N': 1}) Counter({'I': 1, 'H': 1})
Pf3D7_01_v3 342021 + CODON_INSERTION K1618KN K KN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 342450 + CODON_DELETION SN1761S SN S -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 342833 + CODON_INSERTION I1889INN I INN 6 Counter({'N': 2}) Counter()
Pf3D7_01_v3 343374 + CODON_DELETION NN2069N NN N -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 343596 + CODON_INSERTION E2143ENN E ENN 6 Counter({'N': 2}) Counter()
Pf3D7_01_v3 349643 + CODON_DELETION NEEPANEEPAN158N NEEPANEEPAN N -30 Counter() Counter({'E': 4, 'A': 2, 'P': 2, 'N': 2})
Pf3D7_01_v3 353959 - CODON_DELETION DNMNME746E DNMNME E -15 Counter() Counter({'M': 2, 'N': 2, 'D': 1})
Pf3D7_01_v3 354004 - CODON_INSERTION E736DSYNME E DSYNME 15 Counter({'Y': 1, 'S': 1, 'M': 1, 'D': 1, 'N': 1}) Counter()
Pf3D7_01_v3 354918 - CODON_DELETION ND431D ND D -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 355334 - CODON_INSERTION N293SGSSNN N SGSSNN 15 Counter({'S': 3, 'G': 1, 'N': 1}) Counter()
Pf3D7_01_v3 378006 - CODON_DELETION IIN167N IIN N -6 Counter() Counter({'I': 2})
Pf3D7_01_v3 381393 + CODON_INSERTION D233DN D DN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 397115 + CODON_INSERTION K175KCGDIKDK K KCGDIKDK 21 Counter({'K': 2, 'D': 2, 'I': 1, 'C': 1, 'G': 1}) Counter()
Pf3D7_01_v3 397494 + CODON_DELETION DNNNNNFDDN302D DNNNNNFDDN D -27 Counter() Counter({'N': 6, 'D': 2, 'F': 1})
Pf3D7_01_v3 398369 + CODON_DELETION SINTNIN593S SINTNIN S -18 Counter() Counter({'N': 3, 'I': 2, 'T': 1})
Pf3D7_01_v3 407116 + CODON_DELETION ENKSQ460E ENKSQ E -12 Counter() Counter({'Q': 1, 'K': 1, 'S': 1, 'N': 1})
Pf3D7_01_v3 408474 + CODON_INSERTION E912EGEDGEE E EGEDGEE 18 Counter({'E': 3, 'G': 2, 'D': 1}) Counter()
Pf3D7_01_v3 408489 + CODON_INSERTION E917EEGEDGEDDEEDD E EEGEDGEDDEEDD 36 Counter({'E': 5, 'D': 5, 'G': 2}) Counter()
Pf3D7_01_v3 424959 - CODON_DELETION NG872G NG G -3 Counter() Counter({'N': 1})
Pf3D7_01_v3 425557 - CODON_INSERTION N673KIDVDNINNN N KIDVDNINNN 27 Counter({'N': 3, 'I': 2, 'D': 2, 'K': 1, 'V': 1}) Counter()
Pf3D7_01_v3 425752 - CODON_DELETION KNKMDVDNIN599N KNKMDVDNIN N -27 Counter() Counter({'D': 2, 'K': 2, 'N': 2, 'I': 1, 'M': 1, 'V': 1})
Pf3D7_01_v3 426936 - CODON_DELETION TNSNSND208D TNSNSND D -18 Counter() Counter({'N': 3, 'S': 2, 'T': 1})
Pf3D7_01_v3 434091 - CODON_INSERTION D318HNDHNDD D HNDHNDD 18 Counter({'H': 2, 'D': 2, 'N': 2}) Counter()
Pf3D7_01_v3 434295 - CODON_DELETION LKEEKM245M LKEEKM M -15 Counter() Counter({'K': 2, 'E': 2, 'L': 1})
Pf3D7_01_v3 437159 + CODON_INSERTION I251IN I IN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 437479 + CODON_DELETION DNN358D DNN D -6 Counter() Counter({'N': 2})
Pf3D7_01_v3 446589 + CODON_INSERTION S188SN S SN 3 Counter({'N': 1}) Counter()
Pf3D7_01_v3 477167 + CODON_INSERTION G255GYI G GYI 6 Counter({'Y': 1, 'I': 1}) Counter()
Pf3D7_01_v3 487939 - CODON_DELETION RI754I RI I -3 Counter() Counter({'R': 1})
Pf3D7_01_v3 487970 - CODON_DELETION *RR743R *RR R -6 Counter() Counter({'*': 1, 'R': 1})
Pf3D7_01_v3 487988 - CODON_INSERTION *739R*** * R*** 9 Counter({'*': 2, 'R': 1}) Counter()
Pf3D7_01_v3 488006 - CODON_DELETION C*R731R C*R R -6 Counter() Counter({'C': 1, '*': 1})
Pf3D7_01_v3 488135 - CODON_DELETION RRRRRG685G RRRRRG G -15 Counter() Counter({'R': 5})
Pf3D7_01_v3 488683 - CODON_DELETION RRRTQKRRTQI497I RRRTQKRRTQI I -30 Counter() Counter({'R': 5, 'Q': 2, 'T': 2, 'K': 1})
Pf3D7_01_v3 488776 - CODON_INSERTION Q476RRRRTQ Q RRRRTQ 15 Counter({'R': 4, 'T': 1}) Counter()
Pf3D7_01_v3 488912 - CODON_DELETION EKT429T EKT T -6 Counter() Counter({'K': 1, 'E': 1})
Pf3D7_01_v3 489431 - CODON_INSERTION *258KKT*KR* * KKT*KR* 18 Counter({'K': 3, '*': 1, 'R': 1, 'T': 1}) Counter()
Pf3D7_01_v3 497228 + CODON_INSERTION K607KEYDEVEYDEV K KEYDEVEYDEV 30 Counter({'E': 4, 'Y': 2, 'D': 2, 'V': 2}) Counter()
Pf3D7_01_v3 498978 + CODON_INSERTION E1190EDIDEDTNEYMYE E EDIDEDTNEYMYE 36 Counter({'E': 3, 'D': 3, 'Y': 2, 'I': 1, 'M': 1, 'N': 1, 'T': 1}) Counter()
Pf3D7_01_v3 505700 - CODON_INSERTION T429IIHVGLIT T IIHVGLIT 21 Counter({'I': 3, 'H': 1, 'L': 1, 'G': 1, 'V': 1}) Counter()
Pf3D7_01_v3 508956 - CODON_INSERTION D334NEKD D NEKD 9 Counter({'K': 1, 'E': 1, 'N': 1}) Counter()
Pf3D7_01_v3 514421 + CODON_INSERTION K779KGPTTS K KGPTTS 15 Counter({'T': 2, 'P': 1, 'S': 1, 'G': 1}) Counter()
Pf3D7_01_v3 532418 + CODON_INSERTION Y1682YDMNEHNMNEF Y YDMNEHNMNEF 30 Counter({'N': 3, 'E': 2, 'M': 2, 'D': 1, 'F': 1, 'H': 1}) Counter()
Pf3D7_01_v3 532602 + CODON_INSERTION K1743KKN K KKN 6 Counter({'K': 1, 'N': 1}) Counter()

...


In [16]:
import operator

In [17]:
# Total count of each amino acid gained across all indels.
# Accumulate into a fresh Counter rather than reduce(operator.add, ...):
# this also works when the table is empty, never aliases a row's own Counter,
# and does not depend on the Python 2-only builtin `reduce`.
all_aa_added = Counter()
for aa_counts in tbl_indels.values('aa_added'):
    all_aa_added.update(aa_counts)
all_aa_added.most_common(10)


Out[17]:
[('N', 3021),
 ('D', 804),
 ('K', 556),
 ('E', 535),
 ('I', 441),
 ('S', 346),
 ('T', 200),
 ('H', 198),
 ('Y', 190),
 ('G', 188)]

In [18]:
# Total count of each amino acid lost across all indels.
# Accumulate into a fresh Counter rather than reduce(operator.add, ...):
# this also works when the table is empty, never aliases a row's own Counter,
# and does not depend on the Python 2-only builtin `reduce`.
all_aa_subtracted = Counter()
for aa_counts in tbl_indels.values('aa_subtracted'):
    all_aa_subtracted.update(aa_counts)
all_aa_subtracted.most_common(10)


Out[18]:
[('N', 3469),
 ('D', 1005),
 ('K', 664),
 ('E', 615),
 ('I', 433),
 ('S', 327),
 ('V', 217),
 ('H', 210),
 ('Y', 198),
 ('M', 185)]

In [19]:
# Amino acids gained plus amino acids lost, i.e. every residue touched by an indel.
all_aa_combined = all_aa_added + all_aa_subtracted
# Ten most frequently affected amino acids.
all_aa_combined.most_common(10)


Out[19]:
[('N', 6490),
 ('D', 1809),
 ('K', 1220),
 ('E', 1150),
 ('I', 874),
 ('S', 673),
 ('H', 408),
 ('V', 394),
 ('Y', 388),
 ('G', 357)]

In [34]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style('white')

In [35]:
# Paired bar chart: per amino acid, residues gained by insertions (white)
# next to residues lost by deletions (black), ordered by combined frequency.
keys = [k for k, v in all_aa_combined.most_common(30)]
y1 = [all_aa_added[k] for k in keys]
y2 = [all_aa_subtracted[k] for k in keys]
positions = np.arange(len(keys))
bar_width = .4
# Left edges: the two bars of a pair meet at the tick position (+.5).
x1 = positions + .1
x2 = positions + .5
fig, ax = plt.subplots(figsize=(16, 6))
sns.despine(ax=ax, offset=10)
# align='edge' and edgecolor='k' are spelled out because the x values above are
# left edges and the white bars need an outline to be visible; newer matplotlib
# releases default to centred bars without edges.
ax.bar(x1, y1, width=bar_width, align='edge', color='w', edgecolor='k', label='insertions')
ax.bar(x2, y2, width=bar_width, align='edge', color='k', edgecolor='k', label='deletions')
ax.set_xticks(positions + .5)
ax.set_xticklabels(keys)
ax.set_xlabel('amino acid')
ax.set_ylabel('residues added / removed by indels')
ax.legend()
ax.set_xlim(0, len(keys));


Out[35]:
21

Sandbox


In [13]:
# Peek at a small slice (rowslice(15, 19)) of the 'gene' rows in the genome's feature table.
genome._tbl_features.eq('type', 'gene').rowslice(15, 19).display()


seqid source type start stop score strand phase feature_id parent_id
Pf3D7_01_v3 chado gene 98819 102282 . + . PF3D7_0102200 None
Pf3D7_01_v3 chado gene 104704 105209 . + . PF3D7_0102300 None
Pf3D7_01_v3 chado gene 110750 115799 . - . PF3D7_0102500 None
Pf3D7_01_v3 chado gene 119041 121249 . - . PF3D7_0102600 None

In [32]:
# Tabulate how many features of each type the genome's feature table holds.
genome._tbl_features.counts('type').displayall()


type count frequency
CDS 374 0.406080347448
polypeptide 148 0.160694896851
gene 141 0.153094462541
mRNA 135 0.14657980456
polypeptide_motif 39 0.042345276873
pseudogenic_exon 36 0.0390879478827
pseudogene 13 0.014115092291
pseudogenic_transcript 13 0.014115092291
repeat_region 12 0.0130293159609
rRNA 5 0.00542888165038
ncRNA 4 0.0043431053203
centromere 1 0.00108577633008

In [34]:
# List every feature of type 'rRNA'.
genome._tbl_features.eq('type', 'rRNA').displayall()


seqid source type start stop score strand phase feature_id parent_id
Pf3D7_01_v3 chado rRNA 473739 475887 . + . PF3D7_0112300.1 PF3D7_0112300
Pf3D7_01_v3 chado rRNA 475888 476281 . + . PF3D7_0112400.1 PF3D7_0112400
Pf3D7_01_v3 chado rRNA 476282 476402 . + . PF3D7_0112500.1 PF3D7_0112500
Pf3D7_01_v3 chado rRNA 476403 477278 . + . PF3D7_0112600.1 PF3D7_0112600
Pf3D7_01_v3 chado rRNA 477279 481382 . + . PF3D7_0112700.1 PF3D7_0112700

In [35]:
# Look up the feature record of PF3D7_0112300, the parent of rRNA PF3D7_0112300.1.
genome.get_feature('PF3D7_0112300')


Out[35]:
('Pf3D7_01_v3',
 'chado',
 'gene',
 473739,
 475887,
 '.',
 '+',
 '.',
 'PF3D7_0112300',
 None)

In [36]:
# Children of the rRNA transcript PF3D7_0112300.1.
# NOTE(review): the annotation gives this rRNA a child of type 'CDS' —
# presumably relevant to how veff classifies variants here; confirm.
genome.get_children('PF3D7_0112300.1')


Out[36]:
[('Pf3D7_01_v3',
  'chado',
  'CDS',
  473739,
  475887,
  '.',
  '+',
  '0',
  'PF3D7_0112300.1:exon:1',
  'PF3D7_0112300.1')]

In [39]:
# Inspect gene PF3D7_0108900 (361987-362586, minus strand): variants in this
# range (e.g. POS 362255) are FRAME_SHIFT for veff but INTRAGENIC for snpEff.
genome.get_feature('PF3D7_0108900')


Out[39]:
('Pf3D7_01_v3',
 'chado',
 'gene',
 361987,
 362586,
 '.',
 '-',
 '.',
 'PF3D7_0108900',
 None)

In [38]:
# Direct children of gene PF3D7_0108900 (a single ncRNA transcript).
genome.get_children('PF3D7_0108900')


Out[38]:
[('Pf3D7_01_v3',
  'chado',
  'ncRNA',
  361987,
  362586,
  '.',
  '-',
  '.',
  'PF3D7_0108900.1',
  'PF3D7_0108900')]

In [40]:
# Children of the ncRNA transcript PF3D7_0108900.1.
# NOTE(review): the annotation gives this ncRNA a child of type 'CDS' —
# presumably why veff treats variants here as coding; confirm.
genome.get_children('PF3D7_0108900.1')


Out[40]:
[('Pf3D7_01_v3',
  'chado',
  'CDS',
  361987,
  362586,
  '.',
  '-',
  '0',
  'PF3D7_0108900.1:exon:1',
  'PF3D7_0108900.1')]