In [1]:
%run setup.ipynb
In [2]:
tbl_variants_selected = etl.frompickle('../data/tbl_variants_missense_selected.pkl')
In [3]:
#to easily split the codon numbers from the codon letters
import re
#load the codon map from the blog post (with the header info removed)
md_tbl = etl.fromtsv('../data/domestica_gambiae_map.txt')
#get codons
dom = list(md_tbl['domestica_codon'])
ano = list(md_tbl['gambiae_codon'])
#make a dictionary with a as the key and d as the value
map_dict = {a: d for a, d in zip(ano, dom)}
#get the snpeff annotations
gam_cod = list(tbl_variants_selected['AGAP004707-RA'])
#ditch the codon letters - using re the regex module - LOVELY
gam_cod_cl = []
r = re.compile("([a-zA-Z]+)([0-9]+)")
for c in gam_cod:
if c:
d = c[0:6]
m = r.match(d)
g = m.group(2)
gam_cod_cl.append(g)
len(gam_cod_cl)
MD = [map_dict[c] for c in gam_cod_cl]
MD
Out[3]:
In [4]:
#get the musca codon letter at these positions and add
fs = pyfasta.Fasta('../data/domestica_gambiae_PROT_MEGA.fas')
#grab the right sample
dom = fs.get('domestica_vgsc')
#remove the '-' from the aligned fasta so the numbering makes sense
dom_fix = [p for p in dom if p != '-']
#check
dom_fix[261-1],dom_fix[1945-1]
#add them to the position
MD_fix = []
for p in MD:
if p == '.':
MD_fix.append('-')
if p != '.':
MD_fix.append(dom_fix[int(p)-1]+p)
MD_fix
Out[4]:
In [5]:
tbl_function = etl.wrap([
['AGAP004707-RA', 'domain', 'phenotype', 'evidence', 'study'],
['R254K', 'IL45', r'putative \texttt{L995F} enhancer', '-', '-'],
['V402L', 'IS6', r'known \texttt{I1527T} driver/enhancer)', 'assoc./\emph{in vitro}', '\cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017}'],
['D466H', 'LI/II', r'putative \texttt{L995F} enhancer', '-', '-'],
['M490I', 'LI/II', 'no known or putative phenotype', '-', '-'],
['T791M', 'IIS1', r'putative \texttt{L995F} enhancer', '-', '-'],
['L995S', 'IIS6', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
['L995F', 'IIS6', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
['A1125V', 'LII/III', r'no known or putative phenotype', '-', '-'],
['V1254I', 'LII/III', r'no known or putative phenotype', '-', '-'],
['I1527T', 'IIIS6', r'putative driver and two residues from known enhancer', '\emph{in vitro}', '\cite{Haddi2017}'],
['N1570Y', 'LIII/IV', r' known \texttt{L995F} enhancer', 'assoc./\emph{in vitro}', '\cite{Jones2012,Wang2015}'],
['E1597G', 'LIII/IV', r'putative \texttt{L995F} enhancer', '-', '-'],
['K1603T', 'IVS1', r'putative \texttt{L995F} enhancer', '-', '-'],
['A1746S', 'IVS5', r'putative \texttt{L995F} enhancer', '-', '-'],
['V1853I', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
['I1868T', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
['P1874S', 'COOH', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
['P1874L', 'COOH', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
['F1920S', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
['A1934V', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
['I1940T', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
])
In [6]:
pop_ids = 'AOM BFM GWA GNS BFS CMS GAS UGS KES'.split()
tbl_variants_display = (
tbl_variants_selected
# keep only the fields we need
.cut(['POS', 'REF', 'ALT', 'ALTIX', 'FILTER_PASS', 'AGAP004707-RA'] +
['AF_' + p for p in pop_ids])
# join in function
.leftjoin(tbl_function, key='AGAP004707-RA', missing='')
# resort by position
.sort(key='POS')
# round allele frequencies to integer
.convert(['AF_' + p for p in pop_ids], lambda v: int(np.rint(v * 100)))
# add the column of M. domestica codons
.addcolumn('Md', MD_fix)
# add a formatted "substitution" field
.addfield('substitution', lambda row: '{:,} {}>{}'.format(row['POS'], row['REF'], row['ALT']), index=5)
.convert(['substitution', 'Md', 'AGAP004707-RA', 'domain'], lambda v: r'\texttt{%s}' % v)
.convert('substitution', lambda v, row: v + '*' if not row['FILTER_PASS'] else v, pass_row=True)
# .cutout('POS', 'REF', 'ALT', 'ALTIX', 'FILTER_PASS')
)
tbl_variants_display.displayall()
In [7]:
prologue = r"""
\begin{tabular}{llllrrrrrrrrr}
\toprule
\multicolumn{4}{c}{Variant} &
\multicolumn{9}{c}{Population allele frequency (\%)}\\
\cmidrule(r){1-4}
\cmidrule(r){5-13}
Position\tnote{1} &
\emph{Ag}\tnote{2} &
\emph{Md}\tnote{3} &
Domain\tnote{4} &
AO\emph{Ac} &
BF\emph{Ac} &
GN\emph{Ag} &
BF\emph{Ag} &
CM\emph{Ag} &
GA\emph{Ag} &
UG\emph{Ag} &
KE &
GW\\
\midrule
"""
template = r"""
{substitution} & {AGAP004707-RA} & {Md} & {domain} & {AF_AOM} & {AF_BFM} & {AF_GNS} & {AF_BFS} & {AF_CMS} & {AF_GAS} & {AF_UGS} & {AF_KES} & {AF_GWA} \\
"""
epilogue = r"""
\bottomrule
\end{tabular}
"""
tbl_variants_display.totext('../tables/variants_missense.tex',
encoding='ascii',
prologue=prologue,
template=template,
epilogue=epilogue)
!cat ../tables/variants_missense.tex
In [8]:
# tbl_pheno = etl.wrap([
# ['AGAP004707-RA', 'Md', 'domain', 'phenotype', 'evidence', 'study'],
# ['R254K', 'R261', 'IN (I.S4--I.S5)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['V402L', 'V410', 'TM (I.S6)', r'known driver/enhancer', 'assoc./\emph{in vitro}', '\cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017}'],
# ['D466H', '-', 'IN (I.S6--II.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['M490I', 'M508', 'IN (I.S6--II.S1)', 'no known or putative phenotype', '-', '-'],
# ['T791M', 'T810', 'TM (II.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['L995S', 'L1014', 'TM (II.S6)', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
# ['L995F', 'L1014', 'TM (II.S6)', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
# ['A1125V', 'K1133', 'IN (II.S6--III.S1)', r'no known or putative phenotype', '-', '-'],
# ['V1254I', 'I1262', 'IN (II.S6--III.S1)', r'no known or putative phenotype', '-', '-'],
# ['I1527T', 'I1532', 'TM (III.S6)', r'putative driver\tnote{4}', '\emph{in vitro}', '\cite{Haddi2017}'],
# ['N1570Y', 'N1575', 'IN (III.S6--IV.S1)', r' known \texttt{L995F} enhancer', 'assoc./\emph{in vitro}', '\cite{Jones2012,Wang2015}'],
# ['E1597G', 'E1602', 'IN (III.S6--IV.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['K1603T', 'K1608', 'TM (IV.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['A1746S', 'A1751', 'TM (IV.S5)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['V1853I', 'V1858', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['I1868T', 'I1873', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['P1874S', 'P1879', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
# ['P1874L', 'P1879', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
# ['F1920S', 'Y1925', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['A1934V', 'A1939', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ['I1940T', 'I1945', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ])
# prologue = r"""
# \begin{tabular}{llllll}
# \toprule
# \multicolumn{2}{c}{Variant} &
# \multicolumn{4}{c}{Function}\\
# \cmidrule(r){1-2}
# \cmidrule(r){3-6}
# \emph{Ag} &
# \emph{Md} & Domain\tnote{1} &
# Phenotype\tnote{2} &
# Experimental evidence\tnote{3} &
# Publication\\
# \midrule
# """
# template = r"""
# {AGAP004707-RA} & {Md} & {domain} & {phenotype} & {evidence} & {study} \\
# """
# epilogue = r"""
# \bottomrule
# \end{tabular}
# """
# tbl_pheno.totext('../tables/variants_pheno.tex',
# encoding='ascii',
# prologue=prologue,
# template=template,
# epilogue=epilogue)
# !cat ../tables/variants_pheno.tex
In [ ]: