In [1]:
%run setup.ipynb



In [2]:
tbl_variants_selected = etl.frompickle('../data/tbl_variants_missense_selected.pkl')

Add housefly numbering


In [3]:
#to easily split the codon numbers from the codon letters
import re

#load the codon map from the blog post (with the header info removed)
md_tbl = etl.fromtsv('../data/domestica_gambiae_map.txt')

#get codons
dom = list(md_tbl['domestica_codon'])
ano = list(md_tbl['gambiae_codon'])

#make a dictionary with a as the key and d as the value
map_dict = {a: d for a, d in zip(ano, dom)}

#get the snpeff annotations
gam_cod = list(tbl_variants_selected['AGAP004707-RA'])

#ditch the codon letters - using re the regex module - LOVELY
gam_cod_cl = []
r = re.compile("([a-zA-Z]+)([0-9]+)")
for c in gam_cod:
    if c:
        d = c[0:6]
        m = r.match(d)
        g = m.group(2)
        gam_cod_cl.append(g)
len(gam_cod_cl)

MD = [map_dict[c] for c in gam_cod_cl]
MD


Out[3]:
['261',
 '410',
 '410',
 '.',
 '508',
 '508',
 '810',
 '1014',
 '1014',
 '1133',
 '1262',
 '1532',
 '1575',
 '1602',
 '1608',
 '1751',
 '1858',
 '1873',
 '1879',
 '1879',
 '1925',
 '1939',
 '1945']

In [4]:
#get the musca codon letter at these positions and add
fs = pyfasta.Fasta('../data/domestica_gambiae_PROT_MEGA.fas')

#grab the right sample
dom = fs.get('domestica_vgsc')

#remove the '-' from the aligned fasta so the numbering makes sense
dom_fix = [p for p in dom if p != '-']
#check
dom_fix[261-1],dom_fix[1945-1]

#add them to the position

MD_fix = []
for p in MD:
    if p == '.':
        MD_fix.append('-')
    if p != '.':
        MD_fix.append(dom_fix[int(p)-1]+p)

MD_fix


Out[4]:
['R261',
 'V410',
 'V410',
 '-',
 'M508',
 'M508',
 'T810',
 'L1014',
 'L1014',
 'K1133',
 'I1262',
 'I1532',
 'N1575',
 'E1602',
 'K1608',
 'A1751',
 'V1858',
 'I1873',
 'P1879',
 'P1879',
 'Y1925',
 'A1939',
 'I1945']

Build Latex table


In [5]:
tbl_function = etl.wrap([
    ['AGAP004707-RA', 'domain', 'phenotype', 'evidence', 'study'],
    ['R254K', 'IL45', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['V402L', 'IS6', r'known \texttt{I1527T} driver/enhancer)', 'assoc./\emph{in vitro}', '\cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017}'],
    ['D466H', 'LI/II', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['M490I', 'LI/II', 'no known or putative phenotype', '-', '-'],
    ['T791M', 'IIS1', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['L995S', 'IIS6', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
    ['L995F', 'IIS6', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
    ['A1125V', 'LII/III', r'no known or putative phenotype', '-', '-'],
    ['V1254I', 'LII/III', r'no known or putative phenotype', '-', '-'],
    ['I1527T', 'IIIS6', r'putative driver and two residues from known enhancer', '\emph{in vitro}', '\cite{Haddi2017}'],
    ['N1570Y', 'LIII/IV', r' known \texttt{L995F} enhancer', 'assoc./\emph{in vitro}', '\cite{Jones2012,Wang2015}'],
    ['E1597G', 'LIII/IV', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['K1603T', 'IVS1', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['A1746S', 'IVS5', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['V1853I', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['I1868T', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['P1874S', 'COOH', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
    ['P1874L', 'COOH', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
    ['F1920S', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['A1934V', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
    ['I1940T', 'COOH', r'putative \texttt{L995F} enhancer', '-', '-'],
])

In [6]:
pop_ids = 'AOM BFM GWA GNS BFS CMS GAS UGS KES'.split()

tbl_variants_display = (
    tbl_variants_selected
    # keep only the fields we need
    .cut(['POS', 'REF', 'ALT', 'ALTIX', 'FILTER_PASS', 'AGAP004707-RA'] + 
         ['AF_' + p for p in pop_ids])
    # join in function
    .leftjoin(tbl_function, key='AGAP004707-RA', missing='')
    # resort by position
    .sort(key='POS')
    # round allele frequencies to integer
    .convert(['AF_' + p for p in pop_ids], lambda v: int(np.rint(v * 100)))
    # add the column of M. domestica codons
    .addcolumn('Md', MD_fix)
    # add a formatted "substitution" field
    .addfield('substitution', lambda row: '{:,} {}>{}'.format(row['POS'], row['REF'], row['ALT']), index=5)
    .convert(['substitution', 'Md', 'AGAP004707-RA', 'domain'], lambda v: r'\texttt{%s}' % v)
    .convert('substitution', lambda v, row: v + '*' if not row['FILTER_PASS'] else v, pass_row=True)
#    .cutout('POS', 'REF', 'ALT', 'ALTIX', 'FILTER_PASS')
)
tbl_variants_display.displayall()


0|POS 1|REF 2|ALT 3|ALTIX 4|FILTER_PASS 5|substitution 6|AGAP004707-RA 7|AF_AOM 8|AF_BFM 9|AF_GWA 10|AF_GNS 11|AF_BFS 12|AF_CMS 13|AF_GAS 14|AF_UGS 15|AF_KES 16|domain 17|phenotype 18|evidence 19|study 20|Md
2390177 G A 0 True \texttt{2,390,177 G>A} \texttt{R254K} 0 0 0 0 0 32 21 0 0 \texttt{IL45} putative \texttt{L995F} enhancer - - \texttt{R261}
2391228 G C 0 True \texttt{2,391,228 G>C} \texttt{V402L} 0 7 0 0 0 0 0 0 0 \texttt{IS6} known \texttt{I1527T} driver/enhancer) assoc./\emph{in vitro} \cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017} \texttt{V410}
2391228 G T 1 True \texttt{2,391,228 G>T} \texttt{V402L} 0 7 0 0 0 0 0 0 0 \texttt{IS6} known \texttt{I1527T} driver/enhancer) assoc./\emph{in vitro} \cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017} \texttt{V410}
2399997 G C 0 True \texttt{2,399,997 G>C} \texttt{D466H} 0 0 0 0 0 7 0 0 0 \texttt{LI/II} putative \texttt{L995F} enhancer - - \texttt{-}
2400071 G A 0 True \texttt{2,400,071 G>A} \texttt{M490I} 0 0 0 0 0 0 0 0 18 \texttt{LI/II} no known or putative phenotype - - \texttt{M508}
2400071 G T 1 True \texttt{2,400,071 G>T} \texttt{M490I} 0 0 0 0 0 0 0 0 0 \texttt{LI/II} no known or putative phenotype - - \texttt{M508}
2416980 C T 0 True \texttt{2,416,980 C>T} \texttt{T791M} 0 1 0 13 14 0 0 0 0 \texttt{IIS1} putative \texttt{L995F} enhancer - - \texttt{T810}
2422651 T C 0 True \texttt{2,422,651 T>C} \texttt{L995S} 0 0 0 0 0 15 64 100 76 \texttt{IIS6} known driver assoc./\emph{in vitro} \cite{Burton2011} \texttt{L1014}
2422652 A T 0 True \texttt{2,422,652 A>T} \texttt{L995F} 86 85 0 100 100 53 36 0 0 \texttt{IIS6} known driver assoc./\emph{in vitro} \cite{Burton2011} \texttt{L1014}
2424384 C T 0 True \texttt{2,424,384 C>T} \texttt{A1125V} 9 0 0 0 0 0 0 0 0 \texttt{LII/III} no known or putative phenotype - - \texttt{K1133}
2425077 G A 0 True \texttt{2,425,077 G>A} \texttt{V1254I} 0 0 5 0 0 0 0 0 0 \texttt{LII/III} no known or putative phenotype - - \texttt{I1262}
2429617 T C 0 True \texttt{2,429,617 T>C} \texttt{I1527T} 0 14 0 0 0 0 0 0 0 \texttt{IIIS6} putative driver and two residues from known enhancer \emph{in vitro} \cite{Haddi2017} \texttt{I1532}
2429745 A T 0 False \texttt{2,429,745 A>T}* \texttt{N1570Y} 0 26 0 10 22 6 0 0 0 \texttt{LIII/IV} known \texttt{L995F} enhancer assoc./\emph{in vitro} \cite{Jones2012,Wang2015} \texttt{N1575}
2429897 A G 0 True \texttt{2,429,897 A>G} \texttt{E1597G} 0 0 0 6 4 0 0 0 0 \texttt{LIII/IV} putative \texttt{L995F} enhancer - - \texttt{E1602}
2429915 A C 0 True \texttt{2,429,915 A>C} \texttt{K1603T} 0 5 0 0 0 0 0 0 0 \texttt{IVS1} putative \texttt{L995F} enhancer - - \texttt{K1608}
2430424 G T 0 True \texttt{2,430,424 G>T} \texttt{A1746S} 0 0 0 11 13 0 0 0 0 \texttt{IVS5} putative \texttt{L995F} enhancer - - \texttt{A1751}
2430817 G A 0 True \texttt{2,430,817 G>A} \texttt{V1853I} 0 0 0 8 5 0 0 0 0 \texttt{COOH} putative \texttt{L995F} enhancer - - \texttt{V1858}
2430863 T C 0 True \texttt{2,430,863 T>C} \texttt{I1868T} 0 0 0 18 25 0 0 0 0 \texttt{COOH} putative \texttt{L995F} enhancer - - \texttt{I1873}
2430880 C T 0 True \texttt{2,430,880 C>T} \texttt{P1874S} 0 21 0 0 0 0 0 0 0 \texttt{COOH} putative \texttt{L995F} enhancer assoc. \cite{Sonoda2008} \texttt{P1879}
2430881 C T 0 True \texttt{2,430,881 C>T} \texttt{P1874L} 0 7 0 45 26 0 0 0 0 \texttt{COOH} putative \texttt{L995F} enhancer assoc. \cite{Sonoda2008} \texttt{P1879}
2431019 T C 0 True \texttt{2,431,019 T>C} \texttt{F1920S} 0 0 0 0 0 1 4 0 0 \texttt{COOH} putative \texttt{L995F} enhancer - - \texttt{Y1925}
2431061 C T 0 True \texttt{2,431,061 C>T} \texttt{A1934V} 0 12 0 0 0 0 0 0 0 \texttt{COOH} putative \texttt{L995F} enhancer - - \texttt{A1939}
2431079 T C 0 True \texttt{2,431,079 T>C} \texttt{I1940T} 0 4 0 0 0 7 0 0 0 \texttt{COOH} putative \texttt{L995F} enhancer - - \texttt{I1945}

In [7]:
prologue = r"""
\begin{tabular}{llllrrrrrrrrr}
\toprule
\multicolumn{4}{c}{Variant} &
\multicolumn{9}{c}{Population allele frequency (\%)}\\
\cmidrule(r){1-4}
\cmidrule(r){5-13}
Position\tnote{1} & 
\emph{Ag}\tnote{2} & 
\emph{Md}\tnote{3} &
Domain\tnote{4} &
AO\emph{Ac} & 
BF\emph{Ac} & 
GN\emph{Ag} & 
BF\emph{Ag} & 
CM\emph{Ag} & 
GA\emph{Ag} & 
UG\emph{Ag} & 
KE & 
GW\\
\midrule
"""
template = r"""
{substitution} & {AGAP004707-RA} & {Md} & {domain} & {AF_AOM} & {AF_BFM} & {AF_GNS} & {AF_BFS} & {AF_CMS} & {AF_GAS} & {AF_UGS} & {AF_KES} & {AF_GWA} \\
"""
epilogue = r"""
\bottomrule
\end{tabular}
"""
tbl_variants_display.totext('../tables/variants_missense.tex', 
                            encoding='ascii',
                            prologue=prologue, 
                            template=template,
                            epilogue=epilogue)

!cat ../tables/variants_missense.tex


\begin{tabular}{llllrrrrrrrrr}
\toprule
\multicolumn{4}{c}{Variant} &
\multicolumn{9}{c}{Population allele frequency (\%)}\\
\cmidrule(r){1-4}
\cmidrule(r){5-13}
Position\tnote{1} & 
\emph{Ag}\tnote{2} & 
\emph{Md}\tnote{3} &
Domain\tnote{4} &
AO\emph{Ac} & 
BF\emph{Ac} & 
GN\emph{Ag} & 
BF\emph{Ag} & 
CM\emph{Ag} & 
GA\emph{Ag} & 
UG\emph{Ag} & 
KE & 
GW\\
\midrule

\texttt{2,390,177 G>A} & \texttt{R254K} & \texttt{R261} & \texttt{IL45} & 0 & 0 & 0 & 0 & 32 & 21 & 0 & 0 & 0 \\

\texttt{2,391,228 G>C} & \texttt{V402L} & \texttt{V410} & \texttt{IS6} & 0 & 7 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,391,228 G>T} & \texttt{V402L} & \texttt{V410} & \texttt{IS6} & 0 & 7 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,399,997 G>C} & \texttt{D466H} & \texttt{-} & \texttt{LI/II} & 0 & 0 & 0 & 0 & 7 & 0 & 0 & 0 & 0 \\

\texttt{2,400,071 G>A} & \texttt{M490I} & \texttt{M508} & \texttt{LI/II} & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 18 & 0 \\

\texttt{2,400,071 G>T} & \texttt{M490I} & \texttt{M508} & \texttt{LI/II} & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,416,980 C>T} & \texttt{T791M} & \texttt{T810} & \texttt{IIS1} & 0 & 1 & 13 & 14 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,422,651 T>C} & \texttt{L995S} & \texttt{L1014} & \texttt{IIS6} & 0 & 0 & 0 & 0 & 15 & 64 & 100 & 76 & 0 \\

\texttt{2,422,652 A>T} & \texttt{L995F} & \texttt{L1014} & \texttt{IIS6} & 86 & 85 & 100 & 100 & 53 & 36 & 0 & 0 & 0 \\

\texttt{2,424,384 C>T} & \texttt{A1125V} & \texttt{K1133} & \texttt{LII/III} & 9 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,425,077 G>A} & \texttt{V1254I} & \texttt{I1262} & \texttt{LII/III} & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 5 \\

\texttt{2,429,617 T>C} & \texttt{I1527T} & \texttt{I1532} & \texttt{IIIS6} & 0 & 14 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,429,745 A>T}* & \texttt{N1570Y} & \texttt{N1575} & \texttt{LIII/IV} & 0 & 26 & 10 & 22 & 6 & 0 & 0 & 0 & 0 \\

\texttt{2,429,897 A>G} & \texttt{E1597G} & \texttt{E1602} & \texttt{LIII/IV} & 0 & 0 & 6 & 4 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,429,915 A>C} & \texttt{K1603T} & \texttt{K1608} & \texttt{IVS1} & 0 & 5 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,430,424 G>T} & \texttt{A1746S} & \texttt{A1751} & \texttt{IVS5} & 0 & 0 & 11 & 13 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,430,817 G>A} & \texttt{V1853I} & \texttt{V1858} & \texttt{COOH} & 0 & 0 & 8 & 5 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,430,863 T>C} & \texttt{I1868T} & \texttt{I1873} & \texttt{COOH} & 0 & 0 & 18 & 25 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,430,880 C>T} & \texttt{P1874S} & \texttt{P1879} & \texttt{COOH} & 0 & 21 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,430,881 C>T} & \texttt{P1874L} & \texttt{P1879} & \texttt{COOH} & 0 & 7 & 45 & 26 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,431,019 T>C} & \texttt{F1920S} & \texttt{Y1925} & \texttt{COOH} & 0 & 0 & 0 & 0 & 1 & 4 & 0 & 0 & 0 \\

\texttt{2,431,061 C>T} & \texttt{A1934V} & \texttt{A1939} & \texttt{COOH} & 0 & 12 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\

\texttt{2,431,079 T>C} & \texttt{I1940T} & \texttt{I1945} & \texttt{COOH} & 0 & 4 & 0 & 0 & 7 & 0 & 0 & 0 & 0 \\

\bottomrule
\end{tabular}

Table 2. Phenotype


In [8]:
# tbl_pheno = etl.wrap([
#     ['AGAP004707-RA', 'Md', 'domain', 'phenotype', 'evidence', 'study'],
#     ['R254K', 'R261', 'IN (I.S4--I.S5)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['V402L', 'V410', 'TM (I.S6)', r'known driver/enhancer', 'assoc./\emph{in vitro}', '\cite{Yoon2008,Hopkins2010,Park1997,Lee2013,Haddi2017}'],
#     ['D466H', '-', 'IN (I.S6--II.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['M490I', 'M508', 'IN (I.S6--II.S1)', 'no known or putative phenotype', '-', '-'],
#     ['T791M', 'T810', 'TM (II.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['L995S', 'L1014', 'TM (II.S6)', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
#     ['L995F', 'L1014', 'TM (II.S6)', r'known driver', 'assoc./\emph{in vitro}', '\cite{Burton2011}'],
#     ['A1125V', 'K1133', 'IN (II.S6--III.S1)', r'no known or putative phenotype', '-', '-'],
#     ['V1254I', 'I1262', 'IN (II.S6--III.S1)', r'no known or putative phenotype', '-', '-'],
#     ['I1527T', 'I1532', 'TM (III.S6)', r'putative driver\tnote{4}', '\emph{in vitro}', '\cite{Haddi2017}'],
#     ['N1570Y', 'N1575', 'IN (III.S6--IV.S1)', r' known \texttt{L995F} enhancer', 'assoc./\emph{in vitro}', '\cite{Jones2012,Wang2015}'],
#     ['E1597G', 'E1602', 'IN (III.S6--IV.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['K1603T', 'K1608', 'TM (IV.S1)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['A1746S', 'A1751', 'TM (IV.S5)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['V1853I', 'V1858', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['I1868T', 'I1873', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['P1874S', 'P1879', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
#     ['P1874L', 'P1879', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', 'assoc.', '\cite{Sonoda2008}'],
#     ['F1920S', 'Y1925', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['A1934V', 'A1939', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
#     ['I1940T', 'I1945', 'IN (IV.S6--)', r'putative \texttt{L995F} enhancer', '-', '-'],
# ])

# prologue = r"""
# \begin{tabular}{llllll}
# \toprule
# \multicolumn{2}{c}{Variant} &
# \multicolumn{4}{c}{Function}\\
# \cmidrule(r){1-2}
# \cmidrule(r){3-6}
# \emph{Ag} & 
# \emph{Md} & Domain\tnote{1} & 
# Phenotype\tnote{2} &
# Experimental evidence\tnote{3} &
# Publication\\
# \midrule
# """
# template = r"""
# {AGAP004707-RA} & {Md} & {domain} & {phenotype} & {evidence} & {study} \\
# """
# epilogue = r"""
# \bottomrule
# \end{tabular}
# """
# tbl_pheno.totext('../tables/variants_pheno.tex', 
#                             encoding='ascii',
#                             prologue=prologue, 
#                             template=template,
#                             epilogue=epilogue)

# !cat ../tables/variants_pheno.tex

In [ ]: