Required software

The following packages were used in this analysis:



In [1]:

    
with open("requirements.txt", "r") as handle:
    print(handle.read())









    



bcbio-gff==0.6.2
biopython==1.65
certifi==2015.4.28
ipython==3.1.0
Jinja2==2.7.3
json-schema==0.3
jsonschema==2.4.0
MarkupSafe==0.23
matplotlib==1.4.3
mistune==0.5.1
nose==1.3.6
numpy==1.9.2
pandas==0.16.0
Pillow==2.8.1
ptyprocess==0.4
Pygments==2.0.2
pyparsing==2.0.3
python-dateutil==2.4.2
pytz==2015.2
pyzmq==14.6.0
reportlab==3.1.44
scipy==0.15.1
six==1.9.0
terminado==0.5
tornado==4.1

To run this notebook for yourself create a new virtual environment in the prompt:

$ virtualenv env
$ source env/bin/activate
$ pip install numpy
$ pip install -r requirements

Then to run the notebook itself:

$ ipython notebook



In [2]:

    
%matplotlib inline



In [3]:

    
from collections import defaultdict
import os
from os.path import join as pjoin
import subprocess
import re
import json

from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.Graphics.GenomeDiagram import CrossLink
from Bio.Blast.Applications import NcbirpstblastnCommandline
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast.Applications import NcbipsiblastCommandline
from Bio.SeqFeature import SeqFeature
from Bio.SeqFeature import FeatureLocation
from reportlab.lib import colors
from reportlab.lib.units import cm
from BCBio import GFF
from IPython.display import SVG
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap



In [4]:

    
#plt.style.use('ggplot')



In [5]:

    
pd.set_option("display.max_colwidth", 800)
pd.options.display.max_rows = 999
THREADS = 2  # Number of CPUs available for calls to BLAST etc.



In [6]:

    
STEPS = [
    'data',
    "01-found_distribution_of_scaffold_length",
    "02-found_length_distribution_of_scaffolds_with_genes",
    "03-place_candidate_scaffold_lengths_distribution",
    "04-found_dist_nearest_scaf_end_plotted",
    "05-found_distance_nearest_te_gene"
    ]

for step in STEPS:
    if not os.path.isdir(step):
        os.mkdir(step)

Candidates

Jason identified 10 candidate AvrRvi5 genes. Nine were discovered by comparative genomics, and 1 was discovered by comparative transcriptomics of 14 day old Venturia inaequalis cultures in potato dextrose broth (PDB). atg10719 was the candidate identified by the transcriptomics method, it is expressed in MNH120 but not in Race 5.

Below is a summary dataframe of the candidates:



In [7]:

    
CANDIDATES = pd.read_csv(pjoin(STEPS[0], "summary.DNA-seq.RNA-seq.csv"))
CANDIDATES









    Out[7]:






  
    
      
      method
      gene
      aa_mutation
      length
      gb_acc
      blast_desc
      organism
      evalue
      identity
    
  
  
    
      0
      DNA-seq
      atg161
      T184I
      237
      No hit
      NaN
      NaN
      NaN
      NaN
    
    
      1
      DNA-seq
      atg162
      T417A
      459
      WP_030543571.1
      Hypothetical protein
      Streptomyces albus
      0.000000008
      29
    
    
      2
      DNA-seq
      atg4020
      P174PNAKPDAKPDAKPDAKP, V155L
      185
      XP_007283047.1
      Snoal-like polyketide cyclase family protein
      Colletotrichum gloeosporioides
      3E-029
      40.15
    
    
      3
      DNA-seq
      atg12487
      V388M
      403
      ADT70774.1
      Cellobiose dehydrogenase
      Dichomera saubinetii
      7E-013
      55.56
    
    
      4
      DNA-seq
      atg140* (not original gene call) first 15 amino acids removed
      D83N
      232
      No hit
      NaN
      NaN
      NaN
      NaN
    
    
      5
      DNA-seq
      atg7465
      V463I
      626
      EKG19035.1
      Glycoside hydrolase family 1
      Macrophomina phaesolina
      0
      75.36
    
    
      6
      DNA-seq
      atg10724
      K178N,K239R
      292
      No hit
      NaN
      NaN
      NaN
      NaN
    
    
      7
      DNA-seq
      atg10703
      G102A
      202
      No hit
      NaN
      NaN
      NaN
      NaN
    
    
      8
      DNA-seq
      atg4290
      L423S
      436
      XP_007795436.1
      putative chloroperoxidase-like protein [Eutypa lata UCREL1]
      Eutypa lata
      1E-022
      38
    
    
      9
      RNA-seq
      atg10719
      Not expressed in race  5
      86
      No hit
      No hit
      No hit
      No hit
      No hit



In [8]:

    
CANDIDATES.loc[4,'gene'] = 'atg140'



In [9]:

    
GENOME_FILES = {
    'MNH120': {
        'fasta': pjoin(STEPS[0], 'MNH120.genome.masked.fasta'),
        'gff': [
            pjoin(STEPS[0], 'augustus.hints.GeneWithUTR.withNCBI_Fungi.gff3'),
            pjoin(STEPS[0], 'MNH120.REPET_TEs.gff3'),
            pjoin(STEPS[0], 'MNH120.REPET_SSRs.gff3'),
            ]
        },
    'B04': {
        'fasta': pjoin(STEPS[0], 'B04.genome.fa'),
        'gff': [
            pjoin(STEPS[0], 'B04.genes.gff3'),
            ]
        },
    'USR5': {
        'fasta': pjoin(STEPS[0], 'I5V.gapfilled.final.fa'),
        'gff': [
            pjoin(STEPS[0], 'I5V.PredictedPass.cleaned.gff3'),
            ]
        },
    'I9A': {
        'fasta': pjoin(STEPS[0], 'I9A.gapfilled.final.fa'),
        'gff': [
            pjoin(STEPS[0], 'I9A.PredictedPass.cleaned.gff3'),
            ]
        },
    'I17V': {
        'fasta': pjoin(STEPS[0], 'I17V.gapfilled.final.fa'),
        'gff': [
            pjoin(STEPS[0], 'I17V.PredictedPass.cleaned.gff3'),
            ]
        },
    'I19A': {
        'fasta': pjoin(STEPS[0], 'I19A.gapfilled.final.fa'),
        'gff': [
            pjoin(STEPS[0], 'I19A.PredictedPass.cleaned.gff3'),
            ]
        },
    'I31A': {
        'fasta': pjoin(STEPS[0], 'I31A.gapfilled.final.fa'),
        'gff': [
            pjoin(STEPS[0], 'I31A.gapfilled.final.cleaned.gff3'),
            ],
        },
    'I37A': {
        'fasta': pjoin(STEPS[0], 'I37A.gapfilled.final.fa'),
        'gff': [
            pjoin(STEPS[0], 'I37A.PredictedPass.cleaned.gff3'),
            ],
        },
    'I61A': {
        'fasta': pjoin(STEPS[0], 'I61A.gapfilled.final.fa'),
        'gff': [
            pjoin(STEPS[0], 'I61A.PredictedPass.cleaned.gff3'),
            ],
        },
    'I65V': {
        'fasta': pjoin(STEPS[0], 'I65V.gapfilled.final.fa'),
        'gff': [
            pjoin(STEPS[0], 'I65V.PredictedPass.cleaned.gff3'),
            ],
        }
    }



In [10]:

    
GENOMES = dict()
for isolate, files in GENOME_FILES.items():
    GENOMES[isolate] = SeqIO.to_dict(
        SeqIO.parse(files['fasta'], format="fasta")
        )

    for gff in files['gff']:
        genome_with_features = GFF.parse(
            gff,
            base_dict=GENOMES[isolate]
            )
        GENOMES[isolate].update(SeqIO.to_dict(genome_with_features))



In [11]:

    
GENES = dict()
GENES['MNH120'] = SeqIO.to_dict(
    SeqIO.parse(pjoin(STEPS[0], 'MNH120.Auto.genes.fna'), format="fasta")
    )



In [12]:

    
cdict = {
    'red':   ((0., 59/256, 59/256),
              (0.25, 120/256, 120/256),
              (0.5, 235/256, 235/236),
              (0.75, 225/256, 225/256),
              (1., 242/256, 242/256)),
    'green': ((0., 156/256, 156/256),
              (0.25, 183/256, 183/256),
              (0.5, 204/256, 204/256),
              (0.75, 175/256, 175/256),
              (1., 26/256, 26/256)),
    'blue':  ((0., 178/256, 178/256),
              (0.25, 197/256, 197/256),
              (0.5, 42/256, 42/256),
              (0.75, 0., 0.),
              (1., 0., 0.))
    }

zissou = LinearSegmentedColormap('Zissou', cdict)
plt.register_cmap(cmap=zissou)

cat_colours = ["#FF0000", "#00A08A", "#F2AD00", "#F98400", "#5BBCD6", "#046C9A", '#35274A', '#B40F20', "#D69C4E"]



In [13]:

    
mpl.rcParams['text.usetex']=True
mpl.rcParams['text.latex.preamble'] = [
       r'\usepackage{siunitx}',   # i need upright \micro symbols, but you need...
       r'\sisetup{detect-all}',   # ...this to force siunitx to actually use your fonts
       r'\usepackage{helvet}',    # set the normal font here
       r'\usepackage{sansmath}',  # load up the sansmath so that math -> helvet
       r'\sansmath'               # <- tricky! -- gotta actually tell tex to use!
]

Step 01 - found distribution of scaffold length



In [14]:

    
scaffold_lengths = list()

for isolate, genome in GENOMES.items():
    for scaffold, sequence in genome.items():
        row = {
            'isolate': isolate,
            'scaffold': scaffold,
            'length': len(sequence)
            }
        scaffold_lengths.append(row)

scaffold_lengths = pd.DataFrame(scaffold_lengths)
scaffold_lengths.to_csv(
    pjoin(STEPS[1], 'scaffold_lengths.csv'),
    index=False,
    sep='\t',
    encoding='utf-8',
    header=True
    )

isolate_names = list()
isolate_lengths = list()
for i, j in scaffold_lengths.groupby(['isolate'])['length']:
    isolate_names.append(i)
    isolate_lengths.append(j.values)
fig, ax = plt.subplots()
dummy = ax.boxplot(isolate_lengths, labels=isolate_names, vert=False, showmeans=True, widths=0.8)
fig.savefig(pjoin(STEPS[1], 'scaffold_distribution.png'))

Step 02 - looking again only at scaffolds with genes



In [15]:

    
scaffold_lengths_genes = list()

for isolate, genome in GENOMES.items():
    for scaffold, sequence in genome.items():
        if len([f for f in sequence.features if f.type == 'gene']) == 0:
            continue
        row = {
            'isolate': isolate,
            'scaffold': scaffold,
            'length': len(sequence)
            }
        scaffold_lengths_genes.append(row)

scaffold_lengths_genes = pd.DataFrame(scaffold_lengths_genes)
scaffold_lengths_genes.to_csv(
    pjoin(STEPS[2], 'scaffold_lengths_containing_genes.csv'),
    index=False,
    sep='\t',
    encoding='utf-8',
    header=True
    )

isolate_names = list()
isolate_lengths = list()
for i, j in scaffold_lengths_genes.groupby(['isolate'])['length']:
    isolate_names.append(i)
    isolate_lengths.append(j.values)
fig, ax = plt.subplots()
dummy = ax.boxplot(isolate_lengths, labels=isolate_names, vert=False, showmeans=True, widths=0.8)
fig.savefig(pjoin(STEPS[2], 'scaffold_distribution.png'))

Step 03 - place candidate scaffold lengths in distribution



In [16]:

    
with open(pjoin(STEPS[0], 'MNH120_candidate_homologues.json'), 'r') as handle:
    candidate_homologues = json.load(handle)

positions = defaultdict(dict)
for candidate in CANDIDATES['gene']:
    print(candidate)
    for isolate, homologues in candidate_homologues[candidate].items():
        homologue = homologues[0]
        scaffold = homologue['scaffold']
        positions[candidate][isolate] = len(GENOMES[isolate][scaffold])
        print("{:>8}:".format(isolate), len(GENOMES[isolate][scaffold]))
    
with open(pjoin(STEPS[3], 'candidate_scaf_lengths.json'), 'w') as handle:
    json.dump(positions, handle)









    



atg161
    I19A: 148984
    I65V: 336338
  MNH120: 65488
    I37A: 1058651
    USR5: 249012
    I61A: 111918
     B04: 99402
     I9A: 106603
    I31A: 67120
    I17V: 615942
atg162
    I19A: 148984
    I65V: 336338
  MNH120: 65488
    I37A: 1058651
    USR5: 249012
    I61A: 111918
     B04: 99402
     I9A: 106603
    I31A: 67120
    I17V: 615942
atg4020
    I19A: 407893
    I65V: 241990
  MNH120: 9883
    I37A: 1058651
    USR5: 306619
    I61A: 223483
     B04: 112942
     I9A: 238720
    I31A: 55337
    I17V: 615942
atg12487
    I19A: 236498
    I65V: 336338
  MNH120: 656374
    I37A: 294686
    USR5: 249012
    I61A: 235822
     B04: 173445
     I9A: 226476
    I31A: 107040
    I17V: 615942
atg140
    I19A: 261048
    I65V: 173274
  MNH120: 25314
    I37A: 89232
    USR5: 605022
    I61A: 539802
     B04: 104567
     I9A: 154533
    I31A: 65464
    I17V: 290198
atg7465
    I19A: 148984
    I65V: 336338
  MNH120: 103197
    I37A: 1058651
    USR5: 249012
    I61A: 111918
     B04: 99402
     I9A: 106603
    I31A: 67120
    I17V: 615942
atg10724
    I19A: 96264
    I65V: 241990
  MNH120: 141894
    I37A: 1058651
    USR5: 306619
    I61A: 322091
     B04: 99402
     I9A: 107544
    I31A: 44436
    I17V: 615942
atg10703
    I19A: 96264
    I65V: 336338
  MNH120: 141894
    I37A: 1058651
    USR5: 249012
    I61A: 322091
     B04: 99402
     I9A: 107544
    I31A: 40696
    I17V: 615942
atg4290
    I19A: 315437
    I65V: 44742
  MNH120: 377941
    I37A: 74141
    USR5: 89503
    I61A: 36239
     B04: 111428
     I9A: 442334
    I31A: 36313
    I17V: 87555
atg10719
    I19A: 96264
    I65V: 241990
  MNH120: 141894
    I37A: 1058651
    USR5: 306619
    I61A: 322091
     B04: 99402
     I9A: 107544
    I31A: 44436
    I17V: 615942

Step 04 - Find distance to nearest scaffold end.



In [17]:

    
%run lib/subset_features.py



In [18]:

    
all_scaffold_lengths = list()
for isolate, scaffolds in GENOMES.items():
    for scaffold, record in scaffolds.items():
        for feat in record.features:
            if feat.type != 'gene':
                continue
            distance = min([
                feat.location.start,
                feat.location.end,
                len(record) - feat.location.start,
                len(record) - feat.location.end
                ])
            row = {
                'feature': feat.id,
                'isolate': isolate,
                'scaffold': scaffold,
                'length': len(record),
                'distance': distance
                }
            all_scaffold_lengths.append(row)

all_scaffold_lengths = pd.DataFrame(all_scaffold_lengths)
all_scaffold_lengths.to_csv(
    pjoin(STEPS[4], 'all_scaffold_lengths.csv'),
    index=False,
    sep='\t',
    encoding='utf-8',
    header=True
    )

with open(pjoin(STEPS[0], 'MNH120_candidate_homologues.json'), 'r') as handle:
    candidate_homologues = json.load(handle)

# candidate: genome: scaffold: feature1, feature2, ...
features = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
scaffold_lengths = list()
for candidate in CANDIDATES['gene']:
    for isolate, homologues in candidate_homologues[candidate].items():
        homologue = homologues[0]
        scaffold = homologue['scaffold']
        these_features = subset_features(
            GENOMES[isolate][scaffold],
            homologue['start'],
            homologue['end']
            )
        these_features = [f for f in these_features if f.type == 'gene']
        if len(these_features) == 0:
            feature = SeqFeature(
                location=FeatureLocation(
                    homologue['start'],
                    homologue['end'],
                    homologue['strand'],
                    ),
                id=isolate + '_' + candidate + '_homologue',
                )
            features[candidate][isolate][scaffold] = feature
        else:
            feature = these_features[0]
            features[candidate][isolate][scaffold] = feature

        distance = min([
            feature.location.start,
            feature.location.end,
            len(GENOMES[isolate][scaffold]) - feature.location.start,
            len(GENOMES[isolate][scaffold]) - feature.location.end
            ])
        row = {
            'candidate': candidate,
            'feature': feature.id,
            'isolate': isolate,
            'scaffold': scaffold,
            'length': len(GENOMES[isolate][scaffold]),
            'distance': distance
            }
        scaffold_lengths.append(row)

candidate_scaffold_lengths = pd.DataFrame(scaffold_lengths)
candidate_scaffold_lengths.to_csv(
    pjoin(STEPS[4], 'candidate_scaffold_lengths.csv'),
    index=False,
    sep='\t',
    encoding='utf-8',
    header=True
    )



In [21]:

    
all_scaffold_lengths = pd.read_table(pjoin(STEPS[4], 'all_scaffold_lengths.csv'))

candidate_scaffold_lengths = pd.read_table(pjoin(STEPS[4], 'candidate_scaffold_lengths.csv'))

fig, ax = plt.subplots()
hexbin = ax.hexbin(
    x=all_scaffold_lengths['distance'].values, 
    y=all_scaffold_lengths['length'].values,
    cmap=zissou,
    yscale='log',
    gridsize=40,
    )
cbar = plt.colorbar(hexbin, ax=ax)
cbar.set_label('Number of genes in bin')
if len(hexbin._offsets) == 1:
    mask = list()
    for i, path in enumerate(hexbin._paths):
        if (path._vertices[0][0] / path._vertices[0][1] > 0.5) & \
                (hexbin._A[i] == 0.):
            mask.append(True)
        else:
            mask.append(False)
    mask = np.array(mask)
else:
    mask = (hexbin._offsets[:, 0] / hexbin._offsets[:, 1] >= 1/2) & \
                (hexbin._A == 0.)
hexbin._A = np.ma.masked_where(mask, hexbin._A)

scatter = ax.scatter(
    x=candidate_scaffold_lengths['distance'].values,
    y=candidate_scaffold_lengths['length'].values,
    color='#000000',
    )
ax.set_xlim(-15000, 6.5e+5)
xticklabels = ax.set_xticklabels([0, 0] + [r'${} \times 10^{{5}}$'.format(i) for i in range(1, 7)])
#ax.autoscale()
ax.set_ylim(400, 1.5 * 10**6)
ax.set_ylabel('Scaffold size (bp)')
ax.set_xlabel('Distance to scaffold end (bp)')
fig.set_size_inches(7, 6)
fig.savefig(pjoin(STEPS[4], "scaf_dist_all.png"), dpi=300)
fig.savefig(pjoin(STEPS[4], "scaf_dist_all.pdf"))
fig.savefig(pjoin(STEPS[4], "scaf_dist_all.svg"))



In [22]:

    
candidates = list(CANDIDATES['gene'])

for candidate in candidates:
    fig, ax = plt.subplots()
    hexbin = ax.hexbin(
        x=all_scaffold_lengths['distance'].values, 
        y=all_scaffold_lengths['length'].values,
        cmap=zissou,
        yscale='log',
        gridsize=40,
        )
    cbar = plt.colorbar(hexbin, ax=ax)
    cbar.set_label('Number of genes in bin')
    if len(hexbin._offsets) == 1:
        mask = list()
        for i, path in enumerate(hexbin._paths):
            if (path._vertices[0][0] / path._vertices[0][1] > 0.5) & \
                    (hexbin._A[i] == 0.):
                mask.append(True)
            else:
                mask.append(False)
        mask = np.array(mask)
    else:
        mask = (hexbin._offsets[:, 0] / hexbin._offsets[:, 1] >= 1/2) & \
                    (hexbin._A == 0.)
    hexbin._A = np.ma.masked_where(mask, hexbin._A)

    scatter = ax.scatter(
        x=candidate_scaffold_lengths[
            candidate_scaffold_lengths.candidate == candidate
            ]['distance'].values,
        y=candidate_scaffold_lengths[
            candidate_scaffold_lengths.candidate == candidate
            ]['length'].values,
        color='#000000',
        )
    ax.set_xlim(-15000, 6.5e+5)
    xticklabels = ax.set_xticklabels([0, 0] + [r'${} \times 10^{{5}}$'.format(i) for i in range(1, 7)])
    #ax.autoscale()
    ax.set_ylim(400, 1.5 * 10**6)
    ax.set_ylabel('Scaffold size (bp)')
    ax.set_xlabel('Distance to scaffold end (bp)')
    ax.set_title(candidate)
    fig.set_size_inches(7, 6)
    fig.savefig(pjoin(STEPS[4], "scaf_dist_{}.png".format(candidate)), dpi=300)
    fig.savefig(pjoin(STEPS[4], "scaf_dist_{}.pdf".format(candidate)))
    fig.savefig(pjoin(STEPS[4], "scaf_dist_{}.svg".format(candidate)))



In [23]:

    
candidate = 'atg4020'

fig, ax = plt.subplots()
hexbin = ax.hexbin(
    x=all_scaffold_lengths[
        all_scaffold_lengths.isolate == 'MNH120'
        ]['distance'].values, 
    y=all_scaffold_lengths[
        all_scaffold_lengths.isolate == 'MNH120'
        ]['length'].values,
    cmap=zissou,
    yscale='log',
    gridsize=40,
    )

if len(hexbin._offsets) == 1:
    mask = list()
    for i, path in enumerate(hexbin._paths):
        if (path._vertices[0][0] / path._vertices[0][1] > 0.5) & \
                (hexbin._A[i] == 0.):
            mask.append(True)
        else:
            mask.append(False)
    mask = np.array(mask)
else:
    mask = (hexbin._offsets[:, 0] / hexbin._offsets[:, 1] >= 1/2) & \
                (hexbin._A == 0.)
hexbin._A = np.ma.masked_where(mask, hexbin._A)

cbar = plt.colorbar(hexbin, ax=ax)
cbar.set_label('Number of genes in bin')
scatter = ax.scatter(
    x=candidate_scaffold_lengths[
        candidate_scaffold_lengths.isolate == 'MNH120'
        ]['distance'].values,
    y=candidate_scaffold_lengths[
        candidate_scaffold_lengths.isolate == 'MNH120'
        ]['length'].values,
    color='#000000'
    )

ax.set_xlim(-15000, 6e+5)
xticklabels = ax.set_xticklabels([0, 0] + [r'${} \times 10^{{5}}$'.format(i) for i in range(1, 7)])
#ax.autoscale()
ax.set_ylim(2000, 1.3 * 10**6)
ax.set_ylabel('Scaffold size (bp)')
ax.set_xlabel('Distance to scaffold end (bp)')
fig.set_size_inches(7, 6)
fig.savefig(pjoin(STEPS[4], "scaf_dist_mnh120.png"), dpi=300)
fig.savefig(pjoin(STEPS[4], "scaf_dist_mnh120.pdf"))
fig.savefig(pjoin(STEPS[4], "scaf_dist_mnh120.svg"))

Step 05 - find distance to nearest TE-Gene



In [22]:

    
# Preprocessing genome
## Super inefficient implementation

rows = list()
for isolate, genome in GENOMES.items():
    if isolate != 'MNH120':
        continue
        # We currently only have REPET data for MNH120, others to come.
    for scaffold, sequence in genome.items():
        these_tes = [f for f in sequence.features if ('te' in f.type.lower() or 'ssr' in f.type.lower())]
        these_genes = [f for f in sequence.features if f.type == 'gene']
        
        ## sort by min(feature.start, feature.end)
        #these_tes.sort(key=lambda f: min(f.location.start, f.location.end))
        #these_genes.sort(key=lambda f: min(f.location.start, f.location.end))

        if len(these_genes) == 1:  # Can't find nearest gene in this case
            continue
        if len(these_tes) == 0:  # Can't find nearest te in this case
            continue

        for i, gene in enumerate(these_genes):
            id_ = gene.id 
            start = gene.location.start
            end = gene.location.end
            
            min_gene_dist = float('inf')
            for j, n_gene in enumerate(these_genes):
                if j == i:
                    continue
                n_start = n_gene.location.start
                n_end = n_gene.location.end
                gene_dist = min(
                    abs(start - n_start),
                    abs(start - n_end),
                    abs(end - n_start),
                    abs(end - n_end),
                    )
                if gene_dist < min_gene_dist:
                    min_gene_dist = gene_dist

            min_te_dist = float('inf')
            for te in these_tes:
                te_start = te.location.start
                te_end = te.location.end
                te_dist = min(
                    abs(start - te_start),
                    abs(start - te_end),
                    abs(end - te_start),
                    abs(end - te_end),
                    )
                if te_dist < min_te_dist:
                    min_te_dist = te_dist

            row = {
                'isolate': isolate,
                'scaffold': scaffold,
                'feature': id_,
                'nearest_gene': min_gene_dist,
                'nearest_te': min_te_dist
                }
            rows.append(row)
rows = pd.DataFrame(rows)
rows.loc[rows['nearest_te'] <= 0., 'nearest_te'] = 1
rows.to_csv(pjoin(STEPS[5], 'nearest_gene_te.csv'), sep='\t', index=False)

Because I am going to plot these on a log scale, any zero values will screw up the plot because log(0) = -infinity. I changed those values to 1.



In [23]:

    
rows = pd.read_table(pjoin(STEPS[5], 'nearest_gene_te.csv'), sep='\t')
candidates = set(CANDIDATES['gene'])
candidate_pos = rows[rows['feature'].isin(candidates)]
candidate_pos









    Out[23]:






  
    
      
      feature
      isolate
      nearest_gene
      nearest_te
      scaffold
    
  
  
    
      2815
      atg10703
      MNH120
      281
      976
      scaffold_996
    
    
      2831
      atg10719
      MNH120
      841
      345
      scaffold_996
    
    
      2836
      atg10724
      MNH120
      107
      4126
      scaffold_996
    
    
      4219
      atg161
      MNH120
      647
      20
      scaffold_48
    
    
      4220
      atg162
      MNH120
      797
      214
      scaffold_48
    
    
      4308
      atg7465
      MNH120
      622
      24
      scaffold_893
    
    
      5614
      atg12487
      MNH120
      682
      155
      scaffold_978
    
    
      7826
      atg140
      MNH120
      11265
      1
      scaffold_38
    
    
      9724
      atg4290
      MNH120
      202
      7
      scaffold_447



In [24]:

    
rows = pd.read_table(pjoin(STEPS[5], 'nearest_gene_te.csv'), sep='\t')

fig, ax = plt.subplots()
im = ax.hexbin(
    x=rows['nearest_te'].values,
    y=rows['nearest_gene'].values,
    cmap=zissou,
    yscale='log',
    xscale='log',
    gridsize=40,
    )
cbar = plt.colorbar(im, ax=ax)
cbar.set_label('Number of genes in bin')
#ax.set_xlim(-1, 10**5)
ax.autoscale

scatter = ax.scatter(
    x=candidate_pos['nearest_te'].values,
    y=candidate_pos['nearest_gene'].values,
    c='#000000',
    lw=0.,
    #alpha=.70,
    )

ax.set_ylabel('Nearest gene (bp)')
ax.set_xlabel('Nearest TE (bp)')
fig.set_size_inches(7, 6)
fig.savefig(pjoin(STEPS[5], 'heatmap_te_gene.png'), dpi=300, )
fig.savefig(pjoin(STEPS[5], 'heatmap_te_gene.pdf'))
fig.savefig(pjoin(STEPS[5], 'heatmap_te_gene.svg'))
plt.show()

	method	gene	aa_mutation	length	gb_acc	blast_desc	organism	evalue	identity
0	DNA-seq	atg161	T184I	237	No hit	NaN	NaN	NaN	NaN
1	DNA-seq	atg162	T417A	459	WP_030543571.1	Hypothetical protein	Streptomyces albus	0.000000008	29
2	DNA-seq	atg4020	P174PNAKPDAKPDAKPDAKP, V155L	185	XP_007283047.1	Snoal-like polyketide cyclase family protein	Colletotrichum gloeosporioides	3E-029	40.15
3	DNA-seq	atg12487	V388M	403	ADT70774.1	Cellobiose dehydrogenase	Dichomera saubinetii	7E-013	55.56
4	DNA-seq	atg140* (not original gene call) first 15 amino acids removed	D83N	232	No hit	NaN	NaN	NaN	NaN
5	DNA-seq	atg7465	V463I	626	EKG19035.1	Glycoside hydrolase family 1	Macrophomina phaesolina	0	75.36
6	DNA-seq	atg10724	K178N,K239R	292	No hit	NaN	NaN	NaN	NaN
7	DNA-seq	atg10703	G102A	202	No hit	NaN	NaN	NaN	NaN
8	DNA-seq	atg4290	L423S	436	XP_007795436.1	putative chloroperoxidase-like protein [Eutypa lata UCREL1]	Eutypa lata	1E-022	38
9	RNA-seq	atg10719	Not expressed in race 5	86	No hit	No hit	No hit	No hit	No hit

	feature	isolate	nearest_gene	nearest_te	scaffold
2815	atg10703	MNH120	281	976	scaffold_996
2831	atg10719	MNH120	841	345	scaffold_996
2836	atg10724	MNH120	107	4126	scaffold_996
4219	atg161	MNH120	647	20	scaffold_48
4220	atg162	MNH120	797	214	scaffold_48
4308	atg7465	MNH120	622	24	scaffold_893
5614	atg12487	MNH120	682	155	scaffold_978
7826	atg140	MNH120	11265	1	scaffold_38
9724	atg4290	MNH120	202	7	scaffold_447