In [1]:
with open("requirements.txt", "r") as handle:
print(handle.read())
To run this notebook for yourself create a new virtual environment in the prompt:
$ virtualenv env
$ source env/bin/activate
$ pip install numpy
$ pip install -r requirements
Then to run the notebook itself:
$ ipython notebook
In [2]:
%matplotlib inline
In [3]:
from collections import defaultdict
import os
from os.path import join as pjoin
import subprocess
import re
import json
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
from Bio.Graphics.GenomeDiagram import CrossLink
from Bio.Blast.Applications import NcbirpstblastnCommandline
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast.Applications import NcbipsiblastCommandline
from Bio.SeqFeature import SeqFeature
from Bio.SeqFeature import FeatureLocation
from reportlab.lib import colors
from reportlab.lib.units import cm
from BCBio import GFF
from IPython.display import SVG
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
In [4]:
#plt.style.use('ggplot')
In [5]:
pd.set_option("display.max_colwidth", 800)
pd.options.display.max_rows = 999
THREADS = 2 # Number of CPUs available for calls to BLAST etc.
In [6]:
STEPS = [
'data',
"01-found_distribution_of_scaffold_length",
"02-found_length_distribution_of_scaffolds_with_genes",
"03-place_candidate_scaffold_lengths_distribution",
"04-found_dist_nearest_scaf_end_plotted",
"05-found_distance_nearest_te_gene"
]
for step in STEPS:
if not os.path.isdir(step):
os.mkdir(step)
Jason identified 10 candidate AvrRvi5 genes.
Nine were discovered by comparative genomics, and 1 was discovered by comparative transcriptomics of 14 day old Venturia inaequalis cultures in potato dextrose broth (PDB).
atg10719
was the candidate identified by the transcriptomics method, it is expressed in MNH120 but not in Race 5.
Below is a summary dataframe of the candidates:
In [7]:
CANDIDATES = pd.read_csv(pjoin(STEPS[0], "summary.DNA-seq.RNA-seq.csv"))
CANDIDATES
Out[7]:
In [8]:
CANDIDATES.loc[4,'gene'] = 'atg140'
In [9]:
GENOME_FILES = {
'MNH120': {
'fasta': pjoin(STEPS[0], 'MNH120.genome.masked.fasta'),
'gff': [
pjoin(STEPS[0], 'augustus.hints.GeneWithUTR.withNCBI_Fungi.gff3'),
pjoin(STEPS[0], 'MNH120.REPET_TEs.gff3'),
pjoin(STEPS[0], 'MNH120.REPET_SSRs.gff3'),
]
},
'B04': {
'fasta': pjoin(STEPS[0], 'B04.genome.fa'),
'gff': [
pjoin(STEPS[0], 'B04.genes.gff3'),
]
},
'USR5': {
'fasta': pjoin(STEPS[0], 'I5V.gapfilled.final.fa'),
'gff': [
pjoin(STEPS[0], 'I5V.PredictedPass.cleaned.gff3'),
]
},
'I9A': {
'fasta': pjoin(STEPS[0], 'I9A.gapfilled.final.fa'),
'gff': [
pjoin(STEPS[0], 'I9A.PredictedPass.cleaned.gff3'),
]
},
'I17V': {
'fasta': pjoin(STEPS[0], 'I17V.gapfilled.final.fa'),
'gff': [
pjoin(STEPS[0], 'I17V.PredictedPass.cleaned.gff3'),
]
},
'I19A': {
'fasta': pjoin(STEPS[0], 'I19A.gapfilled.final.fa'),
'gff': [
pjoin(STEPS[0], 'I19A.PredictedPass.cleaned.gff3'),
]
},
'I31A': {
'fasta': pjoin(STEPS[0], 'I31A.gapfilled.final.fa'),
'gff': [
pjoin(STEPS[0], 'I31A.gapfilled.final.cleaned.gff3'),
],
},
'I37A': {
'fasta': pjoin(STEPS[0], 'I37A.gapfilled.final.fa'),
'gff': [
pjoin(STEPS[0], 'I37A.PredictedPass.cleaned.gff3'),
],
},
'I61A': {
'fasta': pjoin(STEPS[0], 'I61A.gapfilled.final.fa'),
'gff': [
pjoin(STEPS[0], 'I61A.PredictedPass.cleaned.gff3'),
],
},
'I65V': {
'fasta': pjoin(STEPS[0], 'I65V.gapfilled.final.fa'),
'gff': [
pjoin(STEPS[0], 'I65V.PredictedPass.cleaned.gff3'),
],
}
}
In [10]:
GENOMES = dict()
for isolate, files in GENOME_FILES.items():
GENOMES[isolate] = SeqIO.to_dict(
SeqIO.parse(files['fasta'], format="fasta")
)
for gff in files['gff']:
genome_with_features = GFF.parse(
gff,
base_dict=GENOMES[isolate]
)
GENOMES[isolate].update(SeqIO.to_dict(genome_with_features))
In [11]:
GENES = dict()
GENES['MNH120'] = SeqIO.to_dict(
SeqIO.parse(pjoin(STEPS[0], 'MNH120.Auto.genes.fna'), format="fasta")
)
In [12]:
cdict = {
'red': ((0., 59/256, 59/256),
(0.25, 120/256, 120/256),
(0.5, 235/256, 235/236),
(0.75, 225/256, 225/256),
(1., 242/256, 242/256)),
'green': ((0., 156/256, 156/256),
(0.25, 183/256, 183/256),
(0.5, 204/256, 204/256),
(0.75, 175/256, 175/256),
(1., 26/256, 26/256)),
'blue': ((0., 178/256, 178/256),
(0.25, 197/256, 197/256),
(0.5, 42/256, 42/256),
(0.75, 0., 0.),
(1., 0., 0.))
}
zissou = LinearSegmentedColormap('Zissou', cdict)
plt.register_cmap(cmap=zissou)
cat_colours = ["#FF0000", "#00A08A", "#F2AD00", "#F98400", "#5BBCD6", "#046C9A", '#35274A', '#B40F20', "#D69C4E"]
In [13]:
mpl.rcParams['text.usetex']=True
mpl.rcParams['text.latex.preamble'] = [
r'\usepackage{siunitx}', # i need upright \micro symbols, but you need...
r'\sisetup{detect-all}', # ...this to force siunitx to actually use your fonts
r'\usepackage{helvet}', # set the normal font here
r'\usepackage{sansmath}', # load up the sansmath so that math -> helvet
r'\sansmath' # <- tricky! -- gotta actually tell tex to use!
]
In [14]:
scaffold_lengths = list()
for isolate, genome in GENOMES.items():
for scaffold, sequence in genome.items():
row = {
'isolate': isolate,
'scaffold': scaffold,
'length': len(sequence)
}
scaffold_lengths.append(row)
scaffold_lengths = pd.DataFrame(scaffold_lengths)
scaffold_lengths.to_csv(
pjoin(STEPS[1], 'scaffold_lengths.csv'),
index=False,
sep='\t',
encoding='utf-8',
header=True
)
isolate_names = list()
isolate_lengths = list()
for i, j in scaffold_lengths.groupby(['isolate'])['length']:
isolate_names.append(i)
isolate_lengths.append(j.values)
fig, ax = plt.subplots()
dummy = ax.boxplot(isolate_lengths, labels=isolate_names, vert=False, showmeans=True, widths=0.8)
fig.savefig(pjoin(STEPS[1], 'scaffold_distribution.png'))
In [15]:
scaffold_lengths_genes = list()
for isolate, genome in GENOMES.items():
for scaffold, sequence in genome.items():
if len([f for f in sequence.features if f.type == 'gene']) == 0:
continue
row = {
'isolate': isolate,
'scaffold': scaffold,
'length': len(sequence)
}
scaffold_lengths_genes.append(row)
scaffold_lengths_genes = pd.DataFrame(scaffold_lengths_genes)
scaffold_lengths_genes.to_csv(
pjoin(STEPS[2], 'scaffold_lengths_containing_genes.csv'),
index=False,
sep='\t',
encoding='utf-8',
header=True
)
isolate_names = list()
isolate_lengths = list()
for i, j in scaffold_lengths_genes.groupby(['isolate'])['length']:
isolate_names.append(i)
isolate_lengths.append(j.values)
fig, ax = plt.subplots()
dummy = ax.boxplot(isolate_lengths, labels=isolate_names, vert=False, showmeans=True, widths=0.8)
fig.savefig(pjoin(STEPS[2], 'scaffold_distribution.png'))
In [16]:
with open(pjoin(STEPS[0], 'MNH120_candidate_homologues.json'), 'r') as handle:
candidate_homologues = json.load(handle)
positions = defaultdict(dict)
for candidate in CANDIDATES['gene']:
print(candidate)
for isolate, homologues in candidate_homologues[candidate].items():
homologue = homologues[0]
scaffold = homologue['scaffold']
positions[candidate][isolate] = len(GENOMES[isolate][scaffold])
print("{:>8}:".format(isolate), len(GENOMES[isolate][scaffold]))
with open(pjoin(STEPS[3], 'candidate_scaf_lengths.json'), 'w') as handle:
json.dump(positions, handle)
In [17]:
%run lib/subset_features.py
In [18]:
all_scaffold_lengths = list()
for isolate, scaffolds in GENOMES.items():
for scaffold, record in scaffolds.items():
for feat in record.features:
if feat.type != 'gene':
continue
distance = min([
feat.location.start,
feat.location.end,
len(record) - feat.location.start,
len(record) - feat.location.end
])
row = {
'feature': feat.id,
'isolate': isolate,
'scaffold': scaffold,
'length': len(record),
'distance': distance
}
all_scaffold_lengths.append(row)
all_scaffold_lengths = pd.DataFrame(all_scaffold_lengths)
all_scaffold_lengths.to_csv(
pjoin(STEPS[4], 'all_scaffold_lengths.csv'),
index=False,
sep='\t',
encoding='utf-8',
header=True
)
with open(pjoin(STEPS[0], 'MNH120_candidate_homologues.json'), 'r') as handle:
candidate_homologues = json.load(handle)
# candidate: genome: scaffold: feature1, feature2, ...
features = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
scaffold_lengths = list()
for candidate in CANDIDATES['gene']:
for isolate, homologues in candidate_homologues[candidate].items():
homologue = homologues[0]
scaffold = homologue['scaffold']
these_features = subset_features(
GENOMES[isolate][scaffold],
homologue['start'],
homologue['end']
)
these_features = [f for f in these_features if f.type == 'gene']
if len(these_features) == 0:
feature = SeqFeature(
location=FeatureLocation(
homologue['start'],
homologue['end'],
homologue['strand'],
),
id=isolate + '_' + candidate + '_homologue',
)
features[candidate][isolate][scaffold] = feature
else:
feature = these_features[0]
features[candidate][isolate][scaffold] = feature
distance = min([
feature.location.start,
feature.location.end,
len(GENOMES[isolate][scaffold]) - feature.location.start,
len(GENOMES[isolate][scaffold]) - feature.location.end
])
row = {
'candidate': candidate,
'feature': feature.id,
'isolate': isolate,
'scaffold': scaffold,
'length': len(GENOMES[isolate][scaffold]),
'distance': distance
}
scaffold_lengths.append(row)
candidate_scaffold_lengths = pd.DataFrame(scaffold_lengths)
candidate_scaffold_lengths.to_csv(
pjoin(STEPS[4], 'candidate_scaffold_lengths.csv'),
index=False,
sep='\t',
encoding='utf-8',
header=True
)
In [21]:
all_scaffold_lengths = pd.read_table(pjoin(STEPS[4], 'all_scaffold_lengths.csv'))
candidate_scaffold_lengths = pd.read_table(pjoin(STEPS[4], 'candidate_scaffold_lengths.csv'))
fig, ax = plt.subplots()
hexbin = ax.hexbin(
x=all_scaffold_lengths['distance'].values,
y=all_scaffold_lengths['length'].values,
cmap=zissou,
yscale='log',
gridsize=40,
)
cbar = plt.colorbar(hexbin, ax=ax)
cbar.set_label('Number of genes in bin')
if len(hexbin._offsets) == 1:
mask = list()
for i, path in enumerate(hexbin._paths):
if (path._vertices[0][0] / path._vertices[0][1] > 0.5) & \
(hexbin._A[i] == 0.):
mask.append(True)
else:
mask.append(False)
mask = np.array(mask)
else:
mask = (hexbin._offsets[:, 0] / hexbin._offsets[:, 1] >= 1/2) & \
(hexbin._A == 0.)
hexbin._A = np.ma.masked_where(mask, hexbin._A)
scatter = ax.scatter(
x=candidate_scaffold_lengths['distance'].values,
y=candidate_scaffold_lengths['length'].values,
color='#000000',
)
ax.set_xlim(-15000, 6.5e+5)
xticklabels = ax.set_xticklabels([0, 0] + [r'${} \times 10^{{5}}$'.format(i) for i in range(1, 7)])
#ax.autoscale()
ax.set_ylim(400, 1.5 * 10**6)
ax.set_ylabel('Scaffold size (bp)')
ax.set_xlabel('Distance to scaffold end (bp)')
fig.set_size_inches(7, 6)
fig.savefig(pjoin(STEPS[4], "scaf_dist_all.png"), dpi=300)
fig.savefig(pjoin(STEPS[4], "scaf_dist_all.pdf"))
fig.savefig(pjoin(STEPS[4], "scaf_dist_all.svg"))
In [22]:
candidates = list(CANDIDATES['gene'])
for candidate in candidates:
fig, ax = plt.subplots()
hexbin = ax.hexbin(
x=all_scaffold_lengths['distance'].values,
y=all_scaffold_lengths['length'].values,
cmap=zissou,
yscale='log',
gridsize=40,
)
cbar = plt.colorbar(hexbin, ax=ax)
cbar.set_label('Number of genes in bin')
if len(hexbin._offsets) == 1:
mask = list()
for i, path in enumerate(hexbin._paths):
if (path._vertices[0][0] / path._vertices[0][1] > 0.5) & \
(hexbin._A[i] == 0.):
mask.append(True)
else:
mask.append(False)
mask = np.array(mask)
else:
mask = (hexbin._offsets[:, 0] / hexbin._offsets[:, 1] >= 1/2) & \
(hexbin._A == 0.)
hexbin._A = np.ma.masked_where(mask, hexbin._A)
scatter = ax.scatter(
x=candidate_scaffold_lengths[
candidate_scaffold_lengths.candidate == candidate
]['distance'].values,
y=candidate_scaffold_lengths[
candidate_scaffold_lengths.candidate == candidate
]['length'].values,
color='#000000',
)
ax.set_xlim(-15000, 6.5e+5)
xticklabels = ax.set_xticklabels([0, 0] + [r'${} \times 10^{{5}}$'.format(i) for i in range(1, 7)])
#ax.autoscale()
ax.set_ylim(400, 1.5 * 10**6)
ax.set_ylabel('Scaffold size (bp)')
ax.set_xlabel('Distance to scaffold end (bp)')
ax.set_title(candidate)
fig.set_size_inches(7, 6)
fig.savefig(pjoin(STEPS[4], "scaf_dist_{}.png".format(candidate)), dpi=300)
fig.savefig(pjoin(STEPS[4], "scaf_dist_{}.pdf".format(candidate)))
fig.savefig(pjoin(STEPS[4], "scaf_dist_{}.svg".format(candidate)))
In [23]:
candidate = 'atg4020'
fig, ax = plt.subplots()
hexbin = ax.hexbin(
x=all_scaffold_lengths[
all_scaffold_lengths.isolate == 'MNH120'
]['distance'].values,
y=all_scaffold_lengths[
all_scaffold_lengths.isolate == 'MNH120'
]['length'].values,
cmap=zissou,
yscale='log',
gridsize=40,
)
if len(hexbin._offsets) == 1:
mask = list()
for i, path in enumerate(hexbin._paths):
if (path._vertices[0][0] / path._vertices[0][1] > 0.5) & \
(hexbin._A[i] == 0.):
mask.append(True)
else:
mask.append(False)
mask = np.array(mask)
else:
mask = (hexbin._offsets[:, 0] / hexbin._offsets[:, 1] >= 1/2) & \
(hexbin._A == 0.)
hexbin._A = np.ma.masked_where(mask, hexbin._A)
cbar = plt.colorbar(hexbin, ax=ax)
cbar.set_label('Number of genes in bin')
scatter = ax.scatter(
x=candidate_scaffold_lengths[
candidate_scaffold_lengths.isolate == 'MNH120'
]['distance'].values,
y=candidate_scaffold_lengths[
candidate_scaffold_lengths.isolate == 'MNH120'
]['length'].values,
color='#000000'
)
ax.set_xlim(-15000, 6e+5)
xticklabels = ax.set_xticklabels([0, 0] + [r'${} \times 10^{{5}}$'.format(i) for i in range(1, 7)])
#ax.autoscale()
ax.set_ylim(2000, 1.3 * 10**6)
ax.set_ylabel('Scaffold size (bp)')
ax.set_xlabel('Distance to scaffold end (bp)')
fig.set_size_inches(7, 6)
fig.savefig(pjoin(STEPS[4], "scaf_dist_mnh120.png"), dpi=300)
fig.savefig(pjoin(STEPS[4], "scaf_dist_mnh120.pdf"))
fig.savefig(pjoin(STEPS[4], "scaf_dist_mnh120.svg"))
In [22]:
# Preprocessing genome
## Super inefficient implementation
rows = list()
for isolate, genome in GENOMES.items():
if isolate != 'MNH120':
continue
# We currently only have REPET data for MNH120, others to come.
for scaffold, sequence in genome.items():
these_tes = [f for f in sequence.features if ('te' in f.type.lower() or 'ssr' in f.type.lower())]
these_genes = [f for f in sequence.features if f.type == 'gene']
## sort by min(feature.start, feature.end)
#these_tes.sort(key=lambda f: min(f.location.start, f.location.end))
#these_genes.sort(key=lambda f: min(f.location.start, f.location.end))
if len(these_genes) == 1: # Can't find nearest gene in this case
continue
if len(these_tes) == 0: # Can't find nearest te in this case
continue
for i, gene in enumerate(these_genes):
id_ = gene.id
start = gene.location.start
end = gene.location.end
min_gene_dist = float('inf')
for j, n_gene in enumerate(these_genes):
if j == i:
continue
n_start = n_gene.location.start
n_end = n_gene.location.end
gene_dist = min(
abs(start - n_start),
abs(start - n_end),
abs(end - n_start),
abs(end - n_end),
)
if gene_dist < min_gene_dist:
min_gene_dist = gene_dist
min_te_dist = float('inf')
for te in these_tes:
te_start = te.location.start
te_end = te.location.end
te_dist = min(
abs(start - te_start),
abs(start - te_end),
abs(end - te_start),
abs(end - te_end),
)
if te_dist < min_te_dist:
min_te_dist = te_dist
row = {
'isolate': isolate,
'scaffold': scaffold,
'feature': id_,
'nearest_gene': min_gene_dist,
'nearest_te': min_te_dist
}
rows.append(row)
rows = pd.DataFrame(rows)
rows.loc[rows['nearest_te'] <= 0., 'nearest_te'] = 1
rows.to_csv(pjoin(STEPS[5], 'nearest_gene_te.csv'), sep='\t', index=False)
Because I am going to plot these on a log scale, any zero values will screw up the plot because log(0) = -infinity. I changed those values to 1.
In [23]:
rows = pd.read_table(pjoin(STEPS[5], 'nearest_gene_te.csv'), sep='\t')
candidates = set(CANDIDATES['gene'])
candidate_pos = rows[rows['feature'].isin(candidates)]
candidate_pos
Out[23]:
In [24]:
rows = pd.read_table(pjoin(STEPS[5], 'nearest_gene_te.csv'), sep='\t')
fig, ax = plt.subplots()
im = ax.hexbin(
x=rows['nearest_te'].values,
y=rows['nearest_gene'].values,
cmap=zissou,
yscale='log',
xscale='log',
gridsize=40,
)
cbar = plt.colorbar(im, ax=ax)
cbar.set_label('Number of genes in bin')
#ax.set_xlim(-1, 10**5)
ax.autoscale
scatter = ax.scatter(
x=candidate_pos['nearest_te'].values,
y=candidate_pos['nearest_gene'].values,
c='#000000',
lw=0.,
#alpha=.70,
)
ax.set_ylabel('Nearest gene (bp)')
ax.set_xlabel('Nearest TE (bp)')
fig.set_size_inches(7, 6)
fig.savefig(pjoin(STEPS[5], 'heatmap_te_gene.png'), dpi=300, )
fig.savefig(pjoin(STEPS[5], 'heatmap_te_gene.pdf'))
fig.savefig(pjoin(STEPS[5], 'heatmap_te_gene.svg'))
plt.show()