This script parses Flybase 5.51 gff data to create several different BED files.
In [ ]:
%run './ipython_startup.py'
In [ ]:
import gff as mcgff
In [ ]:
# function to merge overlapping regions
def mergeOverlap(cds, gene, chrom):
cds.sort(key=lambda x: x[0])
cds.sort(key=lambda x: x[1])
out = list()
for i, cs in enumerate(cds):
cstart = cs[0]
cend = cs[1]
if i == 0:
pstart = cstart
pend = cend
elif cstart <= pend:
pend = cend
else:
out.append((chrom, pstart-1, pend, gene))
pstart = cstart
pend = cend
out.append((chrom, pstart-1, pend, gene))
return out
In [ ]:
# Connect to GFF data base
fly = mcgff.FlyGff(os.path.join(MCLAB, 'useful_dmel_data/flybase551/flybase_files/dmel-all-no-analysis-r5.51.gff'))
In [ ]:
# list of genes in sex determination hierarchy
sdGenes = ['B52', 'Psi', 'Rbp1', 'Spf45', 'Sxl', 'Yp1', 'Yp2', 'Yp3',
'dsx', 'fl(2)d', 'fru', 'her', 'ix', 'mub', 'ps', 'snf',
'sqd', 'tra', 'tra2', 'vir']
In [ ]:
# Generate CDS bed file
out = list()
for gene in sdGenes:
fbgn = fly.symbol2fbgn[gene]
chrom = fly.db[fbgn].chrom
CDS = [(min(x.start, x.end), max(x.start, x.end)) for x in fly.db.children(fbgn, featuretype='CDS')]
merged = mergeOverlap(CDS, gene, chrom)
out.extend(merged)
with open(os.path.join(PROJ, 'exported_data/sd_coding_sequence.bed'), 'w') as OUT:
for row in out:
OUT.write('\t'.join([str(x) for x in row]) + "\n")
In [ ]:
# Generate 2kb up-down of mRNA start site bed file
out = list()
for gene in sdGenes:
fbgn = fly.symbol2fbgn[gene]
chrom = fly.db[fbgn].chrom
strand = fly.db[fbgn].strand
mRNA = fly.db.children(fbgn, featuretype='mRNA')
if strand == '+':
TSS = [(ts.start - 2000, ts.start + 2000) for ts in mRNA]
elif strand == '-':
TSS = [(ts.end - 2000, ts.end + 2000) for ts in mRNA]
merged = mergeOverlap(TSS, gene, chrom)
out.extend(merged)
with open(os.path.join(PROJ, 'exported_data/sd_up_down_mRNA_sequence.bed'), 'w') as OUT:
for row in out:
OUT.write('\t'.join([str(x) for x in row]) + "\n")
In [ ]:
# Generate Gene Region bed file
out = list()
for name in sdGenes:
fbgn = fly.symbol2fbgn[name]
gene = fly.db[fbgn]
chrom = gene.chrom
start = gene.start
end = gene.end
out.append((chrom, start - 1, end, name))
with open(os.path.join(PROJ, 'exported_data/sd_gene_sequence.bed'), 'w') as OUT:
for row in out:
OUT.write('\t'.join([str(x) for x in row]) + "\n")
In [ ]: