Generate Sex Determination BED File

This script parses Flybase 5.51 gff data to create several different BED files.

  • sd_coding_sequence.bed: Contains CDS coordiantes for each gene
  • sd_up_down_mRNA_sequence.bed: Contains 2kb upstream and downstream of each mRNA start
  • sd_gene_sequence.bed: Contains entire gene region.

In [ ]:
%run './ipython_startup.py'

In [ ]:
import gff as mcgff

In [ ]:
# function to merge overlapping regions
def mergeOverlap(cds, gene, chrom):
    cds.sort(key=lambda x: x[0])
    cds.sort(key=lambda x: x[1])
    
    out = list()
    for i, cs in enumerate(cds):
        cstart = cs[0]
        cend = cs[1]
        
        if i == 0:
            pstart = cstart
            pend = cend
        elif cstart <= pend:
            pend = cend
        else:
            out.append((chrom, pstart-1, pend, gene))
            pstart = cstart
            pend = cend
    out.append((chrom, pstart-1, pend, gene))
    
    return out

In [ ]:
# Connect to GFF data base
fly = mcgff.FlyGff(os.path.join(MCLAB, 'useful_dmel_data/flybase551/flybase_files/dmel-all-no-analysis-r5.51.gff'))

In [ ]:
# list of genes in sex determination hierarchy
sdGenes = ['B52', 'Psi', 'Rbp1', 'Spf45', 'Sxl', 'Yp1', 'Yp2', 'Yp3', 
         'dsx', 'fl(2)d', 'fru', 'her', 'ix', 'mub', 'ps', 'snf', 
         'sqd', 'tra', 'tra2', 'vir']

In [ ]:
# Generate CDS bed file
out = list()
for gene in sdGenes:
    fbgn = fly.symbol2fbgn[gene]
    chrom = fly.db[fbgn].chrom
    CDS = [(min(x.start, x.end), max(x.start, x.end)) for x in fly.db.children(fbgn, featuretype='CDS')]    
    merged = mergeOverlap(CDS, gene, chrom)
    out.extend(merged)
    
with open(os.path.join(PROJ, 'exported_data/sd_coding_sequence.bed'), 'w') as OUT:
    for row in out:
        OUT.write('\t'.join([str(x) for x in row]) + "\n")

In [ ]:
# Generate 2kb up-down of mRNA start site bed file

out = list()
for gene in sdGenes:
    fbgn = fly.symbol2fbgn[gene]
    chrom = fly.db[fbgn].chrom
    strand = fly.db[fbgn].strand
    mRNA = fly.db.children(fbgn, featuretype='mRNA')
    
    if strand == '+':
        TSS = [(ts.start - 2000, ts.start + 2000) for ts in mRNA]
    elif strand == '-':
        TSS = [(ts.end - 2000, ts.end + 2000) for ts in mRNA]
            
    merged = mergeOverlap(TSS, gene, chrom)
    out.extend(merged)
    
with open(os.path.join(PROJ, 'exported_data/sd_up_down_mRNA_sequence.bed'), 'w') as OUT:
    for row in out:
        OUT.write('\t'.join([str(x) for x in row]) + "\n")

In [ ]:
# Generate Gene Region bed file
out = list()
for name in sdGenes:
    fbgn = fly.symbol2fbgn[name]
    gene = fly.db[fbgn]
    chrom = gene.chrom
    start = gene.start
    end = gene.end
    
    out.append((chrom, start - 1, end, name))
    
with open(os.path.join(PROJ, 'exported_data/sd_gene_sequence.bed'), 'w') as OUT:
    for row in out:
        OUT.write('\t'.join([str(x) for x in row]) + "\n")

In [ ]: