In [ ]:
%run './ipython_startup.py'

In [ ]:
import bed as mcbed

CDS


In [ ]:
# Import bed file of genes of interest
bedName = os.path.join(PROJ, 'exported_data/sd_coding_sequence.bed')
bed = mcbed.Bed(bedName)

# Create data frame with positions of interest (poi)
poi = list()
for row in bed:
    start = min(row[1], row[2])
    end = max(row[1], row[2])
    poi.append(pd.DataFrame([(row[3], row[0], x) for x in xrange(start, end)], columns=['gene', 'chrom', 'pos']))

dfPoi = pd.concat(poi)
# Import Parental RIL SNP calls
fname = '/home/jfear/sandbox/cegs_sem_sd_paper/from_matt/SD-pathway_files/format_SNP_files/SNPtableBWA.txt'

# Ignore following founders
ignore = ['3RA2', 'B5XSAM', 'B5XSS']

out = list()
with open(fname, 'r') as FH:
    for row in FH:
        chrom, pos, minor, major, founderID, minorFreq, readCnt = row.strip().split('\t')
        if founderID not in ignore:
            pos = int(pos)
            minorFreq = float(minorFreq)
            if minorFreq > 0:
                out.append((chrom, pos, founderID, minor))
            else:
                out.append((chrom, pos, founderID, major))
df = pd.DataFrame(out, columns=['chrom', 'pos', 'founderID', 'base'])

In [ ]:
# Import Parental RIL SNP calls
fname = '/home/jfear/sandbox/cegs_sem_sd_paper/from_matt/SD-pathway_files/format_SNP_files/SNPtableBWA.txt'

# Ignore following founders
ignore = ['3RA2', 'B5XSAM', 'B5XSS']

out = list()
with open(fname, 'r') as FH:
    for row in FH:
        chrom, pos, minor, major, founderID, minorFreq, readCnt = row.strip().split('\t')
        if founderID not in ignore:
            pos = int(pos)
            minorFreq = float(minorFreq)
            if minorFreq > 0:
                out.append((chrom, pos, founderID, minor))
            else:
                out.append((chrom, pos, founderID, major))
df = pd.DataFrame(out, columns=['chrom', 'pos', 'founderID', 'base'])

In [ ]:
# Merge positions of interest to SNP data
dfFilter = df.merge(dfPoi, how='inner', on=['chrom', 'pos'])

# Make wide data set of filtered data
dfFilter.set_index(['gene', 'chrom', 'pos', 'founderID'], inplace=True)
dfWide = dfFilter.unstack(level='founderID')

# Iterate over gene and count the number of haplotypes
grp = dfWide.groupby(level='gene')
out = list()
for name, g in grp:
    uniq = set([tuple(x) for x in g.T.values])
    out.append([name, len(uniq)])

cds_out = pd.DataFrame(out, columns=['gene', 'cds_dspr'])
cds_out.set_index('gene', inplace=True)

Upstream and Downstream of TSS


In [ ]:
# Import bed file of genes of interest
bedName = os.path.join(PROJ, 'exported_data/sd_up_down_mRNA_sequence.bed')
bed = mcbed.Bed(bedName)

# Create data frame with positions of interest (poi)
poi = list()
for row in bed:
    start = min(row[1], row[2])
    end = max(row[1], row[2])
    poi.append(pd.DataFrame([(row[3], row[0], x) for x in xrange(start, end)], columns=['gene', 'chrom', 'pos']))

dfPoi = pd.concat(poi)

In [ ]:
# Import Parental RIL SNP calls
fname = '/home/jfear/sandbox/cegs_sem_sd_paper/from_matt/SD-pathway_files/format_SNP_files/SNPtableBWA.txt'

# Ignore following founders
ignore = ['3RA2', 'B5XSAM', 'B5XSS']

out = list()
with open(fname, 'r') as FH:
    for row in FH:
        chrom, pos, minor, major, founderID, minorFreq, readCnt = row.strip().split('\t')
        if founderID not in ignore:
            pos = int(pos)
            minorFreq = float(minorFreq)
            if minorFreq > 0:
                out.append((chrom, pos, founderID, minor))
            else:
                out.append((chrom, pos, founderID, major))
df = pd.DataFrame(out, columns=['chrom', 'pos', 'founderID', 'base'])

In [ ]:
# Merge positions of interest to SNP data
dfFilter = df.merge(dfPoi, how='inner', on=['chrom', 'pos'])

# Make wide data set of filtered data
dfFilter.set_index(['gene', 'chrom', 'pos', 'founderID'], inplace=True)
dfWide = dfFilter.unstack(level='founderID')

# Iterate over gene and count the number of haplotypes
grp = dfWide.groupby(level='gene')
out = list()
for name, g in grp:
    uniq = set([tuple(x) for x in g.T.values])
    out.append([name, len(uniq)])

mRNA_out = pd.DataFrame(out, columns=['gene', 'mRNA_dspr'])
mRNA_out.set_index('gene', inplace=True)

In [ ]:
dspr = pd.concat([cds_out, mRNA_out], axis=1)

In [ ]: