In [ ]:
%run './ipython_startup.py'
In [ ]:
import bed as mcbed
In [ ]:
# Import bed file of genes of interest
bedName = os.path.join(PROJ, 'exported_data/sd_coding_sequence.bed')
bed = mcbed.Bed(bedName)
# Create data frame with positions of interest (poi)
poi = list()
for row in bed:
start = min(row[1], row[2])
end = max(row[1], row[2])
poi.append(pd.DataFrame([(row[3], row[0], x) for x in xrange(start, end)], columns=['gene', 'chrom', 'pos']))
dfPoi = pd.concat(poi)
# Import Parental RIL SNP calls
fname = '/home/jfear/sandbox/cegs_sem_sd_paper/from_matt/SD-pathway_files/format_SNP_files/SNPtableBWA.txt'
# Ignore following founders
ignore = ['3RA2', 'B5XSAM', 'B5XSS']
out = list()
with open(fname, 'r') as FH:
for row in FH:
chrom, pos, minor, major, founderID, minorFreq, readCnt = row.strip().split('\t')
if founderID not in ignore:
pos = int(pos)
minorFreq = float(minorFreq)
if minorFreq > 0:
out.append((chrom, pos, founderID, minor))
else:
out.append((chrom, pos, founderID, major))
df = pd.DataFrame(out, columns=['chrom', 'pos', 'founderID', 'base'])
In [ ]:
# Import Parental RIL SNP calls
fname = '/home/jfear/sandbox/cegs_sem_sd_paper/from_matt/SD-pathway_files/format_SNP_files/SNPtableBWA.txt'
# Ignore following founders
ignore = ['3RA2', 'B5XSAM', 'B5XSS']
out = list()
with open(fname, 'r') as FH:
for row in FH:
chrom, pos, minor, major, founderID, minorFreq, readCnt = row.strip().split('\t')
if founderID not in ignore:
pos = int(pos)
minorFreq = float(minorFreq)
if minorFreq > 0:
out.append((chrom, pos, founderID, minor))
else:
out.append((chrom, pos, founderID, major))
df = pd.DataFrame(out, columns=['chrom', 'pos', 'founderID', 'base'])
In [ ]:
# Merge positions of interest to SNP data
dfFilter = df.merge(dfPoi, how='inner', on=['chrom', 'pos'])
# Make wide data set of filtered data
dfFilter.set_index(['gene', 'chrom', 'pos', 'founderID'], inplace=True)
dfWide = dfFilter.unstack(level='founderID')
# Iterate over gene and count the number of haplotypes
grp = dfWide.groupby(level='gene')
out = list()
for name, g in grp:
uniq = set([tuple(x) for x in g.T.values])
out.append([name, len(uniq)])
cds_out = pd.DataFrame(out, columns=['gene', 'cds_dspr'])
cds_out.set_index('gene', inplace=True)
In [ ]:
# Import bed file of genes of interest
bedName = os.path.join(PROJ, 'exported_data/sd_up_down_mRNA_sequence.bed')
bed = mcbed.Bed(bedName)
# Create data frame with positions of interest (poi)
poi = list()
for row in bed:
start = min(row[1], row[2])
end = max(row[1], row[2])
poi.append(pd.DataFrame([(row[3], row[0], x) for x in xrange(start, end)], columns=['gene', 'chrom', 'pos']))
dfPoi = pd.concat(poi)
In [ ]:
# Import Parental RIL SNP calls
fname = '/home/jfear/sandbox/cegs_sem_sd_paper/from_matt/SD-pathway_files/format_SNP_files/SNPtableBWA.txt'
# Ignore following founders
ignore = ['3RA2', 'B5XSAM', 'B5XSS']
out = list()
with open(fname, 'r') as FH:
for row in FH:
chrom, pos, minor, major, founderID, minorFreq, readCnt = row.strip().split('\t')
if founderID not in ignore:
pos = int(pos)
minorFreq = float(minorFreq)
if minorFreq > 0:
out.append((chrom, pos, founderID, minor))
else:
out.append((chrom, pos, founderID, major))
df = pd.DataFrame(out, columns=['chrom', 'pos', 'founderID', 'base'])
In [ ]:
# Merge positions of interest to SNP data
dfFilter = df.merge(dfPoi, how='inner', on=['chrom', 'pos'])
# Make wide data set of filtered data
dfFilter.set_index(['gene', 'chrom', 'pos', 'founderID'], inplace=True)
dfWide = dfFilter.unstack(level='founderID')
# Iterate over gene and count the number of haplotypes
grp = dfWide.groupby(level='gene')
out = list()
for name, g in grp:
uniq = set([tuple(x) for x in g.T.values])
out.append([name, len(uniq)])
mRNA_out = pd.DataFrame(out, columns=['gene', 'mRNA_dspr'])
mRNA_out.set_index('gene', inplace=True)
In [ ]:
dspr = pd.concat([cds_out, mRNA_out], axis=1)
In [ ]: