In [ ]:
# Notebook setup: execute two local scripts so the names they define become
# available in this notebook's namespace.
# NOTE(review): os, pd, PROJ, convertLine and cegsAlleleCnt are used by the cells
# below but are not defined in any visible cell -- presumably they come from these
# two scripts; confirm which script provides which name.
%run './ipython_startup.py'
%run './allele_cnt_table.py'

CDS

How many unique haplotypes are there when looking only at the CDS regions?


In [ ]:
# CDS BED
# Path (under PROJ) of the BED file used for the CDS analysis.
# bedName is the first argument of every cegsAlleleCnt(...) call in the CDS cells
# that follow; it is reassigned further down for the upstream/downstream analysis,
# so this cell must be re-run before re-running any CDS cell.
bedName = os.path.join(PROJ, 'exported_data/sd_coding_sequence.bed')

In [ ]:
# CDS, not imputed -- assume ref base, (Full 75 Lines)

# Line IDs: one converted entry per row of the design file, followed by the
# extra 'w1118_w118' entry.
# NOTE(review): each raw row (trailing newline included) is handed to
# convertLine unchanged -- confirm that convertLine strips whitespace.
# NOTE(review): confirm 'w1118_w118' is the intended sample name in the VCF.
with open(os.path.join(PROJ, 'design_file/cegsV_line_list.csv'), 'r') as line_fh:
    lineList = [convertLine(row) for row in line_fh] + ['w1118_w118']

# Variant calls: filtered, non-imputed VCF
vcfName = '/home/jfear/storage/cegs_variants/CEGS.216.NO_DPGP4.GATK.SNP.HETS.FILTERED.vcf.gz'

# Build the two-column table and index it by gene so it can be joined
# column-wise with the other CDS tables.
out = cegsAlleleCnt(bedName, vcfName, lineList)
out1 = pd.DataFrame(out, columns=['gene', 'CDS_75_ref']).set_index('gene')

In [ ]:
# CDS, not imputed -- assume ref base, (Random 35 Lines)

# Hard-coded subset of 35 line IDs
lines = [
    'r101', 'r21', 'r286', 'r309', 'r324', 'r335', 'r358',
    'r362', 'r365', 'r371', 'r374', 'r377', 'r380', 'r392',
    'r486', 'r502', 'r535', 'r555', 'r705', 'r714', 'r732',
    'r737', 'r790', 'r799', 'r808', 'r85', 'r850', 'r853',
    'w23', 'w36', 'w52', 'w54', 'w62', 'w68', 'w79',
]

# Convert every ID, then add the extra 'w1118_w118' entry at the end.
# NOTE(review): confirm 'w1118_w118' is the intended sample name in the VCF.
lineList = list(map(convertLine, lines)) + ['w1118_w118']

# Variant calls: filtered, non-imputed VCF
vcfName = '/home/jfear/storage/cegs_variants/CEGS.216.NO_DPGP4.GATK.SNP.HETS.FILTERED.vcf.gz'

# Build the two-column table and index it by gene so it can be joined
# column-wise with the other CDS tables.
out = cegsAlleleCnt(bedName, vcfName, lineList)
out2 = pd.DataFrame(out, columns=['gene', 'CDS_35_ref']).set_index('gene')

In [ ]:
# CDS, imputed, (Full 75 Lines)

# Line IDs: one converted entry per row of the design file, followed by the
# extra 'w1118_w118' entry.
# NOTE(review): each raw row (trailing newline included) is handed to
# convertLine unchanged -- confirm that convertLine strips whitespace.
with open(os.path.join(PROJ, 'design_file/cegsV_line_list.csv'), 'r') as line_fh:
    lineList = [convertLine(row) for row in line_fh] + ['w1118_w118']

# Variant calls: imputed VCF
vcfName = '/home/jfear/storage/cegs_variants/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.11-6-13_imputed.vcf.gz'

# Build the two-column table and index it by gene so it can be joined
# column-wise with the other CDS tables.
out = cegsAlleleCnt(bedName, vcfName, lineList)
out3 = pd.DataFrame(out, columns=['gene', 'CDS_75_impute']).set_index('gene')

In [ ]:
# CDS, imputed, (Random 35 Lines)

# Hard-coded subset of 35 line IDs
lines = [
    'r101', 'r21', 'r286', 'r309', 'r324', 'r335', 'r358',
    'r362', 'r365', 'r371', 'r374', 'r377', 'r380', 'r392',
    'r486', 'r502', 'r535', 'r555', 'r705', 'r714', 'r732',
    'r737', 'r790', 'r799', 'r808', 'r85', 'r850', 'r853',
    'w23', 'w36', 'w52', 'w54', 'w62', 'w68', 'w79',
]

# Convert every ID, then add the extra 'w1118_w118' entry at the end.
lineList = list(map(convertLine, lines)) + ['w1118_w118']

# Variant calls: imputed VCF
vcfName = '/home/jfear/storage/cegs_variants/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.11-6-13_imputed.vcf.gz'

# Build the two-column table and index it by gene so it can be joined
# column-wise with the other CDS tables.
out = cegsAlleleCnt(bedName, vcfName, lineList)
out4 = pd.DataFrame(out, columns=['gene', 'CDS_35_impute']).set_index('gene')

In [ ]:
# Combine the four CDS tables side by side (axis=1); each one is indexed by
# 'gene', so rows are aligned on that index.
# NOTE: the names out1..out4 are reassigned by the upstream/downstream cells
# further down -- run the notebook top to bottom, otherwise this cell may
# combine tables from the wrong analysis.
cds_out = pd.concat([out1, out2, out3, out4], axis=1)
# Bare last expression: display the combined table
cds_out

Upstream and Downstream of TSS

How many unique haplotypes are there when looking at the 2 kb upstream and downstream of mRNA starts?


In [ ]:
# mRNA BED
# Path (under PROJ) of the BED file used for the upstream/downstream analysis.
# This overwrites the bedName set for the CDS section above; every
# cegsAlleleCnt(...) call in the cells that follow reads this value.
bedName = os.path.join(PROJ, 'exported_data/sd_up_down_mRNA_sequence.bed')

In [ ]:
# Up Down, not imputed -- assume ref base, (Full 75 Lines)

# Line IDs: one converted entry per row of the design file, followed by the
# extra 'w1118_w118' entry.
# NOTE(review): each raw row (trailing newline included) is handed to
# convertLine unchanged -- confirm that convertLine strips whitespace.
# NOTE(review): confirm 'w1118_w118' is the intended sample name in the VCF.
with open(os.path.join(PROJ, 'design_file/cegsV_line_list.csv'), 'r') as line_fh:
    lineList = [convertLine(row) for row in line_fh] + ['w1118_w118']

# Variant calls: filtered, non-imputed VCF
vcfName = '/home/jfear/storage/cegs_variants/CEGS.216.NO_DPGP4.GATK.SNP.HETS.FILTERED.vcf.gz'

# Build the two-column table and index it by gene so it can be joined
# column-wise with the other upstream/downstream tables.
# NOTE: out1 was also used for the CDS section earlier in the notebook and is
# overwritten here.
out = cegsAlleleCnt(bedName, vcfName, lineList)
out1 = pd.DataFrame(out, columns=['gene', 'mRNA_75_ref']).set_index('gene')

In [ ]:
# Up Down, not imputed -- assume ref base, (Random 35 Lines)

# Hard-coded subset of 35 line IDs
lines = [
    'r101', 'r21', 'r286', 'r309', 'r324', 'r335', 'r358',
    'r362', 'r365', 'r371', 'r374', 'r377', 'r380', 'r392',
    'r486', 'r502', 'r535', 'r555', 'r705', 'r714', 'r732',
    'r737', 'r790', 'r799', 'r808', 'r85', 'r850', 'r853',
    'w23', 'w36', 'w52', 'w54', 'w62', 'w68', 'w79',
]

# Convert every ID, then add the extra 'w1118_w118' entry at the end.
# NOTE(review): confirm 'w1118_w118' is the intended sample name in the VCF.
lineList = list(map(convertLine, lines)) + ['w1118_w118']

# Variant calls: filtered, non-imputed VCF
vcfName = '/home/jfear/storage/cegs_variants/CEGS.216.NO_DPGP4.GATK.SNP.HETS.FILTERED.vcf.gz'

# Build the two-column table and index it by gene so it can be joined
# column-wise with the other upstream/downstream tables.
# NOTE: out2 was also used for the CDS section earlier in the notebook and is
# overwritten here.
out = cegsAlleleCnt(bedName, vcfName, lineList)
out2 = pd.DataFrame(out, columns=['gene', 'mRNA_35_ref']).set_index('gene')

In [ ]:
# Up Down, imputed, (Full 75 Lines)

# Line IDs: one converted entry per row of the design file, followed by the
# extra 'w1118_w118' entry.
# NOTE(review): each raw row (trailing newline included) is handed to
# convertLine unchanged -- confirm that convertLine strips whitespace.
with open(os.path.join(PROJ, 'design_file/cegsV_line_list.csv'), 'r') as line_fh:
    lineList = [convertLine(row) for row in line_fh] + ['w1118_w118']

# Variant calls: imputed VCF
vcfName = '/home/jfear/storage/cegs_variants/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.11-6-13_imputed.vcf.gz'

# Build the two-column table and index it by gene so it can be joined
# column-wise with the other upstream/downstream tables.
# NOTE: out3 was also used for the CDS section earlier in the notebook and is
# overwritten here.
out = cegsAlleleCnt(bedName, vcfName, lineList)
out3 = pd.DataFrame(out, columns=['gene', 'mRNA_75_impute']).set_index('gene')

In [ ]:
# Up Down, imputed, (Random 35 Lines)

# Hard-coded subset of 35 line IDs
lines = [
    'r101', 'r21', 'r286', 'r309', 'r324', 'r335', 'r358',
    'r362', 'r365', 'r371', 'r374', 'r377', 'r380', 'r392',
    'r486', 'r502', 'r535', 'r555', 'r705', 'r714', 'r732',
    'r737', 'r790', 'r799', 'r808', 'r85', 'r850', 'r853',
    'w23', 'w36', 'w52', 'w54', 'w62', 'w68', 'w79',
]

# Convert every ID, then add the extra 'w1118_w118' entry at the end.
lineList = list(map(convertLine, lines)) + ['w1118_w118']

# Variant calls: imputed VCF
vcfName = '/home/jfear/storage/cegs_variants/CEGS.216.lines.NO_DPGP4.GATK.SNP.HETS.FILTERED.11-6-13_imputed.vcf.gz'

# Build the two-column table and index it by gene so it can be joined
# column-wise with the other upstream/downstream tables.
# NOTE: out4 was also used for the CDS section earlier in the notebook and is
# overwritten here.
out = cegsAlleleCnt(bedName, vcfName, lineList)
out4 = pd.DataFrame(out, columns=['gene', 'mRNA_35_impute']).set_index('gene')

In [ ]:
# Combine the four upstream/downstream tables side by side (axis=1); each one
# is indexed by 'gene', so rows are aligned on that index.
# NOTE: out1..out4 are the same names used for the CDS tables earlier in the
# notebook -- all four upstream/downstream cells above must have been run,
# otherwise this cell may mix in CDS tables.
mRNA_out = pd.concat([out1, out2, out3, out4], axis=1)
# Bare last expression: display the combined table
mRNA_out