Abundant contribution of short tandem repeats to gene expression variation in humans - Figure 4


In [1]:
%pylab inline

# Libraries
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.stats
import sys
from dist_tss_utils import *

# path to raw data files - EDIT depending on where the data is
DATAPATH = "data/"

# Other parameters
FDR_CUTOFF = 0.05

##### Data file names #####
ESTR_RESULTS_FILE = os.path.join(DATAPATH, "dataset2_rnaseq_gene_level_estr.qvals.csv")
ESTR_CONS_MED_FILE = os.path.join(DATAPATH, "estrs_pm1kb.txt")
ANTI_ESTR_CONS_MED_FILE = os.path.join(DATAPATH, "anti_strs_pm1kb.txt")
CHROM_MARK_FILE = os.path.join(DATAPATH, "strs_encode_features_hg19.bed")


Populating the interactive namespace from numpy and matplotlib

Figure 4B: Conservation


In [2]:
estr_cons_med = pd.read_csv(ESTR_CONS_MED_FILE, names=["pos","med","med1","med2","med3","med4"])
anti_estr_cons_med = pd.read_csv(ANTI_ESTR_CONS_MED_FILE, names=["pos","med","med1","med2","med3","med4"])

medcol="med"
pos = map(lambda x: -1*x, list(estr_cons_med["pos"])[::-1]) + list(estr_cons_med["pos"])
estr_med = list(estr_cons_med[medcol])[::-1] + list(estr_cons_med[medcol])
anti_estr_med = list(anti_estr_cons_med[medcol])[::-1] + list(anti_estr_cons_med[medcol])

fig = plt.figure()
fig.set_size_inches((8,6))
ax = fig.add_subplot(111)
ext=25
ax.plot(pos, Smooth(estr_med, ext), color="red", lw=2)
ax.plot(pos, Smooth(anti_estr_med, ext), color="darkgray", lw=2)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.set_ylim(bottom=0.1, top=0.18);
ax.set_xlabel("Distance (bp) relative to STR", size=20)
ax.set_ylabel("Median PhyloP score", size=20)
ax.set_xticklabels(map(int, ax.get_xticks()), size=15)
ax.set_yticklabels(ax.get_yticks(), size=15);


Figure 4B: Chromatin mark enrichment around TSS


In [7]:
estr = pd.read_csv(ESTR_RESULTS_FILE)
estr["signif"] = estr.apply(lambda x: x["best_str"] & (x["qval.gene"]<=FDR_CUTOFF), 1)

chrom_marks = pd.read_csv(CHROM_MARK_FILE, sep="\t")
chrom_marks["str.start"] = chrom_marks["var.start"]+1
estrs_chrom = pd.merge(estr, chrom_marks, on=["chrom","str.start"])

marks = ["H3k4me3","H3k9ac","H3k27ac","H3k36me3","H3k27me3"]
binsize=5000
distbins = np.arange(0, 100001, binsize)
enrich = []
for mark in marks:
    estr_mark_tss = estrs_chrom[estrs_chrom[mark]>0].apply(lambda x: GetLoc(x, "TSS"), 1)
    mbins, markenrich = GetEnrichmentProfile(estrs_chrom[estrs_chrom[mark]>0], estr_mark_tss, "dist.tss", "signif", distbins)
    enrich.append(markenrich)
    
estr_tss = estr.apply(lambda x: GetLoc(x, "TSS"), 1)
distbins = np.arange(0, 100001, binsize)
bins, tssenrich = GetEnrichmentProfile(estr, estr_tss, "dist.tss", "signif", distbins)

In [8]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

markcolors = {
    "H3k4me3": "green",
    "H3k9ac": "blue",
    "H3k27ac": "purple",
    "H3k36me3": "orange",
    "H3k27me3": "brown"
}

# Plot bins for each mark
ext=4
for i in range(len(marks)):
    ax.plot(map(lambda x: x+binsize/2, mbins)[:-1], Smooth(enrich[i], extend=ext),
            color=markcolors[marks[i]], lw=1.5)

# Plot overall
ext=4
ax.plot(map(lambda x: x+binsize/2, bins)[:-1], Smooth(tssenrich, extend=ext),
        label="Significant eSTRs", color="black", lw=3)
ax.axvline(x=0, linestyle="dashed", color="gray", lw=2)
ax.set_xlabel("Dist from TSS (kb)", size=20)
ax.set_ylabel("P(eSTR)", size=20)
ax.set_xlim(left=-100000, right=100000)
ax.set_ylim(bottom=0, top=0.04)
ax.set_xticks([-100000, -50000, 0, 50000, 100000])
ax.set_xticklabels([100, 50, 0, 50, 100], size=15)
ax.set_yticklabels(map(lambda x: "%.3f"%x, ax.get_yticks()), size=15);