Abundant contribution of short tandem repeats to gene expression variation in humans - Figure 2


In [3]:
%pylab inline

# Libraries
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from scipy.stats.kde import gaussian_kde

# path to raw data files - EDIT depending on where the data is
DATAPATH = "data/"

##### Data file names #####

# LMM RNAseq results
LMM_RESULTS_FILE = os.path.join(DATAPATH, "gcta_str_fe_h2_rnaseq.tab")
ESTR_GENE_FILE = os.path.join(DATAPATH, "estr_genes.txt")


Populating the interactive namespace from numpy and matplotlib

In [4]:
# Load LMM results
lmmres = pd.read_csv(LMM_RESULTS_FILE, sep="\t")
lmmres = lmmres[~np.isnan(lmmres["cis_str_h2_se"])]
estr_genes = [line.strip() for line in open(ESTR_GENE_FILE, "r").readlines()]
lmmres["cis_str_h2_unbiased"] = lmmres["cis_str_h2"] - lmmres["cis_str_h2_se"].apply(lambda x: x**2)
lmmres["is.estr"] = lmmres["gene"].apply(lambda x: x in estr_genes)
lmmres["cish2"] = lmmres["cis_str_h2_unbiased"]+ lmmres["cis_snp_h2"]
lmmres["cish2"] = lmmres["cish2"].apply(lambda x: max([0, x]))
lmmres["is.eqtl"] = lmmres["cish2"]>0.05 # here "eqtl" means "moderate cis h2"

Figure 2B/C: Variance explained by STRs


In [5]:
# eSTR genes only
fig = plt.figure()
fig.set_size_inches((8, 3))
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

bins = np.arange(-0.1, 0.7, 0.02)

# draw histogram
ax.hist(list(lmmres[lmmres["is.estr"]]["cis_str_h2_unbiased"]), color="red", edgecolor="white", alpha=0.2,
        bins=bins,
        weights=np.zeros_like(lmmres[lmmres["is.estr"]]["cis_str_h2_unbiased"]) + 1. / lmmres[lmmres["is.estr"]].shape[0]);
ax.axvline(x=np.median(lmmres[lmmres["is.estr"]]["cis_str_h2_unbiased"]), color="gray", linestyle="dashed", lw=2)

# smoothed line
h2_pdf = gaussian_kde(list(lmmres[lmmres["is.estr"]]["cis_str_h2_unbiased"]))
smoothed_hist = h2_pdf(bins)
smoothed_hist = [item*1.0/sum(smoothed_hist) for item in smoothed_hist]
ax.plot(bins, smoothed_hist, color="red", lw=2)
ax.set_xticks([])
ax.set_yticks([]);



In [6]:
# eQTL genes only
fig = plt.figure()
fig.set_size_inches((8, 3))
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

bins = np.arange(-0.1, 0.7, 0.02)

# draw histogram
ax.hist(list(lmmres[lmmres["is.eqtl"]]["cis_str_h2_unbiased"]), color="red", edgecolor="white", alpha=0.2,
        bins=bins,
        weights=np.zeros_like(lmmres[lmmres["is.eqtl"]]["cis_str_h2_unbiased"]) + 1. / lmmres[lmmres["is.eqtl"]].shape[0]);
ax.axvline(x=np.median(lmmres[lmmres["is.eqtl"]]["cis_str_h2_unbiased"]), color="gray", linestyle="dashed", lw=2)

# smoothed line
h2_pdf = gaussian_kde(list(lmmres[lmmres["is.eqtl"]]["cis_str_h2_unbiased"]))
smoothed_hist = h2_pdf(bins)
smoothed_hist = [item*1.0/sum(smoothed_hist) for item in smoothed_hist]
ax.plot(bins, smoothed_hist, color="red", lw=2)
ax.set_xticks([])
ax.set_yticks([]);


Figure 2B/C: Variance explained by SNPs


In [7]:
# eSTR genes only
fig = plt.figure()
fig.set_size_inches((8, 3))
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

bins = np.arange(-0.1, 0.7, 0.02)

# draw histogram
ax.hist(list(lmmres[lmmres["is.estr"]]["cis_snp_h2"]), color="#52DBFF", edgecolor="white", alpha=0.5,
        bins=bins,
        weights=np.zeros_like(lmmres[lmmres["is.estr"]]["cis_snp_h2"]) + 1. / lmmres[lmmres["is.estr"]].shape[0]);
ax.axvline(x=np.median(lmmres[lmmres["is.estr"]]["cis_snp_h2"]), color="gray", linestyle="dashed", lw=2)

# smoothed line
h2_pdf = gaussian_kde(list(lmmres[lmmres["is.estr"]]["cis_snp_h2"]))
smoothed_hist = h2_pdf(bins)
smoothed_hist = [item*1.0/sum(smoothed_hist) for item in smoothed_hist]
ax.plot(bins, smoothed_hist, color="#52DBFF", lw=2)
ax.set_xticks([])
ax.set_yticks([]);



In [8]:
# eQTL genes only
fig = plt.figure()
fig.set_size_inches((8, 3))
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

bins = np.arange(-0.1, 0.7, 0.02)

# draw histogram
ax.hist(list(lmmres[lmmres["is.eqtl"]]["cis_snp_h2"]), color="#52DBFF", edgecolor="white", alpha=0.5,
        bins=bins,
        weights=np.zeros_like(lmmres[lmmres["is.eqtl"]]["cis_snp_h2"]) + 1. / lmmres[lmmres["is.eqtl"]].shape[0]);
ax.axvline(x=np.median(lmmres[lmmres["is.eqtl"]]["cis_snp_h2"]), color="gray", linestyle="dashed", lw=2)

# smoothed line
h2_pdf = gaussian_kde(list(lmmres[lmmres["is.eqtl"]]["cis_snp_h2"]))
smoothed_hist = h2_pdf(bins)
smoothed_hist = [item*1.0/sum(smoothed_hist) for item in smoothed_hist]
ax.plot(bins, smoothed_hist, color="#52DBFF", lw=2)
ax.set_xticks([])
ax.set_yticks([]);


Figure 2B/C: Heatmap of str h2 vs. snp h2


In [9]:
# eSTR genes
aspect=1
step=0.02
x = lmmres[~np.isnan(lmmres["cis_str_h2_unbiased"])]
x = x[x["is.estr"]]
fig = plt.figure()
fig.set_size_inches((6,6))
ax = fig.add_subplot(111)
counts, xedges, yedges = np.histogram2d(list(x["cis_str_h2_unbiased"]), list(x["cis_snp_h2"]),
          bins=[np.arange(-0.1, 0.71, 0.01),np.arange(-0.1, 0.71, 0.01)]);
def MultiDimLog(x):
    if type(x) == float or type(x) == np.float64:
        return math.log10(x+1.0/sum(counts))
    else:
        return map(MultiDimLog, x)
im = ax.imshow(map(MultiDimLog, (counts/(sum(counts)*1.0)).transpose()), origin='lower', interpolation="bilinear",
          extent=[0, (-1*xedges[0]+xedges[-1])*aspect, yedges[0], yedges[-1]])
ax.set_xlabel("cis_str_h2_unbiased", size=20)
ax.set_ylabel("cis_snp_h2", size=20)
ax.set_xticklabels(ax.get_xticks(), size=12);
ax.set_yticklabels(ax.get_yticks(), size=12);
plt.colorbar(im);



In [10]:
# eQTL genes
aspect=1
step=0.02
x = lmmres[~np.isnan(lmmres["cis_str_h2_unbiased"])]
x = x[x["is.eqtl"]]
fig = plt.figure()
fig.set_size_inches((6,6))
ax = fig.add_subplot(111)
#bins=[np.arange(-0.03, 0.13, 0.002),np.arange(-0.1, 0.71, 0.01)]
counts, xedges, yedges = np.histogram2d(list(x["cis_str_h2_unbiased"]), list(x["cis_snp_h2"]),
          bins=[np.arange(-0.1, 0.71, 0.01),np.arange(-0.1, 0.71, 0.01)]);
def MultiDimLog(x):
    if type(x) == float or type(x) == np.float64:
        return math.log10(x+1.0/sum(counts))
    else:
        return map(MultiDimLog, x)
im = ax.imshow(map(MultiDimLog, (counts/(sum(counts)*1.0)).transpose()), origin='lower', interpolation="bilinear",
          extent=[0, (-1*xedges[0]+xedges[-1])*aspect, yedges[0], yedges[-1]])
ax.set_xlabel("cis_str_h2_unbiased", size=20)
ax.set_ylabel("cis_snp_h2", size=20)
ax.set_xticklabels(ax.get_xticks(), size=12);
ax.set_yticklabels(ax.get_yticks(), size=12);
plt.colorbar(im);