Abundant contribution of short tandem repeats to gene expression variation in humans - Figure 3


In [15]:
%pylab inline

# Libraries
import beeswarm
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import tempfile

# path to raw data files - EDIT depending on where the data is
DATAPATH = "data/"

# Other parameters
MIN_SAMPLES = 25

##### Data file names #####

# For SNP held constant
SNP_CONST_FILE = os.path.join(DATAPATH, "snp_constant_dataset2.tab")

# For model comparison
MODEL_COMPARE_RESULTS_FILE = os.path.join(DATAPATH, "hidden_h2_rnaseq.tab")

# For examples
STR_RESULTS_SEP = "\t"
SNP_RESULTS_SEP = "\t"
EXPRFILE = os.path.join(DATAPATH, "geuvadis_expression_norm_EUR_residuals.csv")

# Example 1
GENE1 = "ENSG00000130413"
CHROM1 = 11
STRPOS1 = 8624901
STR_RESULTS_FILE1 = os.path.join(DATAPATH, "linreg_results_dataset2_chrom%s.tab"%CHROM1)
SNP_RESULTS_FILE1 = os.path.join(DATAPATH, "linreg_results_dataset2_chrom%s_snps.tab"%CHROM1)
STRGTFILE1 = os.path.join(DATAPATH, "1kg_normstrs_chr%s.tab"%CHROM1)
SNPGTFILE1 = os.path.join(DATAPATH, "1kg_normsnps_chr%s.tab"%CHROM1)

# Example 2
GENE2 = "ENSG00000171067"
CHROM2 = 11
STRPOS2 = 68063456
STR_RESULTS_FILE2 = os.path.join(DATAPATH, "linreg_results_dataset2_chrom%s.tab"%CHROM2)
SNP_RESULTS_FILE2 = os.path.join(DATAPATH, "linreg_results_dataset2_chrom%s_snps.tab"%CHROM2)
STRGTFILE2 = os.path.join(DATAPATH, "1kg_normstrs_chr%s.tab"%CHROM2)
SNPGTFILE2 = os.path.join(DATAPATH, "1kg_normsnps_chr%s.tab"%CHROM2)

snpcolors = ["green","blue","purple"]


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['colors', 'e']
`%matplotlib` prevents importing * from pylab and numpy

In [16]:
# Functions for examples
def LoadEqtlResults(resfile, gene, sep="\t"):
    fd, temp_path = tempfile.mkstemp()
    cmd = "cat %s | grep \"chrom\|%s\" > %s"%(resfile, gene, temp_path)
    os.system(cmd)
    gt = pd.read_csv(temp_path, sep=sep)
    os.close(fd)
    os.remove(temp_path)
    return gt

def LoadGenotypes(gtfile, region_start, region_end):
    """
    Load genotypes, only the ones between $region_start and $region_end
    """
    fd, temp_path = tempfile.mkstemp()
    cmd = "cat %s | awk '($1==\"chrom\" || ($2>=%s && $2<=%s))' > %s"%(gtfile, region_start, region_end, temp_path)
    os.system(cmd)
    gt = pd.read_csv(temp_path, sep="\t")
    os.close(fd)
    os.remove(temp_path)
    return gt

def LoadExpression(exprfile, gene):
    expr = pd.read_csv(exprfile, usecols=[gene, "Unnamed: 0"])
    expr["sample"] = expr["Unnamed: 0"]
    expr = expr.drop("Unnamed: 0", 1)
    return expr

Figure 3B: SNP held constant


In [17]:
def GetSlope(x):
    if x["n_AA"] > x["n_BB"]: return x["slope_AA"]
    else: return x["slope_BB"]

def GetVar(x):
    if x["n_AA"] > x["n_BB"]: return float(x["var_AA"])
    else: return float(x["var_BB"])
    
def GetP(x):
    if x["n_AA"] > x["n_BB"]: return float(x["p_AA"])
    else: return float(x["p_BB"])
    
def GetNumGT(x):
    if x["n_AA"] > x["n_BB"]:
        try: return int(x["numgt_AA"])
        except: return None
    else: 
        try: return int(x["numgt_BB"])
        except: return None

def GetColor(x):
    if float(x["str.effect"])*float(x["slope_XX"])>0: return "red"
    else: return "gray"
    

res = pd.read_csv(SNP_CONST_FILE, sep="\t")
res["n_XX"] = res.apply(lambda x: max([x["n_AA"],x["n_BB"]]), 1)
res["slope_XX"] = res.apply(lambda x: GetSlope(x), 1)
res = res[(res["slope_XX"].apply(lambda x: x != "None"))]
res["numgt_XX"] = res.apply(lambda x: GetNumGT(x), 1)
res["var_XX"] = res.apply(lambda x: GetVar(x), 1)
res["p_XX"] = res.apply(lambda x: GetP(x), 1)
res["slope_XX"] = res["slope_XX"].apply(float)

# Filter
res_filt = res[(res["n_XX"]>=MIN_SAMPLES)]

fig = plt.figure()
fig.set_size_inches((8,6))
ax = fig.add_subplot(111)
colors = list(res_filt.apply(lambda x: GetColor(x), 1))
ax.scatter(res_filt["str.effect"], res_filt["slope_XX"], color=colors, alpha=0.5)
ax.axhline(y=0, color="gray", linestyle="dashed", lw=2)
ax.axvline(x=0, color="gray", linestyle="dashed", lw=2)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.set_xlabel("Effect size", size=20)
ax.set_ylabel("Effect size - SNP constant", size=20)
ax.set_ylim(bottom=-0.6, top=0.6)
ax.set_xlim(left=-0.6, right=0.6)
ax.set_xticklabels(ax.get_xticks(), size=15)
ax.set_yticklabels(ax.get_yticks(), size=15);


Figure 3C: Anova


In [18]:
res = pd.read_csv(MODEL_COMPARE_RESULTS_FILE)
res = res[res["anova_pval"]>0]

In [19]:
yvals = sorted(map(lambda x: -1*math.log10(x), res["anova_pval"]))
xvals = sorted(map(lambda x: -1*math.log10(x), np.random.uniform(size=len(yvals))))

fig = plt.figure()
fig.set_size_inches((8, 4))
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
ax.scatter(xvals, yvals, color="black")

# Point out examples
for gene in [GENE1, GENE2]:
    pval = -1*math.log10(res[res["gene"]==gene]["anova_pval"])
    ind = yvals.index(pval)
    ax.scatter(xvals[ind], yvals[ind], color="red", s=100)
    
ax.plot([0, 4], [0, 4], color="gray")
ax.set_xlim(left=-0.1, right=4)
ax.set_ylim(bottom=-0.5, top=25);
ax.set_xlabel("-log10 expected pval", size=20)
ax.set_ylabel("-log10 observed pval", size=20)
ax.set_xticklabels(ax.get_xticks(), size=15)
ax.set_yticklabels(ax.get_yticks(), size=15);


Figure 3D: Example 1


In [20]:
# Load data
esnp_results = LoadEqtlResults(SNP_RESULTS_FILE1, GENE1, sep=SNP_RESULTS_SEP)
estr_results = LoadEqtlResults(STR_RESULTS_FILE1, GENE1, sep=STR_RESULTS_SEP)
region_start = min([min(estr_results["str.start"]), min(esnp_results["str.start"])])
region_end = max([max(estr_results["str.start"]), max(esnp_results["str.start"])])
strgt = LoadGenotypes(STRGTFILE1, region_start, region_end)
snpgt = LoadGenotypes(SNPGTFILE1, region_start, region_end)
samples = list(set(strgt.columns[2:]).intersection(set(snpgt.columns[2:])))
best_snp_pos = esnp_results.sort("p.wald")["str.start"].values[0]

# SNP vs. gene expression
x = snpgt[snpgt["start"]==best_snp_pos].iloc[0,2:]
snp_genotypes = pd.DataFrame({"sample": x.index, "snpgt": list(x)})
expr_vals = LoadExpression(EXPRFILE, GENE1)
expsnp = pd.merge(snp_genotypes, expr_vals, on=["sample"])
expsnp = expsnp[expsnp["snpgt"].apply(str) != "None"]
expsnp[GENE1] = expsnp[GENE1].apply(float)
expsnp["snpgt"] = expsnp["snpgt"].apply(int)

# STR vs. gene expression
x = strgt[strgt["start"]==STRPOS1].iloc[0,2:]
str_genotypes = pd.DataFrame({"sample": x.index, "strgt": list(x)})
expr_vals = LoadExpression(EXPRFILE, GENE1)
expstr = pd.merge(str_genotypes, expr_vals, on=["sample"])
expstr = expstr[expstr["strgt"].apply(str) != "None"]
expstr[GENE1] = expstr[GENE1].apply(float)
expstr["strgt"] = expstr["strgt"].apply(int)
expstr = pd.merge(expstr, expsnp, on=["sample",GENE1])
colors = dict(zip(range(3), snpcolors))
expstr["color"] = expstr.snpgt.apply(lambda x: colors[x])

In [21]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

s = sorted(list(set(expsnp["snpgt"])))
e = [list(expsnp[expsnp["snpgt"]==item][GENE1].values) for item in s]
bs, ax = beeswarm.beeswarm(e, col=snpcolors, ax=ax, positions=s, labels=[], lw=0)

# Add best fit line
fit = np.polyfit(expsnp["snpgt"], expsnp[GENE1], 1)
svals = np.arange(-0.5, 2.6, 0.5)
ax.plot(svals, [item*fit[0]+fit[1] for item in svals], color="black", linestyle="dashed", lw=2)

ax.set_xlabel("SNP genotype", size=20)
ax.set_ylabel("Expression", size=20);
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(["AA","AB","BB"], size=15)
ax.set_yticklabels(ax.get_yticks(), size=15);
ax.set_xlim(left=min(svals), right=max(svals))
ax.set_ylim(bottom=-3, top=3)


Out[21]:
(-3, 3)

In [22]:
# Plot by SNP allele
fig = plt.figure()
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

svals = np.arange(min(expstr.strgt)-4, max(expstr.strgt)+5, 4)
snp_const_gt = 0
x = expstr[expstr.snpgt==snp_const_gt]

# Plot all points
s = sorted(list(set(x["strgt"])))
e = [list(x[x["strgt"]==item][GENE1].values) for item in s]
bs, ax = beeswarm.beeswarm(e, col=snpcolors[snp_const_gt], ax=ax, positions=s, labels=[], lw=0)

# Add best fit line
fit = np.polyfit(x["strgt"], x[GENE1], 1)
ax.plot(svals, [item*fit[0]+fit[1] for item in svals], color="black", linestyle="dashed", lw=2)

# label
ax.set_xlabel("STR dosage", size=20)
ax.set_ylabel("Expression", size=20);
ax.set_xlim(left=min(expstr.strgt), right=max(expstr.strgt))
ax.set_ylim(bottom=-3, top=3)
ax.set_xticks(svals)
ax.set_xticklabels(svals, size=12)
ax.set_yticklabels(ax.get_yticks(), size=15);


Figure 3E: Example 2


In [23]:
# Load data
esnp_results = LoadEqtlResults(SNP_RESULTS_FILE2, GENE2, sep=SNP_RESULTS_SEP)
estr_results = LoadEqtlResults(STR_RESULTS_FILE2, GENE2, sep=STR_RESULTS_SEP)
region_start = min([min(estr_results["str.start"]), min(esnp_results["str.start"])])
region_end = max([max(estr_results["str.start"]), max(esnp_results["str.start"])])
strgt = LoadGenotypes(STRGTFILE2, region_start, region_end)
snpgt = LoadGenotypes(SNPGTFILE2, region_start, region_end)
samples = list(set(strgt.columns[2:]).intersection(set(snpgt.columns[2:])))
best_snp_pos = esnp_results.sort("p.wald")["str.start"].values[0]

# SNP vs. gene expression
x = snpgt[snpgt["start"]==best_snp_pos].iloc[0,2:]
snp_genotypes = pd.DataFrame({"sample": x.index, "snpgt": list(x)})
expr_vals = LoadExpression(EXPRFILE, GENE2)
expsnp = pd.merge(snp_genotypes, expr_vals, on=["sample"])
expsnp = expsnp[expsnp["snpgt"].apply(str) != "None"]
expsnp[GENE2] = expsnp[GENE2].apply(float)
expsnp["snpgt"] = expsnp["snpgt"].apply(int)

# STR vs. gene expression
x = strgt[strgt["start"]==STRPOS2].iloc[0,2:]
str_genotypes = pd.DataFrame({"sample": x.index, "strgt": list(x)})
expr_vals = LoadExpression(EXPRFILE, GENE2)
expstr = pd.merge(str_genotypes, expr_vals, on=["sample"])
expstr = expstr[expstr["strgt"].apply(str) != "None"]
expstr[GENE2] = expstr[GENE2].apply(float)
expstr["strgt"] = expstr["strgt"].apply(int)
expstr = pd.merge(expstr, expsnp, on=["sample",GENE2])
colors = dict(zip(range(3), snpcolors))
expstr["color"] = expstr.snpgt.apply(lambda x: colors[x])

In [24]:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

s = sorted(list(set(expsnp["snpgt"])))
e = [list(expsnp[expsnp["snpgt"]==item][GENE2].values) for item in s]
bs, ax = beeswarm.beeswarm(e, col=snpcolors, ax=ax, positions=s, labels=[], lw=0)

# Add best fit line
fit = np.polyfit(expsnp["snpgt"], expsnp[GENE2], 1)
svals = np.arange(-0.5, 2.6, 0.5)
ax.plot(svals, [item*fit[0]+fit[1] for item in svals], color="black", linestyle="dashed", lw=2)

ax.set_xlabel("SNP genotype", size=20)
ax.set_ylabel("Expression", size=20);
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(["AA","AB","BB"], size=15)
ax.set_yticklabels(ax.get_yticks(), size=15);
ax.set_xlim(left=min(svals), right=max(svals))
ax.set_ylim(bottom=-3, top=3)


Out[24]:
(-3, 3)

In [25]:
# Plot by SNP allele
fig = plt.figure()
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()

snp_const_gt = 0
x = expstr[expstr.snpgt==snp_const_gt]
svals = np.arange(min(x.strgt)-4, max(x.strgt)+5, 4)

# Plot all points
s = sorted(list(set(x["strgt"])))
e = [list(x[x["strgt"]==item][GENE2].values) for item in s]
bs, ax = beeswarm.beeswarm(e, col=snpcolors[snp_const_gt], ax=ax, positions=s, labels=[], lw=0)

# Add best fit line
fit = np.polyfit(x["strgt"], x[GENE2], 1)
ax.plot(svals, [item*fit[0]+fit[1] for item in svals], color="black", linestyle="dashed", lw=2)

# label
ax.set_xlabel("STR dosage", size=20)
ax.set_ylabel("Expression", size=20);
ax.set_xlim(left=min(expstr.strgt), right=max(expstr.strgt))
ax.set_ylim(bottom=-3, top=3)
ax.set_xticks(svals)
ax.set_xticklabels(svals, size=12)
ax.set_yticklabels(ax.get_yticks(), size=15);