Abundant contribution of short tandem repeats to gene expression variation in humans - Figure 1


In [1]:
%pylab inline

# Libraries
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

# Other parameters
FDR_THRESHOLD = 0.05

# path to raw data files - EDIT depending on where the data is
DATAPATH = "data/"

##### Data file names #####

# Linear regression results
RNASEQ_LINREG_FILE = os.path.join(DATAPATH, "dataset2_rnaseq_gene_level_estr.qvals.csv")
RNASEQ_UNLINKEDCTRL_FILE = os.path.join(DATAPATH, "linreg_estr_unlinked_results.tab")
ARRAY_LINREG_FILE = os.path.join(DATAPATH, "linreg_estr_dataset1_array_independent_results.tab")

# Annotation
PROBE_ANNOT_FILE = os.path.join(DATAPATH, "gene_expr_annot_hg19.txt")
STR_ANNOT_FILE = os.path.join(DATAPATH, "lobSTR_ref_hg19_ensembleannot.bed")

##### Load results #####
rnaseq_linreg = pd.read_csv(RNASEQ_LINREG_FILE, sep=",")
rnaseq_linreg_unlinked_ctrl = pd.read_csv(RNASEQ_UNLINKEDCTRL_FILE, sep="\t")
array_linreg = pd.read_csv(ARRAY_LINREG_FILE, sep="\t")

# Convert probe annotations
probe_annot = pd.read_csv(PROBE_ANNOT_FILE)
probe_annot = probe_annot[["gene.id","probe.id"]]
probe_annot.columns = ["gene","probe"]
array_linreg["probe"] = array_linreg["gene"]
array_linreg = array_linreg.drop("gene", 1)
array_linreg = pd.merge(array_linreg, probe_annot, on="probe")


Populating the interactive namespace from numpy and matplotlib

Figure 1B: qq plot


In [2]:
# Observed data
y = sorted(rnaseq_linreg["p.wald"].apply(lambda x: -1* math.log10(x)))
x = sorted(map(lambda x: -1*math.log10(x), np.random.uniform(size=len(y))))

# Control data
yctrl = sorted(rnaseq_linreg_unlinked_ctrl["p.wald"].apply(lambda x: -1*math.log10(x)))
xctrl = sorted(map(lambda x: -1*math.log10(x), np.random.uniform(size=len(yctrl))))

# Set up figure and plot data
fig = plt.figure()
fig.set_size_inches((8,6))
ax = fig.add_subplot(111)
ax.scatter(x, y, color="red", label="Observed")
ax.scatter(xctrl, yctrl, color="black", label="Control")
ax.set_xlim(left=0, right=5);
ax.set_ylim(bottom=0, top=40);
ax.plot([0,5],[0,5], color="gray")
ax.set_xlabel("Expected pval [-log10(p)]", size=20)
ax.set_ylabel("Observed pval [-log10(p)]", size=20)
ax.set_xticklabels(ax.get_xticks(), size=15)
ax.set_yticklabels(ax.get_yticks(), size=15);


Figure 1C: reproducibility in array


In [3]:
estr_hits = rnaseq_linreg[(rnaseq_linreg["qval.gene"]<=FDR_THRESHOLD) & rnaseq_linreg["best_str"]]
allres = pd.merge(estr_hits[["gene","str.start","beta"]], \
                  array_linreg[["gene","str.start","beta"]], \
                  on=["gene","str.start"])
allres.columns = ["gene","str.start","beta_rnaseq","beta_array"]

def GetColor(x):
    if float(x["beta_rnaseq"])*float(x["beta_array"])>0: return "red"
    else: return "gray"
    
colors = list(allres.apply(lambda x: GetColor(x), 1))
fig = plt.figure()
fig.set_size_inches((6,6))
ax = fig.add_subplot(111)
ax.scatter(allres["beta_rnaseq"], allres["beta_array"], color=colors, alpha=0.5, s=10);
ax.axhline(y=0, linestyle="dashed", color="gray", lw=1)
ax.axvline(x=0, linestyle="dashed", color="gray", lw=1);

# labels
ax.set_xlim(left=-0.8, right=0.8)
ax.set_ylim(bottom=-0.8, top=0.8)
ax.set_xlabel("Effect - RNAseq", size=20)
ax.set_ylabel("Effect - Array", size=20)
ax.set_xticklabels(ax.get_xticks(), size=15)
ax.set_yticklabels(ax.get_yticks(), size=15);