Download variants in gzipped file from https://ckdgen.imbi.uni-freiburg.de/ then fed through python to make vcf for selene and flip to hg38
wget https://ckdgen.imbi.uni-freiburg.de/files/Wuttke2019/20171016_MW_eGFR_overall_ALL_nstud61.dbgap.txt.gz
In [ ]:
import math
from pyliftover import LiftOver
#Iterate over downloaded and unzipped tsv GWAS output, filter by p-value, flip to hg38, and output as "vcf" w/ out header for selen
p_thresh = 1e-8
gwas_tsv_in = "/input_dir/ml_testing/kidney_test/gwas_out/20171016_MW_eGFR_overall_ALL_nstud61.dbgap.txt"
vcf_out_name = "/input_dir/ml_testing/kidney_test/gwas_out/Wuttke_eGFR_"+str(p_thresh)+"_hg38.vcf"
bed_out_name = "/input_dir/ml_testing/kidney_test/gwas_out/Wuttke_eGFR_"+str(p_thresh)+"_hg38.bed"
flip_genome = True
#Input header
# Chr Pos_b37 RSID Allele1 Allele2 Freq1 Effect StdErr P-value n_total_sum
#Output vcf format:
# chr1 783547 783547 A G
if (flip_genome):
lo = LiftOver("hg19", "hg38")
with open(bed_out_name, "w") as bed_out:
with open(vcf_out_name, "w") as vcf_out:
with open(gwas_tsv_in, "r") as gwas_in:
header = gwas_in.readline().strip().split("\t")
for loci in gwas_in.readlines():
loci_arr = loci.strip().split()
if float(loci_arr[8]) <= p_thresh:
cur_loc = loci_arr[1]
if (flip_genome):
lift_out = lo.convert_coordinate("chr"+loci_arr[0],int(cur_loc))
if (lift_out is not None) and len(lift_out)>0:
cur_loc = lift_out[0][1]
vcf_out_line = "chr" + loci_arr[0] + "\t" + str(cur_loc) + "\t" + str(cur_loc) + "\t" + loci_arr[3].upper() + "\t" + loci_arr[4].upper() + "\n"
vcf_out.write(vcf_out_line)
bed_out_line = "chr" + loci_arr[0] + "\t" + str(cur_loc) + "\t" + str(cur_loc+1) + "\t" + loci_arr[2] + ":" + loci_arr[3].upper() + ":" + loci_arr[4].upper() + "\t" + str(round(-math.log10(float(loci_arr[8])),2)) + "\n"
bed_out.write(bed_out_line)
else:
print("variant " + str(loci_arr[0]) + ":" + str(loci_arr[1]) + " not found when liftover, skipping")
ops: [analyze]
model: {
path: /input_dir/ml_testing/kidney_test/deeperdeepsea.py,
class: DeeperDeepSEA,
class_args: {
sequence_length: 1000,
n_targets: 1,
},
non_strand_specific: mean
}
analyze_sequences: !obj:selene_sdk.predict.AnalyzeSequences {
trained_model_path:/input_dir/ml_testing/kidney_test/training_outputs/best_model.pth.tar,
sequence_length: 1000,
features: !obj:selene_sdk.utils.load_features_list {
input_path: /input_dir/ml_testing/kidney_test/distinct_features.txt
},
batch_size: 64,
use_cuda: True,
reference_sequence: !obj:selene_sdk.sequences.Genome {
input_path: /input_dir/ml_testing/genomic_annotations/hg38.fa
},
write_mem_limit: 75000
}
variant_effect_prediction: {
vcf_files: [
/input_dir/ml_testing/kidney_test/gwas_out/Wuttke_eGFR_1e8_hg19.vcf
],
save_data: [logits],
output_dir: /input_dir/ml_testing/kidney_test/prediction_outputs,
output_format: tsv,
strand_index: 9
}
random_seed: 123
In [ ]:
%matplotlib inline
from selene_sdk.utils import load_path
from selene_sdk.utils import parse_configs_and_run
from selene_sdk.interpret import load_variant_abs_diff_scores
In [ ]:
configs = load_path("./CTCF_kidney_predict.yml")
In [ ]:
parse_configs_and_run(configs, lr=0.01)
In [10]:
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
def gen_chrom_col(in_chroms):
col_map = px.colors.sequential.Rainbow * 3
chroms = ["chr" + str(x) for x in range(1,23)]
chroms = chroms + ["chrX","chrY"]
chrom_rainbow = {chroms[i]: col_map[i] for i in range(len(chroms))}
mapped_cols = [chrom_rainbow[x] for x in in_chroms]
return(mapped_cols)
out_var_scores = pd.read_table("/input_dir/ml_testing/kidney_test/prediction_outputs/Wuttke_eGFR_1e-08_hg38_abs_diffs.tsv")
var_label = out_var_scores.apply (lambda row: row["chrom"] + ":" + str(row["pos"]) + ":" + row["ref"] + ":" + row["alt"], axis=1)
data_var = { 'variant_labels' : var_label,
'abs_diffs' : out_var_scores["RPTEC|CTCF|None"],
'chromosome' : out_var_scores["chrom"],
'locations' : out_var_scores["pos"]}
df_var = pd.DataFrame(data_var)
df_var_filt = df_var[df_var["abs_diffs"]>.001]
In [11]:
#Make bed files for abs diff
df_var_filt_hard = df_var[df_var["abs_diffs"]>.01]
bed_out_diff = "/input_dir/ml_testing/kidney_test/prediction_outputs/Wuttke_eGFR_1e-08_hg38_abs_diffs_filt.bed"
with open(bed_out_diff,"w") as bed_out:
for index, row in df_var_filt_hard.iterrows():
b = (row["chromosome"] +
"\t" + str(row["locations"]) +
"\t" + str(row["locations"]+1) +
"\t" + row["variant_labels"] +
"\t" + str(abs(row["abs_diffs"])) + "\n")
bed_out.write(b)
In [12]:
xaxis=dict(
showline=False,
zeroline=False,
showgrid=False,
showticklabels=False,
showbackground=False,
titlefont=dict(size=20),
ticks = ''
)
yaxis=dict(
showline=False,
zeroline=False,
showgrid=False,
showticklabels=False,
showbackground=False,
titlefont=dict(size=20),
ticks = '')
layout = dict(
scene= dict(
xaxis=xaxis,
yaxis=yaxis),
title=dict(y=.95,
x=.5,
font=dict(size=30)),
width = 1280,
height = 860,
showlegend=False
)
data = [dict(
type = 'scatter',
mode = 'markers',
y = df_var_filt['abs_diffs'],
text = df_var_filt['variant_labels'],
hoverinfo = 'text',
opacity = 0.8,
marker = dict(size=8, color = gen_chrom_col(df_var_filt["chromosome"]), line=dict(color='DarkGrey',width=1))
)]
fig = dict({'data' : data, 'layout' : layout})
fig_out = go.Figure(fig)
fig_out.update_layout(title_text="Significant eGFR GWAS SNP's impact on CTCF NN activation",
xaxis_showticklabels=False,
xaxis_title="Genomic location",
yaxis_title = "Absolute difference in NN activation")
fig_out.update_layout(template="plotly_white+xgridoff")
pio.write_html(fig_out, file= "CTCF_eGFR_vars.html", auto_open=False)
In [13]:
fig_out.show()
In [14]:
##Repeat for logit scores
out_var_logit = pd.read_table("/input_dir/ml_testing/kidney_test/prediction_outputs/Wuttke_eGFR_1e-08_hg38_logits.tsv")
var_label = out_var_logit.apply (lambda row: row["chrom"] + ":" + str(row["pos"]) + ":" + row["ref"] + ":" + row["alt"], axis=1)
data_var = { 'variant_labels' : var_label,
'logit' : out_var_logit["RPTEC|CTCF|None"],
'chromosome' : out_var_logit["chrom"],
'locations' : out_var_logit["pos"]}
df_var = pd.DataFrame(data_var)
df_var_filt = df_var[abs(df_var["logit"])>.1]
In [15]:
#Make bed files for logit
df_var_filt_hard = df_var[abs(df_var["logit"])>1]
bed_out_logit = "/input_dir/ml_testing/kidney_test/prediction_outputs/Wuttke_eGFR_1e-08_hg38_logit_filt.bed"
with open(bed_out_logit,"w") as bed_out:
for index, row in df_var_filt_hard.iterrows():
b = (row["chromosome"] +
"\t" + str(row["locations"]) +
"\t" + str(row["locations"]+1) +
"\t" + row["variant_labels"] +
"\t" + str(abs(row["logit"])) + "\n")
bed_out.write(b)
In [16]:
data = [dict(
type = 'scatter',
mode = 'markers',
y = df_var_filt['logit'],
text = df_var_filt['variant_labels'],
hoverinfo = 'text',
opacity = 0.8,
marker = dict(size=8, color = gen_chrom_col(df_var_filt["chromosome"]), line=dict(color='DarkGrey',width=1))
)]
fig = dict({'data' : data, 'layout' : layout})
fig_out = go.Figure(fig)
fig_out.update_layout(title_text="Significant eGFR GWAS SNP's impact on CTCF NN logit score",
xaxis_showticklabels=False,
xaxis_title="Genomic location",
yaxis_title = "Logit score in NN activation")
fig_out.update_layout(template="plotly_white+xgridoff")
pio.write_html(fig_out, file= "CTCF_eGFR_vars_logit.html", auto_open=False)
In [17]:
fig_out.show()