In [ ]:
# Default parameter values for this notebook when it is executed by `papermill`.
# `filename` is the input path passed to `helpers.load_file` further down.
# NOTE(review): presumably papermill overrides this value per run — confirm
# against the papermill invocation / cell tagging used for this report.
filename = '../sample_data/postgap.20180817.asthma.txt.gz'
In [ ]:
from reports import helpers
In [ ]:
# Called with no arguments, before any data is loaded.
# NOTE(review): presumably produces a string describing this run (judging by
# the name) — confirm in reports.helpers.
helpers.calc_run_str()
In [ ]:
# Load the file named by `filename` into `pg` through the shared helper; every
# later cell passes `pg` to a `helpers.calc_*` function.
# The commented-out line shows a direct pandas read (tab-separated, 'None'
# treated as missing) — presumably `helpers.load_file` wraps an equivalent
# call; confirm in reports.helpers.
# pg = pd.read_csv(filename, sep='\t', na_values=['None'])
pg = helpers.load_file(filename)
Q: How many rows and columns?
In [ ]:
# Print the `shape` attribute of the loaded data.
# NOTE(review): presumably a (rows, columns) tuple, assuming `pg` is a pandas
# DataFrame — confirm what `helpers.load_file` returns.
print(pg.shape)
Q: How many unique target-disease associations?
In [ ]:
# Only `pg` is passed; the columns used to form pairs are chosen inside the helper.
# NOTE(review): "g2d" presumably stands for gene-to-disease — confirm in
# reports.helpers.
helpers.calc_g2d_pair_counts(pg)
Q: What is the distribution of unique diseases per gene? And vice versa?
In [ ]:
# Arguments after `pg`: two column names ('gene_id', 'disease_efo_id') followed
# by two strings ('Gene', 'Disease').
# NOTE(review): the last two are presumably display labels for those columns —
# confirm in reports.helpers.
helpers.calc_pairwise_degree_dist(pg, 'gene_id', 'disease_efo_id', 'Gene', 'Disease')
Q: How many unique values appear for each identifier?
In [ ]:
# Only `pg` is passed: which columns count as "identifier" fields is decided
# inside reports.helpers, not in this notebook.
helpers.calc_id_field_counts(pg)
Q: What is the maximum number of rows for a given fixed identifier?
In [ ]:
# Only `pg` is passed: the identifier columns examined are chosen inside
# reports.helpers.
# NOTE(review): presumably reports, per identifier column, the largest number
# of rows sharing one value — confirm in reports.helpers.
helpers.calc_id_field_max_rows(pg)
Q: How many unique identifier pairs appear?
In [ ]:
# Only `pg` is passed: which identifier columns are paired up is decided
# inside reports.helpers.
helpers.calc_id_field_pair_counts(pg)
Q: What is the distribution of each association subscore (VEP, GTEx, etc.)?
In [ ]:
# Only `pg` is passed: the set of subscore columns is chosen inside the helper.
# NOTE(review): "g2v" presumably stands for gene-to-variant, and "hists" for
# histograms — confirm in reports.helpers.
helpers.calc_g2v_field_hists(pg)
Q: What is the distribution of unique LD SNPs per gene? And vice versa?
In [ ]:
# Same helper as the gene/disease cell, here with columns 'gene_id' and
# 'ld_snp_rsID' and the strings 'Gene' and 'LD SNP'.
# NOTE(review): the last two are presumably display labels — confirm in
# reports.helpers.
helpers.calc_pairwise_degree_dist(pg, 'gene_id', 'ld_snp_rsID', 'Gene', 'LD SNP')
Q: How much do the association subscores overlap in which rows they are present for?
In [ ]:
# Only `pg` is passed: the subscore columns compared, and how "presence" is
# defined, are decided inside reports.helpers.
# NOTE(review): presumably "present" means non-missing — confirm.
helpers.calc_g2v_field_overlap(pg)
Q: What is the joint distribution of each pair of association subscores (i.e., how correlated are they)?
In [ ]:
# Only `pg` is passed: which subscore pairs are compared is decided inside
# reports.helpers.
# NOTE(review): presumably one joint distribution per pair of subscore
# columns — confirm.
helpers.calc_g2v_field_cross_dists(pg)
Q: What is the distribution of r2?
In [ ]:
# Only `pg` is passed: the column holding r2 is selected inside the helper.
# NOTE(review): presumably r2 is the LD r-squared between the GWAS SNP and the
# LD SNP — confirm against the input file's schema.
helpers.calc_dist_r2(pg)
Q: What is the distribution of unique GWAS SNPs per LD SNP? And vice versa?
In [ ]:
# Same helper as the gene/disease cell, here with columns 'ld_snp_rsID' and
# 'gwas_snp' and the strings 'LD SNP' and 'GWAS SNP'.
# NOTE(review): the last two are presumably display labels — confirm in
# reports.helpers.
helpers.calc_pairwise_degree_dist(pg, 'ld_snp_rsID', 'gwas_snp', 'LD SNP', 'GWAS SNP')
Q: What are the distributions of (gwas_pvalue, gwas_beta, gwas_odds_ratio)?
In [ ]:
# Only `pg` is passed: the fields to plot are chosen inside the helper.
# NOTE(review): "v2d" presumably stands for variant-to-disease, and the fields
# are presumably gwas_pvalue, gwas_beta and gwas_odds_ratio as named in the
# question above — confirm in reports.helpers.
helpers.calc_v2d_field_hists(pg)
Q: What is the distribution of unique diseases per GWAS SNP? And vice versa?
In [ ]:
# Same helper as the gene/disease cell, here with columns 'gwas_snp' and
# 'disease_efo_id' and the strings 'GWAS SNP' and 'Disease'.
# NOTE(review): the last two are presumably display labels — confirm in
# reports.helpers.
helpers.calc_pairwise_degree_dist(pg, 'gwas_snp', 'disease_efo_id', 'GWAS SNP', 'Disease')