In [ ]:
from environment import *
# Load the project configuration. The encoding is given explicitly so that
# decoding does not depend on the platform's locale default, and the handle is
# not called `io` so it cannot shadow the standard-library module of that name.
with open("../project.json", encoding="utf-8") as project_json_file:
    PROJECT_JSON = json.load(project_json_file)

# Build the lookup that the rest of the notebook indexes by file or directory
# name (e.g. PATH["gene_x_sample.processed.tsv"]). `make_path_dict` is provided
# by the `environment` star-import.
PATH = make_path_dict(PROJECT_JSON)
In [ ]:
# Tab-separated tables; in each one the first column becomes the row index.
gene_x_sample = pd.read_csv(
    PATH["gene_x_sample.processed.tsv"],
    sep="\t",
    index_col=0,
)
gene_set_x_sample = pd.read_csv(
    PATH["gene_set_x_sample.tsv"],
    sep="\t",
    index_col=0,
)
target_x_sample = pd.read_csv(
    PATH["target_x_sample.tsv"],
    sep="\t",
    index_col=0,
)

# Read every GMT file listed in the project configuration with kraft.read_gmt
# and stack the resulting tables into a single one.
gmt_tables = [
    kraft.read_gmt(gmt_file_path)
    for gmt_file_path in PROJECT_JSON["gene_set_file_paths"]
]
gene_sets = pd.concat(gmt_tables)
In [ ]:
# Name used to pick the per-gene result file ("all.<name>.tsv") below. It is
# the same for every target, so it is set once instead of on each iteration.
match_function_name = "compute_information_coefficient"

# For every row of target_x_sample, draw one match panel per gene set listed in
# PROJECT_JSON["gene_sets_to_peek"], written under
# PATH["expand_gene_set/"]/<target_name>/.
for target_name, target_values in target_x_sample.iterrows():

    # Drop the samples whose value is -1
    # (presumably the "not available" marker -- TODO confirm).
    target_values = target_values[target_values != -1]

    # Skip the target unless every observed value occurs at least twice.
    # The emptiness check matters: when no value is left, value_counts() is
    # empty, its min() is NaN, and `NaN < 2` is False, so the target would
    # otherwise slip through with nothing to match against.
    class_counts = target_values.value_counts()
    if class_counts.empty or class_counts.min() < 2:
        continue

    output_directory_path = os.path.join(PATH["expand_gene_set/"], target_name)
    kraft.establish_path(output_directory_path, "directory")

    # Stack the previously saved per-gene and per-gene-set result tables for
    # this target into one table that is handed to make_match_panel.
    score_moe_p_value_fdr = pd.concat(
        (
            pd.read_csv(
                os.path.join(
                    PATH["find_differentially_expressed_gene/"],
                    target_name,
                    f"all.{match_function_name}.tsv",
                ),
                sep="\t",
                index_col=0,
            ),
            pd.read_csv(
                os.path.join(
                    PATH["find_differentially_expressed_gene_set/"],
                    target_name,
                    "all.tsv",
                ),
                sep="\t",
                index_col=0,
            ),
        )
    )

    for gene_set_name in PROJECT_JSON["gene_sets_to_peek"]:

        # Rows for the (non-missing) entries of this gene set, followed by the
        # row for the gene set itself. reindex() yields all-NaN rows for labels
        # that are absent from the respective table.
        features = pd.concat(
            (
                gene_x_sample.reindex(gene_sets.loc[gene_set_name].dropna()),
                gene_set_x_sample.reindex((gene_set_name,)),
            )
        )

        # Only draw panels that have fewer than 100 feature rows.
        if features.shape[0] < 100:
            kraft.make_match_panel(
                target_values,
                features,
                score_moe_p_value_fdr=score_moe_p_value_fdr,
                n_extreme=None,
                target_data_type="binary",
                plot_std=PROJECT_JSON["plot_std"],
                title_text=gene_set_name,
                file_path_prefix=os.path.join(output_directory_path, gene_set_name),
            )