notebook.community

Edit and run



In [ ]:

    
from environment import *

# Load the project configuration and derive the path lookup from it.
# JSON text is UTF-8, so decode explicitly instead of relying on the
# platform's default encoding. The handle is deliberately not named `io`
# so it cannot rebind the standard-library module of that name if the
# star import above brought it into scope.
with open("../project.json", encoding="utf-8") as project_json_file:
    PROJECT_JSON = json.load(project_json_file)

# make_path_dict comes from the `environment` star import; PATH is indexed
# by file/directory name in the cells below.
PATH = make_path_dict(PROJECT_JSON)



In [ ]:

    
# Read the three tab-separated tables, using the first column of each as
# the row index.
gene_x_sample, target_x_sample, gene_set_x_sample = (
    pd.read_csv(PATH[table_name], sep="\t", index_col=0)
    for table_name in (
        "gene_x_sample.processed.tsv",
        "target_x_sample.tsv",
        "gene_set_x_sample.tsv",
    )
)

# Stack every configured GMT file into a single gene-set table.
gene_sets = pd.concat(
    [kraft.read_gmt(path) for path in PROJECT_JSON["gene_set_file_paths"]]
)



In [ ]:

    
# For every target row, draw one match panel per gene set listed in
# PROJECT_JSON["gene_sets_to_peek"], writing outputs under
# PATH["expand_gene_set/"]/<target_name>/.

# Loop-invariant: names the per-gene statistics file read for each target.
match_function_name = "compute_information_coefficient"

for target_name, target_values in target_x_sample.iterrows():

    # Drop samples whose target value is -1.
    # NOTE(review): -1 presumably marks "no annotation" — confirm upstream.
    target_values = target_values[target_values != -1]

    target_value_counts = target_values.value_counts()

    # Skip targets with nothing left after filtering, or with any value seen
    # fewer than twice. The emptiness check is required: min() of an empty
    # Series is NaN and `NaN < 2` is False, so the count check alone would
    # let an empty target through.
    if target_value_counts.empty or target_value_counts.min() < 2:
        continue

    output_directory_path = os.path.join(PATH["expand_gene_set/"], target_name)

    kraft.establish_path(output_directory_path, "directory")

    # Stack the precomputed statistics tables for this target: one from the
    # gene-level directory and one from the gene-set-level directory.
    score_moe_p_value_fdr = pd.concat(
        (
            pd.read_csv(
                os.path.join(
                    PATH["find_differentially_expressed_gene/"],
                    target_name,
                    f"all.{match_function_name}.tsv",
                ),
                sep="\t",
                index_col=0,
            ),
            pd.read_csv(
                os.path.join(
                    PATH["find_differentially_expressed_gene_set/"],
                    target_name,
                    "all.tsv",
                ),
                sep="\t",
                index_col=0,
            ),
        )
    )

    for gene_set_name in PROJECT_JSON["gene_sets_to_peek"]:

        # Rows for the gene set's member genes, followed by the row for the
        # gene set itself. reindex keeps labels that are absent from the
        # table as all-NaN rows rather than raising.
        features = pd.concat(
            (
                gene_x_sample.reindex(gene_sets.loc[gene_set_name].dropna()),
                gene_set_x_sample.reindex((gene_set_name,)),
            )
        )

        # Only panels with fewer than 100 feature rows are drawn.
        if features.shape[0] >= 100:
            continue

        kraft.make_match_panel(
            target_values,
            features,
            score_moe_p_value_fdr=score_moe_p_value_fdr,
            n_extreme=None,
            target_data_type="binary",
            plot_std=PROJECT_JSON["plot_std"],
            title_text=gene_set_name,
            file_path_prefix=os.path.join(output_directory_path, gene_set_name),
        )