In [ ]:
from environment import *

# Load the project configuration.
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding, and the handle is not named `io` so it cannot
# shadow the standard-library `io` module in the notebook namespace.
with open("../project.json", encoding="utf-8") as project_json_file:

    PROJECT_JSON = json.load(project_json_file)

# Build the PATH lookup from the configuration.
# NOTE(review): `make_path_dict` comes from the star import of `environment`;
# its exact return type is not visible here — it is indexed like a dict below.
PATH = make_path_dict(PROJECT_JSON)

In [ ]:
# Tab-separated table whose first column becomes the row index.
gene_x_sample = pd.read_csv(
    PATH["gene_x_sample.processed.tsv"],
    sep="\t",
    index_col=0,
)

# Read every configured GMT file and stack the results into a single frame.
# NOTE(review): presumably one row per gene set — confirm against `kraft.read_gmt`.
gene_sets = pd.concat(
    [kraft.read_gmt(path) for path in PROJECT_JSON["gene_set_file_paths"]]
)

In [ ]:
# Summary table with the same row index as `gene_sets`.
gene_set_x_information = pd.DataFrame(index=gene_sets.index)

# Cells of `gene_sets` that actually hold a value; missing cells are excluded
# from both counts below so that "Size" and "N" are computed over the same genes.
# NOTE(review): presumably rows are padded with missing values to a common
# width by `kraft.read_gmt` — confirm.
is_not_missing = gene_sets.notna()

# Size: number of non-missing entries in each row.
gene_set_x_information["Size"] = is_not_missing.sum(axis=1)

# N: number of non-missing entries that appear in the index of `gene_x_sample`.
# Masking with `is_not_missing` keeps padding from being counted even if the
# data index itself contains a missing label.
gene_set_x_information["N"] = (
    gene_sets.isin(gene_x_sample.index) & is_not_missing
).sum(axis=1)

# Fraction of each row's entries that are found in the data.
gene_set_x_information["Fraction"] = (
    gene_set_x_information["N"] / gene_set_x_information["Size"]
)

# Select on both criteria (threshold values come from the project configuration)
# and combine the two selections with `&`.
# NOTE(review): the return type of `kraft.select_series_indices` is not visible
# here; `&` is kept exactly as written — confirm it yields the intended
# combination on the installed pandas version.
selected_gene_sets = kraft.select_series_indices(
    gene_set_x_information["N"],
    ">",
    thresholds=(PROJECT_JSON["gene_set_minimum_n"],),
    title={"text": "Good Gene Sets"},
    yaxis={"title": "Number of Genes in the Data"},
) & kraft.select_series_indices(
    gene_set_x_information["Fraction"],
    ">",
    thresholds=(PROJECT_JSON["gene_set_minimum_fraction"],),
    title={"text": "Good Gene Sets"},
    yaxis={"title": "Fraction of Genes in the Data"},
)

# Flag column: 1.0 for selected rows, 0.0 otherwise.
# The column is created up front (as float, matching the values previously
# written to the TSV) instead of filling NaN afterwards with a chained
# `fillna(..., inplace=True)`, which does not update the frame under pandas
# Copy-on-Write. This also guarantees the column exists if nothing is selected.
gene_set_x_information["Good"] = 0.0

gene_set_x_information.loc[selected_gene_sets, "Good"] = 1

print(gene_set_x_information["Good"].value_counts())

gene_set_x_information.to_csv(PATH["gene_set_x_information.tsv"], sep="\t")

# Last expression of the cell: display the table ordered by the flag.
gene_set_x_information.sort_values("Good")

In [ ]:
# Display the summary rows for the gene sets listed under "gene_sets_to_peek"
# in the project configuration, ordered by the "Good" flag.
(
    gene_set_x_information
    .reindex(index=PROJECT_JSON["gene_sets_to_peek"])
    .sort_values(by="Good")
)