In [ ]:
from environment import *
# Load the project configuration and derive the path lookup from it.
# The handle is not named `io`, so the standard-library module name is not
# left bound to a closed file for the rest of the notebook; the encoding is
# explicit because JSON text is UTF-8 regardless of the platform locale.
with open("../project.json", encoding="utf-8") as project_json_file:
    PROJECT_JSON = json.load(project_json_file)

# make_path_dict comes from `environment`; PATH is indexed by file name below.
PATH = make_path_dict(PROJECT_JSON)
In [ ]:
# Processed gene-by-sample table: tab-separated, first column used as the index.
gene_x_sample = pd.read_csv(
    PATH["gene_x_sample.processed.tsv"],
    sep="\t",
    index_col=0,
)

# Read every configured .gmt file and stack the results into one table.
gene_sets = pd.concat(map(kraft.read_gmt, PROJECT_JSON["gene_set_file_paths"]))
In [ ]:
# Build a per-gene-set summary table (Size, N, Fraction, Good), save it, and
# display it sorted by the "Good" flag.
gene_set_x_information = pd.DataFrame(index=gene_sets.index)

# Size: number of non-missing entries in each gene set's row.
gene_set_x_information["Size"] = gene_sets.count(axis=1)

# N: number of row entries that are present in the index of gene_x_sample.
gene_set_x_information["N"] = gene_sets.apply(
    lambda genes: sum(gene in gene_x_sample.index for gene in genes), axis=1
)

# Fraction: share of the gene set's entries that are present in the data.
gene_set_x_information["Fraction"] = (
    gene_set_x_information["N"] / gene_set_x_information["Size"]
)

# A gene set is selected only if it passes both thresholds (N and Fraction).
# NOTE(review): the meaning of `&` here depends on what
# kraft.select_series_indices returns (boolean mask vs. Index) - confirm
# against the installed kraft and pandas versions.
selected_gene_sets = kraft.select_series_indices(
    gene_set_x_information["N"],
    ">",
    thresholds=(PROJECT_JSON["gene_set_minimum_n"],),
    title={"text": "Good Gene Sets"},
    yaxis={"title": "Number of Genes in the Data"},
) & kraft.select_series_indices(
    gene_set_x_information["Fraction"],
    ">",
    thresholds=(PROJECT_JSON["gene_set_minimum_fraction"],),
    title={"text": "Good Gene Sets"},
    yaxis={"title": "Fraction of Genes in the Data"},
)

# Initialise the flag to 0.0 for every gene set, then mark the selected ones.
# This replaces `df["Good"].fillna(..., inplace=True)`, a chained in-place
# update that does not modify the DataFrame under pandas Copy-on-Write. The
# column stays float (0.0 / 1.0), as it was with the NaN-then-fill approach.
gene_set_x_information["Good"] = 0.0
gene_set_x_information.loc[selected_gene_sets, "Good"] = 1

print(gene_set_x_information["Good"].value_counts())

gene_set_x_information.to_csv(PATH["gene_set_x_information.tsv"], sep="\t")

# Last expression of the cell: shown as the cell output.
gene_set_x_information.sort_values("Good")
In [ ]:
# Show the summary rows for the gene sets listed under "gene_sets_to_peek",
# ordered by the "Good" flag; names absent from the table appear as NaN rows.
gene_set_x_information.reindex(
    index=PROJECT_JSON["gene_sets_to_peek"],
).sort_values(by="Good")