Setting


In [ ]:
import os, sys, psutil, difflib
from collections import OrderedDict

import numpy as np
import scipy as sp
import pandas as pd
import gseapy as gp

from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from bionlp.util import fs, io, func, njobs
from bionlp import txtclf

LABEL2IDX = {'gene perturbation':2, 'drug perturbation':1, 'disease signature':0}
LABEL2OBJ = {'gene perturbation':'hs_gene_symbol', 'drug perturbation':'drug_name', 'disease signature':'disease_name'}
RUN_LABEL = 'drug perturbation'
_RUN_LABEL = RUN_LABEL.replace(' ', '_')
DGE_METHOD = 'limma-fdr'
DATA_PATH = '../../data/gesgnext'
GE_PATH = '../../data/gesgnext/gedata/%s' % _RUN_LABEL
DGE_PATH = '../../data/gesgnext/dge/%s/%s' % (DGE_METHOD, _RUN_LABEL)
DGE_DATA_PATH = '../../data/gesgnext/dge/%s' % _RUN_LABEL
# DGE_CACHE_PATH = '../../data/gesgnext/dge/cache/%s/%s' % (_RUN_LABEL, DGE_METHOD)
GEO_PATH = '../../data/gesgnext/geo'
GSE_DIR = '../../data/gesgnext/geo/xml/%s' % _RUN_LABEL
SAMP_DIR = '../../data/gesgnext/geo/xml/%s/samples' % _RUN_LABEL
PLATFORM_PATH = '../../data/gesgnext/geo/xml/%s/platforms' % _RUN_LABEL
SGNDB_PATH = '../../data/gesgnext/sgndb/%s' % _RUN_LABEL
WIKIPATHWAYS_PATH = '../../data/gesgnext/wikipathways'

# probe_gene_map = io.read_obj(os.path.join(PLATFORM_PATH, 'probe_gene_map.pkl'))
probe_gene_map = None
SGN_MIN_SIZE, SGN_MAX_SIZE = 5, 100
SC=' /// '

Read and Construct Data


In [ ]:
# Signatures
# sgn_df = pd.read_csv(os.path.join(DATA_PATH, '%s.csv'%RUN_LABEL.replace(' ', '_')))
sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s.xlsx'%RUN_LABEL.replace(' ', '_')))
# Differential gene expression
# dge_dfs = [io.read_df(os.path.join(DGE_PATH, fname), with_idx=True) for fname in ['dge_%s.npz'%x for x in range(sgn_df.shape[0])]]
dge_dfs = [io.read_df(os.path.join(DGE_PATH, fname), with_idx=True) for fname in ['dge_%s.npz'%sgn_id for sgn_id in sgn_df['id']]]
# dge_dfs = [io.read_df(os.path.join(DGE_PATH, 'dge_%s.npz'%sgn_id.split(':')[-1]), with_idx=True) for sgn_id in sgn_df['id']]
# dge_dfs = [io.read_df(os.path.join(DGE_CACHE_PATH, '%s.npz'%sgn_id)) for sgn_id in sgn_df['id']]

for geo_id, sgn_ids in sgn_df.groupby('geo_id').groups.iteritems():
    # Training data for classifier
    sub_sgn_df = sgn_df.loc[sgn_ids]
    sub_dge_dfs = [dge_dfs[i] for i in sgn_ids]
    dge_X = pd.concat([dge_df['statistic'].to_frame() for dge_df in sub_dge_dfs], axis=1, join='inner')
#     dge_X = pd.concat([dge_df['t'].to_frame() for dge_df in sub_dge_dfs], axis=1, join='inner')
    dge_X.columns = sub_sgn_df['id']
    dge_X = dge_X.transpose()
    io.write_df(dge_X, os.path.join(DGE_DATA_PATH, 'dge_X_%s.npz'%geo_id), with_idx=True, compress=True)
    # Label Construction
    mlb = MultiLabelBinarizer()
    bin_label = (mlb.fit_transform(sub_sgn_df[LABEL2OBJ[RUN_LABEL]].apply(str).as_matrix().reshape(-1,1)), mlb.classes_)
    io.write_df(pd.DataFrame(bin_label[0], index=dge_X.index, columns=bin_label[1]), os.path.join(DGE_DATA_PATH, 'dge_Y_%s.npz'%geo_id), with_idx=True, sparse_fmt='csr', compress=True)
    le = LabelEncoder()
    encoded_lb = (le.fit_transform(sub_sgn_df[LABEL2OBJ[RUN_LABEL]].apply(str).as_matrix()), le.classes_)
    io.write_df(pd.DataFrame(encoded_lb[0], index=dge_X.index, columns=[';'.join(['%i:%s'%(i,x) for i, x in enumerate(encoded_lb[1])])]), os.path.join(DGE_DATA_PATH, 'dge_ecY_%s.npz'%geo_id), with_idx=True, compress=True)
    del dge_X, bin_label, encoded_lb

Read and Construct Data Parallel


In [ ]:
def sgn2dgeg(groups, sgn_df, dge_dir, dgeg_dir):
    for geo_id, sgn_ids in groups:
        # Training data for classifier
        sub_sgn_df = sgn_df.loc[sgn_ids]
#         sub_dge_dfs = [dge_dfs[i] for i in sgn_ids]
        sub_dge_dfs = [io.read_df(os.path.join(dge_dir, fname), with_idx=True) for fname in ['dge_%s.npz'%sgn_id for sgn_id in sub_sgn_df['id']]]
        dge_X = pd.concat([dge_df['statistic'].to_frame() for dge_df in sub_dge_dfs], axis=1, join='inner')
        dge_X.columns = sub_sgn_df['id']
        dge_X = dge_X.transpose()
        io.write_df(dge_X, os.path.join(dgeg_dir, 'dge_X_%s.npz'%geo_id), with_idx=True, compress=True)
        # Label Construction
        mlb = MultiLabelBinarizer()
        bin_label = (mlb.fit_transform(sub_sgn_df[LABEL2OBJ[RUN_LABEL]].apply(str).as_matrix().reshape(-1,1)), mlb.classes_)
        io.write_df(pd.DataFrame(bin_label[0], index=dge_X.index, columns=bin_label[1]), os.path.join(dgeg_dir, 'dge_Y_%s.npz'%geo_id), with_idx=True, sparse_fmt='csr', compress=True)
        le = LabelEncoder()
        encoded_lb = (le.fit_transform(sub_sgn_df[LABEL2OBJ[RUN_LABEL]].apply(str).as_matrix()), le.classes_)
        io.write_df(pd.DataFrame(encoded_lb[0], index=dge_X.index, columns=[';'.join(['%i:%s'%(i,x) for i, x in enumerate(encoded_lb[1])])]), os.path.join(DGE_DATA_PATH, 'dge_ecY_%s.npz'%geo_id), with_idx=True, compress=True)
        del dge_X, bin_label, encoded_lb


sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s.xlsx'%RUN_LABEL.replace(' ', '_')))

groups = sgn_df.groupby('geo_id').groups.items()
numprocs = psutil.cpu_count()
task_bnd = njobs.split_1d(len(groups), split_num=numprocs, ret_idx=True)
_ = njobs.run_pool(sgn2dgeg, n_jobs=numprocs, dist_param=['groups'], groups=[groups[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], sgn_df=sgn_df, dge_dir=DGE_PATH, dgeg_dir=DGE_DATA_PATH)

Extract Gene Set


In [ ]:
sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s.xlsx' % _RUN_LABEL))
idx_sgn_df = sgn_df.set_index('id')
probe_gene_map = io.read_obj(os.path.join(PLATFORM_PATH, 'probe_gene_map.pkl'))
keep_unkown_probe, hist_bnd = False, (-2, 1)
udr_genes = []
for dge_X_fpath in fs.listf(DGE_DATA_PATH, pattern='dge_X_.*\.npz', full_path=True):
    dge_X = io.read_df(dge_X_fpath, with_idx=True).replace([np.inf, -np.inf], np.nan).fillna(0)
    if (np.all(pd.isnull(dge_X.as_matrix()))): continue
    # Filter out the probes that cannot be converted to gene symbols
    plfm = idx_sgn_df['platform'].loc[dge_X.index[0]]
    has_plfm = probe_gene_map and probe_gene_map.has_key(plfm) and not probe_gene_map[plfm].empty
    if (has_plfm and not keep_unkown_probe):
        pgmap = probe_gene_map[plfm]
        columns = [col for col in dge_X.columns if col in pgmap.index and pgmap.loc[col] and not pgmap.loc[col].isspace()]
        dge_X = dge_X[columns]

    hist, bin_edges = zip(*[np.histogram(dge_X.iloc[i]) for i in range(dge_X.shape[0])])
    uprg = [dge_X.iloc[i, np.where(dge_X.iloc[i] >= bin_edges[i][hist_bnd[0]])[0]].sort_values(ascending=False) for i in range(dge_X.shape[0])]
    dwrg = [dge_X.iloc[i, np.where(dge_X.iloc[i] <= bin_edges[i][hist_bnd[1]])[0]].sort_values(ascending=True) for i in range(dge_X.shape[0])]
    upr_genes, dwr_genes = [x.index.tolist() for x in uprg], [x.index.tolist() for x in dwrg]
    upr_dges, dwr_dges = [x.tolist() for x in uprg], [x.tolist() for x in dwrg]
    del uprg, dwrg

    # Map to Gene Symbol
    if (has_plfm):
        pgmap = probe_gene_map[plfm]
        upr_genes = [[[x.strip() for x in pgmap.loc[probe].split('///')] if (probe in pgmap.index) else [probe] for probe in probes] for probes in upr_genes]
        uprg_lens = [[len(x) for x in genes] for genes in upr_genes]
        upr_dges = [[[dge] * length for dge, length in zip(dges, lens)] for dges, lens in zip(upr_dges, uprg_lens)]
        upr_genes = [func.flatten_list(probes) for probes in upr_genes]
        upr_dges = [func.flatten_list(dges) for dges in upr_dges]
        dwr_genes = [[[x.strip() for x in pgmap.loc[probe].split('///')] if (probe in pgmap.index) else [probe] for probe in probes] for probes in dwr_genes]
        dwrg_lens = [[len(x) for x in genes] for genes in dwr_genes]
        dwr_dges = [[[dge] * length for dge, length in zip(dges, lens)] for dges, lens in zip(dwr_dges, dwrg_lens)]
        dwr_genes = [func.flatten_list(probes) for probes in dwr_genes]
        dwr_dges = [func.flatten_list(dges) for dges in dwr_dges]
    udr_genes.append(pd.DataFrame(OrderedDict([('up_regulated_genes', ['|'.join(map(str, x[:SGN_MAX_SIZE])) for x in upr_genes]), ('down_regulated_genes', ['|'.join(map(str, x[-SGN_MAX_SIZE:])) for x in dwr_genes]), ('up_regulated_dges', ['|'.join(map(str, x[:SGN_MAX_SIZE])) for x in upr_dges]), ('down_regulated_dges', ['|'.join(map(str, x[-SGN_MAX_SIZE:])) for x in dwr_dges])]), index=dge_X.index))
    del upr_genes, dwr_genes, upr_dges, dwr_dges
    if (has_plfm): del uprg_lens, dwrg_lens
new_sgn_df = pd.concat([idx_sgn_df, pd.concat(udr_genes, axis=0, join='inner')], axis=1, join_axes=[idx_sgn_df.index])
new_sgn_fpath = os.path.join(DATA_PATH, '%s_udrg.xlsx' % _RUN_LABEL)
io.write_df(new_sgn_df, new_sgn_fpath, with_idx=True)
new_sgn_df.to_excel(new_sgn_fpath, encoding='utf8')

Extract Gene Set Parallel


In [ ]:
def dge2udrg(sgn_dge_fpaths, sgn_df, probe_gene_map, keep_unkown_probe=False, hist_bnd=(-2, 1)):
    udr_genes = []
    for sgn_dge_fpath in sgn_dge_fpaths:
        sgn_dge = io.read_df(sgn_dge_fpath, with_idx=True).replace([np.inf, -np.inf], np.nan).fillna(0)
        sgn_dge = sgn_dge.loc[[x for x in sgn_dge.index if x in sgn_df.index]]
        if (np.all(pd.isnull(sgn_dge))): continue
        # Filter out the probes that cannot be converted to gene symbols
        plfm = sgn_df['platform'].loc[sgn_dge.index[0]]
        has_plfm = probe_gene_map and probe_gene_map.has_key(plfm) and not probe_gene_map[plfm].empty
        if (has_plfm and not keep_unkown_probe):
            pgmap = probe_gene_map[plfm]
            columns = [col for col in sgn_dge.columns if col in pgmap.index and pgmap.loc[col] and not pgmap.loc[col].isspace()]
            sgn_dge = sgn_dge[columns]
        
        hist, bin_edges = zip(*[np.histogram(sgn_dge.iloc[i]) for i in range(sgn_dge.shape[0])])
        uprg = [sgn_dge.iloc[i, np.where(sgn_dge.iloc[i] >= bin_edges[i][hist_bnd[0]])[0]].sort_values(ascending=False) for i in range(sgn_dge.shape[0])]
        dwrg = [sgn_dge.iloc[i, np.where(sgn_dge.iloc[i] <= bin_edges[i][hist_bnd[1]])[0]].sort_values(ascending=True) for i in range(sgn_dge.shape[0])]
        upr_genes, dwr_genes = [x.index.tolist() for x in uprg], [x.index.tolist() for x in dwrg]
        upr_dges, dwr_dges = [x.tolist() for x in uprg], [x.tolist() for x in dwrg]
        del uprg, dwrg

        # Map to Gene Symbol
        if (has_plfm):
            pgmap = probe_gene_map[plfm]
            upr_genes = [[[x.strip() for x in pgmap.loc[probe].split('///')] if (probe in pgmap.index) else [probe] for probe in probes] for probes in upr_genes]
            uprg_lens = [[len(x) for x in genes] for genes in upr_genes]
            upr_dges = [[[dge] * length for dge, length in zip(dges, lens)] for dges, lens in zip(upr_dges, uprg_lens)]
            upr_genes = [func.flatten_list(probes) for probes in upr_genes]
            upr_dges = [func.flatten_list(dges) for dges in upr_dges]
            dwr_genes = [[[x.strip() for x in pgmap.loc[probe].split('///')] if (probe in pgmap.index) else [probe] for probe in probes] for probes in dwr_genes]
            dwrg_lens = [[len(x) for x in genes] for genes in dwr_genes]
            dwr_dges = [[[dge] * length for dge, length in zip(dges, lens)] for dges, lens in zip(dwr_dges, dwrg_lens)]
            dwr_genes = [func.flatten_list(probes) for probes in dwr_genes]
            dwr_dges = [func.flatten_list(dges) for dges in dwr_dges]
        filtered_ids = []
        for sid, uprg, dwrg in zip(sgn_dge.index, upr_genes, dwr_genes):
            if (len(uprg) < SGN_MIN_SIZE and len(dwrg) < SGN_MIN_SIZE):
                filtered_ids.append(sid)
        udr_genes.append(pd.DataFrame(OrderedDict([('up_regulated_genes', ['|'.join(map(str, x[:SGN_MAX_SIZE])) for x in upr_genes]), ('down_regulated_genes', ['|'.join(map(str, x[-SGN_MAX_SIZE:])) for x in dwr_genes]), ('up_regulated_dges', ['|'.join(map(str, x[:SGN_MAX_SIZE])) for x in upr_dges]), ('down_regulated_dges', ['|'.join(map(str, x[-SGN_MAX_SIZE:])) for x in dwr_dges])]), index=sgn_dge.index).loc[[sid for sid in sgn_dge.index if sid not in filtered_ids]])
        del upr_genes, dwr_genes, upr_dges, dwr_dges
        if (has_plfm): del uprg_lens, dwrg_lens
    return pd.concat(udr_genes, axis=0, join='inner')

sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s.xlsx' % _RUN_LABEL))
idx_sgn_df = sgn_df.set_index('id')
keep_unkown_probe, hist_bnd = False, (-4, 3)

numprocs = psutil.cpu_count()
sgn_dge_fpaths = fs.listf(DGE_DATA_PATH, pattern='dge_X_.*\.npz', full_path=True)
task_bnd = njobs.split_1d(len(sgn_dge_fpaths), split_num=numprocs, ret_idx=True)
# udr_genes = dge2udrg(sgn_dge_fpaths=sgn_dge_fpaths, sgn_df=idx_sgn_df, probe_gene_map=probe_gene_map, keep_unkown_probe=keep_unkown_probe, hist_bnd=hist_bnd)
udr_genes = njobs.run_pool(dge2udrg, n_jobs=numprocs, dist_param=['sgn_dge_fpaths'], sgn_dge_fpaths=[sgn_dge_fpaths[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], sgn_df=idx_sgn_df, probe_gene_map=probe_gene_map, keep_unkown_probe=keep_unkown_probe, hist_bnd=hist_bnd)
new_sgn_df = pd.concat([idx_sgn_df, pd.concat(udr_genes, axis=0, join='inner')], axis=1, join_axes=[idx_sgn_df.index])
new_sgn_fpath = os.path.join(DATA_PATH, '%s_udrg.xlsx' % _RUN_LABEL)
io.write_df(new_sgn_df, new_sgn_fpath, with_idx=True)
new_sgn_df.to_excel(new_sgn_fpath, encoding='utf8')

Generate Signature Database


In [ ]:
def gen_sgndb(groups, udrg_sgn_df, sgndb_path):
    for geo_id, sgn_ids in groups:
        sub_sgn_df = udrg_sgn_df.loc[sgn_ids]
        # Combined signature database
#         db_content = '\n'.join(['%s\t%s\t%s' % (idx, ('%s:%s:%s'%(row['organism'], row['cell_type'], row[LABEL2OBJ[RUN_LABEL]])).replace(' ', '_'), '\t'.join(row['up_regulated_genes'].split('|')+row['down_regulated_genes'].split('|'))) for idx, row in sub_sgn_df.iterrows()])
#         fs.write_file(db_content, os.path.join(sgndb_path, '%s.gmt'%geo_id), code='utf-8')
#         del db_content
        
        # Up-regulated signature database
        up_db_content = '\n'.join(['%s\t%s\t%s' % (idx, ('%s:%s:%s'%(row['organism'], row['cell_type'], row[LABEL2OBJ[RUN_LABEL]])).replace(' ', '_'), '\t'.join(row['up_regulated_genes'].split('|'))) for idx, row in sub_sgn_df.iterrows()])
        fs.write_file(up_db_content, os.path.join(sgndb_path, '%s_up.gmt'%geo_id), code='utf-8')
        del up_db_content
        # Down-regulated signature database
        down_db_content = '\n'.join(['%s\t%s\t%s' % (idx, ('%s:%s:%s'%(row['organism'], row['cell_type'], row[LABEL2OBJ[RUN_LABEL]])).replace(' ', '_'), '\t'.join(row['down_regulated_genes'].split('|'))) for idx, row in sub_sgn_df.iterrows()])
        fs.write_file(down_db_content, os.path.join(sgndb_path, '%s_down.gmt'%geo_id), code='utf-8')
        del down_db_content
#         print [len(row['up_regulated_genes'].split('|')) for idx, row in sub_sgn_df.iterrows()]
#         print [len(row['down_regulated_genes'].split('|')) for idx, row in sub_sgn_df.iterrows()]


fs.mkdir(SGNDB_PATH)
udrg_sgn_df = io.read_df(os.path.join(DATA_PATH, '%s_udrg.xlsx'%RUN_LABEL.replace(' ', '_')), with_idx=True).dropna()
groups = udrg_sgn_df.groupby('geo_id').groups.items()
numprocs = psutil.cpu_count()
task_bnd = njobs.split_1d(len(groups), split_num=numprocs, ret_idx=True)
_ = njobs.run_pool(gen_sgndb, n_jobs=numprocs, dist_param=['groups'], groups=[groups[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], udrg_sgn_df=udrg_sgn_df, sgndb_path=SGNDB_PATH)

Gene Set Enrichment Analysis


In [ ]:
def gsea(groups, udrg_sgn_df, probe_gene_map, sgndb_path, sample_path, method='signal_to_noise', permt_type='phenotype', permt_num=100, min_size=15, max_size=500, out_dir='gsea_output', keep_unkown_probe=False, fmt='xml', numprocs=1):
    if (fmt == 'soft'):
        from bionlp.spider import geo
    else:
        from bionlp.spider import geoxml as geo
    for geo_id, sgn_ids in groups:
        # Select the sub signature table
        sub_sgn_df = udrg_sgn_df.loc[sgn_ids]
        ids = sub_sgn_df['id'] if hasattr(sub_sgn_df, 'id') else sub_sgn_df.index
        # Prepair the gene expression profile and the perturbation labels
        pert_ids, ctrl_ids = list(set('|'.join(sub_sgn_df['pert_ids']).split('|'))), list(set('|'.join(sub_sgn_df['ctrl_ids']).split('|')))
        pert_geo_docs, ctrl_geo_docs = geo.parse_geos([os.path.join(sample_path, '.'.join([pert_id, fmt])) for pert_id in pert_ids], view='full', type='gsm', fmt=fmt), geo.parse_geos([os.path.join(sample_path, '.'.join([ctrl_id, fmt])) for ctrl_id in ctrl_ids], view='full', type='gsm', fmt=fmt)
        pert_ge_dfs, ctrl_ge_dfs = [geo_doc['data']['VALUE'] for geo_doc in pert_geo_docs], [geo_doc['data']['VALUE'] for geo_doc in ctrl_geo_docs]
        pert_df, ctrl_df = pd.concat(pert_ge_dfs, axis=1, join='inner').astype('float32'), pd.concat(ctrl_ge_dfs, axis=1, join='inner').astype('float32')
        pert_lb, ctrl_lb, class_vec = 'pert', 'ctrl', ['pert'] * pert_df.shape[1] + ['ctrl'] * ctrl_df.shape[1]
        join_df = pd.concat([pert_df, ctrl_df], axis=1, join='inner')
        join_df.columns = pert_ids + ctrl_ids
        del pert_geo_docs, ctrl_geo_docs, pert_ge_dfs[:], ctrl_ge_dfs[:], pert_df, ctrl_df
        # Map the probes to gene symbols
        plfm = sub_sgn_df['platform'].iloc[0]
        if (probe_gene_map and probe_gene_map.has_key(plfm) and not probe_gene_map[plfm].empty):
            pgmap = probe_gene_map[plfm]
            if (not keep_unkown_probe):
                probes = [idx for idx in join_df.index if idx in pgmap.index and pgmap.loc[idx] and not pgmap.loc[idx].isspace()]
                join_df = join_df.loc[probes]
            join_df.index = [[x.strip() for x in pgmap.loc[probe].split('///')][0] if (probe in pgmap.index) else [probe] for probe in join_df.index]

        join_df.reset_index(inplace=True)
        join_df.rename(columns={'ID_REF': 'NAME'}, inplace=True)
        join_df['NAME'] = join_df['NAME'].apply(str)
        # Call the GSEA API
#         try:
#             if (not os.path.exists(os.path.join(out_dir,geo_id)) or (os.path.exists(os.path.join(out_dir,geo_id)) and len(fs.read_file(os.path.join(sgndb_path, '%s.gmt'%geo_id))) > len(fs.listf(os.path.join(out_dir,geo_id), pattern='.*\.gsea\.pdf')))):
#                 print 'doing '+geo_id
#                 gs_res = gp.gsea(data=join_df, gene_sets=os.path.join(sgndb_path, '%s.gmt'%geo_id), cls=class_vec, permutation_type=permt_type, permutation_num=permt_num, min_size=min_size, max_size=max_size, outdir=os.path.join(out_dir,geo_id), method=method, processes=numprocs, format='pdf')
#         except Exception as e:
#             print 'Error occured when conducting GSEA for up-regulated genes in %s!' % geo_id
#             print e
        try:
            if (not os.path.exists(os.path.join(out_dir,geo_id+'up')) or (os.path.exists(os.path.join(out_dir,geo_id+'up')) and len(fs.read_file(os.path.join(sgndb_path, '%s_up.gmt'%geo_id))) > len(fs.listf(os.path.join(out_dir,geo_id+'up'), pattern='.*\.gsea\.pdf')))):
                print 'doing '+geo_id+'_up'
                gs_res = gp.gsea(data=join_df, gene_sets=os.path.join(sgndb_path, '%s_up.gmt'%geo_id), cls=class_vec, permutation_type=permt_type, permutation_num=permt_num, min_size=min_size, max_size=max_size, outdir=os.path.join(out_dir,geo_id+'up'), method=method, processes=numprocs, format='pdf')
        except Exception as e:
            print 'Error occured when conducting GSEA for up-regulated genes in %s!' % geo_id
            print e
        try:
            if (not os.path.exists(os.path.join(out_dir,geo_id+'down')) or (os.path.exists(os.path.join(out_dir,geo_id+'down')) and len(fs.read_file(os.path.join(sgndb_path, '%s_down.gmt'%geo_id))) > len(fs.listf(os.path.join(out_dir,geo_id+'down'), pattern='.*\.gsea\.pdf')))):
                print 'doing '+geo_id+'_down'
                gs_res = gp.gsea(data=join_df, gene_sets=os.path.join(sgndb_path, '%s_down.gmt'%geo_id), cls=class_vec, permutation_type=permt_type, permutation_num=permt_num, min_size=min_size, max_size=max_size, outdir=os.path.join(out_dir,geo_id+'down'), method=method, processes=numprocs, format='pdf')
        except Exception as e:
            print 'Error occured when conducting GSEA for down-regulated genes in %s!' % geo_id
            print e
        del join_df
    

udrg_sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s_udrg.xlsx'%RUN_LABEL.replace(' ', '_')), index_col='id').dropna()
# udrg_sgn_df = udrg_sgn_df[udrg_sgn_df['geo_id'] == 'GSE10809']
method, permt_type, permt_num, keep_unkown_probe = 'signal_to_noise', 'phenotype', 100, False
out_dir = os.path.join('gsea', method, _RUN_LABEL)
# probe_gene_map = io.read_obj('probe_gene_map.pkl')

numprocs = psutil.cpu_count()
groups = udrg_sgn_df.groupby('geo_id').groups.items()
task_bnd = njobs.split_1d(len(groups), split_num=numprocs, ret_idx=True)
gsea(groups, udrg_sgn_df=udrg_sgn_df, probe_gene_map=probe_gene_map, sgndb_path=SGNDB_PATH, sample_path=SAMP_DIR, method=method, permt_type=permt_type, permt_num=permt_num, min_size=SGN_MIN_SIZE, max_size=SGN_MAX_SIZE, out_dir=out_dir, keep_unkown_probe=keep_unkown_probe, numprocs=numprocs)
# _ = njobs.run_pool(gsea, n_jobs=numprocs, dist_param=['groups'], groups=[groups[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], udrg_sgn_df=udrg_sgn_df, probe_gene_map=probe_gene_map, sgndb_path=SGNDB_PATH, sample_path=SAMP_DIR, method=method, permt_type=permt_type, permt_num=permt_num, min_size=SGN_MIN_SIZE, max_size=SGN_MAX_SIZE, out_dir=out_dir, keep_unkown_probe=keep_unkown_probe, numprocs=1)

Combine the Results


In [ ]:
udrg_sgn_df = pd.read_excel(os.path.join(DATA_PATH, '%s_udrg.xlsx'%RUN_LABEL.replace(' ', '_')), index_col='id')
method = 'signal_to_noise'
out_dir = os.path.join('gsea', method, _RUN_LABEL)
up_reports, down_reports = [], []

for geo_id, sgn_ids in udrg_sgn_df.groupby('geo_id').groups.items():
    uprep_fpath, downrep_fpath = os.path.join(out_dir, geo_id+'up', 'gseapy.gsea.phenotype.report.csv'), os.path.join(out_dir, geo_id+'down', 'gseapy.gsea.phenotype.report.csv')
    if (os.path.exists(uprep_fpath)):
        up_reports.append(pd.read_csv(uprep_fpath).set_index('Term')[['es','pval','fdr']].rename(columns={'es':'up_es','pval':'up_pval','fdr':'up_fdr'}))
    if (os.path.exists(downrep_fpath)):
        down_reports.append(pd.read_csv(downrep_fpath).set_index('Term')[['es','pval','fdr']].rename(columns={'es':'down_es','pval':'down_pval','fdr':'down_fdr'}))

up_gsea_report = pd.concat(up_reports, axis=0)
down_gsea_report = pd.concat(down_reports, axis=0)
gsea_sgn_df = pd.concat([udrg_sgn_df, up_gsea_report, down_gsea_report], axis=1, join_axes=[udrg_sgn_df.index])
io.write_df(gsea_sgn_df, '%s_udrg_gsea'%RUN_LABEL.replace(' ', '_'), with_idx=True)
gsea_sgn_df.to_excel('%s_udrg_gsea.xlsx'%RUN_LABEL.replace(' ', '_'), encoding='utf8')

Read the Gene Sets from WikiPathways


In [ ]:
wkpw_species, wkpw_annots, wkpw_genes = [[] for x in range(3)]
for fpath in fs.listf(WIKIPATHWAYS_PATH, pattern='.*\.gmt', full_path=True):
    lines = [l.strip('\n').split('\t') for l in fs.read_file(fpath)]
    annots, genes = zip(*[(l[:2], l[2:]) for l in lines])
    annots, genes = list(annots), list(genes)
    wkpw_species.append(os.path.splitext(os.path.basename(fpath))[0].lower().replace(' ', '_')), wkpw_annots.append(list(annots)), wkpw_genes.append(list(genes))

In [ ]:
from bionlp.spider import geoxml as geo
from cStringIO import StringIO

pred_species, pred_gses, pred_gpls, pred_annots, pred_genes = [[] for x in range(5)]
for fpath in fs.listf(SGNDB_PATH, pattern='.*\.gmt', full_path=True):
    lines = [l.strip('\n').split('\t') for l in fs.read_file(fpath)]
    annots, genes = zip(*[(l[:2], l[2:]) for l in lines])
    annots, genes = list(annots), list(genes)
    species = annots[0][1].split(':')[0].lower().replace(' ', '_')
    gse_id = os.path.splitext(os.path.basename(fpath))[0].split('_')[0].lower().replace(' ', '_')
    gse_doc = geo.parse_geo(os.path.join(GSE_DIR, '%s.xml'%gse_id.upper()), type='gse')
    pred_species.append(species), pred_gses.append(gse_id), pred_gpls.append(geo.parse_geo(os.path.join(SAMP_DIR, '%s.xml'%gse_doc['samples'][0]), type='gsm')['platform']), pred_annots.append(list(annots)), pred_genes.append(list(genes))

In [ ]:
def gs_ol(species, gses, gpls, genes, ref_species, ref_genes):
    try:
        pgmap = io.read_obj(os.path.join(PLATFORM_PATH, 'probe_gene_map.pkl'))
    except Exception as e:
        pgmap = None
    for species, gse, gpl, gss in zip(species, gses, gpls, genes):
        has_pgmap = pgmap is not None and pgmap.has_key(gpl)
        try:
            spcs_idx = ref_species.index(species)
        except ValueError as e:
            print e
            continue
        for ref_gs in ref_genes[spcs_idx]:
            for gs in gss:
                if (len(gs) == 0 or not gs[0]): continue
                if (has_pgmap):
                    gs = func.flatten_list(map(lambda x: pgmap[gpl].loc[x].split(SC) if x and x in pgmap[gpl].index else x, gs))
                    gs = [x if x.strip() != '///' else 0 for x in gs]
                    gs = [x for x in gs if float(x) != 0]
                gs_sim = difflib.SequenceMatcher(None, gs, ref_gs).ratio()
                if (gs_sim > 0.2): print 'Found %f%% similar gene set with size %i in series %s' % (gs_sim, len(gs), gse)
                    
numprocs = psutil.cpu_count()
task_bnd = njobs.split_1d(len(pred_gses), split_num=numprocs, ret_idx=True)
# gs_ol(pred_species, pred_gses, pred_gpls, pred_genes, ref_species=wkpw_species, ref_genes=wkpw_genes)
_ = njobs.run_pool(gs_ol, n_jobs=numprocs, dist_param=['species', 'gses', 'gpls', 'genes'], species=[pred_species[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], gses=[pred_gses[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], gpls=[pred_gpls[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], genes=[pred_genes[task_bnd[i]:task_bnd[i+1]] for i in range(numprocs)], ref_species=wkpw_species, ref_genes=wkpw_genes)

Read Constructed Data (DEPRECATED)


In [ ]:
GSE_ID = 'GSE48301'
dge_X = io.read_df(os.path.join(DGE_DATA_PATH, 'dge_X_%s.npz'%GSE_ID), with_idx=True)
dge_Y = io.read_df(os.path.join(DGE_DATA_PATH, 'dge_Y_%s.npz'%GSE_ID), with_idx=True, sparse_fmt='csr')

In [ ]:
sub_dge_dfs=[io.read_df(os.path.join(DGE_PATH, fname), with_idx=True) for fname in ['dge_%s.npz'%x for x in range(71, 107)]]

In [ ]:
set(sub_dge_dfs[0].index) & set(sub_dge_dfs[5].index)

In [ ]:
def gen_mdls(tuned=False, glb_clfnames=[], **kwargs):
    clf_names = []
    for clf_name, clf in [
        ('RandomForest', Pipeline([('clf', func.build_model(RandomForestClassifier, 'Classifier', 'Random Forest', mltl=True, mltp=True, n_jobs=1, random_state=0))]))
    ]:
        yield clf_name, clf
        clf_names.append(clf_name)
    if (len(glb_clfnames) < len(clf_names)):
        del glb_clfnames[:]
        glb_clfnames.extend(clf_names)

txtclf.cross_validate(dge_X, dge_Y, gen_mdls, model_param=dict(tuned=False, glb_filtnames=[], glb_clfnames=[]), avg='micro', kfold=3, global_param=dict(comb=True, pl_names=[], pl_set=set([])), lbid=-1)