In [3]:
import os
import numpy as np
import pandas as pd
from bionlp import nlp
from bionlp.util import io, func

# Folder containing both the reference CSV tables and the .npz frames to annotate.
DATA_PATH = '../../data/gesgnext/demo'
# Separator used to glue the values of several key columns into one lookup key.
SC = ';;'

# Alternative input file names, kept for reference:
# ref_files = ['disease_signatures-v1.0.csv', 'single_drug_perturbations-v1.0.csv', 'single_gene_perturbations-v1.0.csv']
# mdf_files = ['post_sgn_0.npz', 'post_sgn_1.npz', 'pre_sgn_2.npz']
# ref_files[k] is the reference table whose columns are copied onto mdf_files[k].
ref_files = ['disease_signature.csv', 'drug_perturbation.csv', 'gene_perturbation.csv']
mdf_files = ['pre_sgn_0.npz', 'pre_sgn_1.npz', 'pre_sgn_2.npz']

# col_names = [('disease_name', 'do_id'), ('drug_name', 'drugbank_id'), ('hs_gene_symbol', 'mm_gene_symbol')]
# col_names[k] lists the reference columns copied for the k-th file pair.
col_names = [('disease_name', 'do_id', 'cell_type'), ('drug_name', 'drugbank_id', 'cell_type'), ('hs_gene_symbol', 'mm_gene_symbol','cell_type')]
# Key-column groups, tried in order: a row already matched by an earlier group
# is never re-matched by a later one.
idx_cols = [('ctrl_ids', 'pert_ids'), ('geo_id',)]

for ref_f, mdf_f, cols in zip(ref_files, mdf_files, col_names):
    ref_df = pd.read_csv(os.path.join(DATA_PATH, ref_f))
    # NOTE(review): io.read_df comes from bionlp.util; presumably it returns a
    # DataFrame restored from the .npz file together with its index -- confirm.
    mdf_df = io.read_df(os.path.join(DATA_PATH, mdf_f), with_idx=True)
    mdf_df.rename(columns={'geo_ids':'geo_id'}, inplace=True)

    # idx[r] becomes the ref_df index label matched to row r of mdf_df (None = no match yet).
    idx = [None] * mdf_df.shape[0]
    for idx_col in idx_cols:
        # Composite key -> ref_df index label. When a key occurs more than once in
        # ref_df, the last occurrence wins because later dict entries overwrite
        # earlier ones. str.join requires every key value to be a string.
        ref_key = [SC.join(keys) for keys in zip(*[ref_df[x] for x in idx_col])]
        key_to_row = dict(zip(ref_key, ref_df.index))
        mdf_key = [SC.join(keys) for keys in zip(*[mdf_df[x] for x in idx_col])]
        # `in` replaces the Python-2-only dict.has_key; rows that are already
        # matched keep their current value.
        idx = [key_to_row[k] if (cur is None and k in key_to_row) else cur for cur, k in zip(idx, mdf_key)]

    # NOTE(review): rows that matched no key group still hold None in idx; how
    # ref_df[col][idx] treats those entries depends on the pandas version -- confirm.
    columns = [ref_df[col][idx].tolist() for col in cols]
    # Both frames carry mdf_df.index, so the column-wise concat keeps that index
    # without join_axes (a keyword that pandas 1.0 removed).
    new_df = pd.concat([mdf_df, pd.DataFrame(data=dict(zip(cols, columns)), index=mdf_df.index)], axis=1, copy=False)

    # Drop the rows whose first copied reference column is an empty string.
    cln_sgn_df = new_df.drop(new_df.index[np.where(new_df[cols[0]] == '')[0]], axis=0)
    # Create cell type column
#     cln_sgn_df['ANAT'] = [' '.join([mdf, x]) if x.startswith('cell') else x for mdf, x in zip(map(nlp.clean_txt, cln_sgn_df['mdf_ANAT'].fillna('')), map(nlp.clean_txt, cln_sgn_df['ANAT'].fillna('')))]
#     cln_sgn_df.rename(columns={'ANAT': 'cell_type'}, inplace=True)
#     cln_sgn_df.drop('mdf_ANAT', axis=1, inplace=True)
    # Delete other useless columns: those where fewer than half of the remaining
    # rows hold something other than an empty string.
    threshold = 0.5 * cln_sgn_df.shape[0]
    del_cols = [col for col in cln_sgn_df.columns if np.where(cln_sgn_df[col] != '')[0].shape[0] < threshold]
    cln_sgn_df.drop(del_cols, axis=1, inplace=True)

    # Write the cleaned frame next to its input as new_<input stem>.npz and .csv.
    fname = os.path.splitext(os.path.basename(mdf_f))[0]
    io.write_df(cln_sgn_df, os.path.join(DATA_PATH, 'new_%s.npz' % fname), with_idx=True)
    cln_sgn_df.to_csv(os.path.join(DATA_PATH, 'new_%s.csv' % fname), encoding='utf8')
#     cln_sgn_df.to_excel(os.path.join(DATA_PATH, 'new_%s.xlsx' % fname), encoding='utf8')

In [22]:
# Find rows (by id) present in only one of the two versions of each signature file

import os
import pandas as pd

# Folder holding both versions (CSV and Excel) of each signature table.
DATA_PATH = '../../data/gesgnext'

# (CSV file, Excel file) pairs whose 'id' columns are compared.
compared_files = [('disease_signature.csv', 'disease_signature.xlsx'), ('drug_perturbation.csv', 'drug_perturbation.xlsx'), ('gene_perturbation.csv', 'gene_perturbation.xlsx')]
for fpair in compared_files:
    df0 = pd.read_csv(os.path.join(DATA_PATH, fpair[0]))
    df1 = pd.read_excel(os.path.join(DATA_PATH, fpair[1]))
    # Build each id set once; both difference directions reuse them.
    ids0 = set(df0['id'])
    ids1 = set(df1['id'])
    # First list: ids only in the Excel file; second list: ids only in the CSV file.
    # A parenthesised single argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('Difference between %s and %s is: %s; %s' % (fpair[0], fpair[1], sorted(ids1 - ids0), sorted(ids0 - ids1)))


Difference between disease_signature.csv and disease_signature.xlsx is: [u'dz:647', u'dz:648', u'dz:649']; []
Difference between drug_perturbation.csv and drug_perturbation.xlsx is: []; []
Difference between gene_perturbation.csv and gene_perturbation.xlsx is: [u'gene:1343', u'gene:1344', u'gene:2098', u'gene:2099', u'gene:2100', u'gene:4206', u'gene:4207', u'gene:4211', u'gene:4212', u'gene:4216', u'gene:4217', u'gene:4328', u'gene:4329', u'gene:4330', u'gene:482', u'gene:483', u'gene:511', u'gene:512', u'gene:513', u'gene:608', u'gene:611', u'gene:612', u'gene:613', u'gene:614', u'gene:615', u'gene:640', u'gene:641', u'gene:642']; []

In [27]:
# Add signature indices

import os
import pandas as pd

# Folder holding the signature tables that receive an 'id' column.
DATA_PATH = '../../data/gesgnext/demo'

# (table file name, id prefix) pairs; ids are written as '<prefix>:<row position>'.
files_ids = [
    ('disease_signature.csv', 'dz'),
    ('drug_perturbation.csv', 'drug'),
    ('gene_perturbation.csv', 'gene'),
]
for fname, idlb in files_ids:
    src_path = os.path.join(DATA_PATH, fname)
    # The output goes next to the input, with 'mdf_' prepended to the file name.
    dst_path = os.path.join(DATA_PATH, 'mdf_'+fname)
    df = pd.read_csv(src_path)
    n_rows = df.shape[0]
    # Number the rows 0..n_rows-1 in file order and prefix each number with the table label.
    df['id'] = ['%s:%i' % (idlb, pos) for pos in range(n_rows)]
    # NOTE(review): index=None is presumably treated like index=False
    # (no row-index column in the output) -- confirm.
    df.to_csv(dst_path, index=None)