In [ ]:
import os
import numpy as np
import pandas as pd
from bionlp import nlp
from bionlp.util import io, func
# Directory that holds both the reference signature tables (CSV) and the
# modified signature tables (NPZ) processed by this cell.
DATA_PATH = '../../data/gesgnext/demo'
# Separator used to fuse several key columns into one composite lookup key.
SC = ';;'
# Alternative input file names, kept for reference:
# ref_files = ['disease_signatures-v1.0.csv', 'single_drug_perturbations-v1.0.csv', 'single_gene_perturbations-v1.0.csv']
# mdf_files = ['post_sgn_0.npz', 'post_sgn_1.npz', 'pre_sgn_2.npz']
ref_files = ['disease_signature.csv', 'drug_perturbation.csv', 'gene_perturbation.csv']
mdf_files = ['pre_sgn_0.npz', 'pre_sgn_1.npz', 'pre_sgn_2.npz']
# Reference columns copied into each modified table (one tuple per file pair).
# col_names = [('disease_name', 'do_id'), ('drug_name', 'drugbank_id'), ('hs_gene_symbol', 'mm_gene_symbol')]
col_names = [('disease_name', 'do_id', 'cell_type'), ('drug_name', 'drugbank_id', 'cell_type'), ('hs_gene_symbol', 'mm_gene_symbol','cell_type')]
# Key column groups used to match rows, tried in order: a row that is already
# matched by an earlier group is never re-matched by a later one.
idx_cols = [('ctrl_ids', 'pert_ids'), ('geo_id',)]

for ref_f, mdf_f, cols in zip(ref_files, mdf_files, col_names):
    ref_df = pd.read_csv(os.path.join(DATA_PATH, ref_f))
    # io.read_df is a bionlp helper; presumably it restores a DataFrame saved
    # by io.write_df -- confirm against bionlp.util.io.
    mdf_df = io.read_df(os.path.join(DATA_PATH, mdf_f), with_idx=True)
    mdf_df.rename(columns={'geo_ids':'geo_id'}, inplace=True)

    # idx_map[i] maps the composite key built from idx_cols[i] to a ref_df index label.
    idx_map = [{} for _ in idx_cols]
    # idx[k] is the ref_df index label matched to row k of mdf_df (None = no match yet).
    idx = [None] * mdf_df.shape[0]
    for i, idx_col in enumerate(idx_cols):
        ref_key = [SC.join(keys) for keys in zip(*[ref_df[x] for x in idx_col])]
        # On duplicated keys the last reference row wins (dict semantics).
        idx_map[i].update(dict(zip(ref_key, ref_df.index)))
        mdf_key = [SC.join(keys) for keys in zip(*[mdf_df[x] for x in idx_col])]
        # dict.get replaces the Python-2-only dict.has_key test: an unknown key
        # yields None, which leaves the row available for the next key group.
        idx = [idx_map[i].get(key) if matched is None else matched for matched, key in zip(idx, mdf_key)]

    # reindex() returns NaN for rows without a match (None label); list-based
    # label indexing with missing labels raises KeyError on pandas >= 1.0.
    columns = [ref_df[col].reindex(idx).tolist() for col in cols]
    # Both frames share mdf_df.index, so the removed `join_axes` argument
    # (dropped in pandas 1.0) is not needed to keep the rows aligned.
    new_df = pd.concat([mdf_df, pd.DataFrame(data=dict(zip(cols, columns)), index=mdf_df.index)], axis=1, copy=False)
    # Drop rows whose first reference column is the empty string.
    # NOTE(review): unmatched rows carry NaN rather than '' in the appended
    # columns, so this filter does not remove them -- confirm that is intended.
    cln_sgn_df = new_df.drop(new_df.index[np.where(new_df[cols[0]] == '')[0]], axis=0)
    # Create cell type column
    # cln_sgn_df['ANAT'] = [' '.join([mdf, x]) if x.startswith('cell') else x for mdf, x in zip(map(nlp.clean_txt, cln_sgn_df['mdf_ANAT'].fillna('')), map(nlp.clean_txt, cln_sgn_df['ANAT'].fillna('')))]
    # cln_sgn_df.rename(columns={'ANAT': 'cell_type'}, inplace=True)
    # cln_sgn_df.drop('mdf_ANAT', axis=1, inplace=True)
    # Delete other useless columns: keep a column only if at least half of
    # the remaining rows hold something other than the empty string.
    threshold = 0.5 * cln_sgn_df.shape[0]
    del_cols = [col for col in cln_sgn_df.columns if np.where(cln_sgn_df[col] != '')[0].shape[0] < threshold]
    cln_sgn_df.drop(del_cols, axis=1, inplace=True)

    # Write the cleaned table next to the inputs as new_<name>.npz and new_<name>.csv.
    fname = os.path.splitext(os.path.basename(mdf_f))[0]
    io.write_df(cln_sgn_df, os.path.join(DATA_PATH, 'new_%s.npz' % fname), with_idx=True)
    cln_sgn_df.to_csv(os.path.join(DATA_PATH, 'new_%s.csv' % fname), encoding='utf8')
    # cln_sgn_df.to_excel(os.path.join(DATA_PATH, 'new_%s.xlsx' % fname), encoding='utf8')
In [ ]:
# Find the rows that differ between the two versions of each signature table
import os
import pandas as pd
# Directory holding both the CSV and the Excel version of each signature table.
DATA_PATH = '../../data/gesgnext'
# (csv file, xlsx file) pairs whose 'id' columns are compared.
compared_files = [('disease_signature.csv', 'disease_signature.xlsx'), ('drug_perturbation.csv', 'drug_perturbation.xlsx'), ('gene_perturbation.csv', 'gene_perturbation.xlsx')]
for fpair in compared_files:
    df0 = pd.read_csv(os.path.join(DATA_PATH, fpair[0]))
    df1 = pd.read_excel(os.path.join(DATA_PATH, fpair[1]))
    # Build each id set once and reuse it for both directions of the difference.
    ids0, ids1 = set(df0['id']), set(df1['id'])
    # First list: ids only in the xlsx file; second list: ids only in the csv file.
    # A single parenthesised argument keeps this line valid on Python 2 and 3.
    print('Difference between %s and %s is: %s; %s' % (fpair[0], fpair[1], sorted(ids1 - ids0), sorted(ids0 - ids1)))
In [ ]:
# Add signature indices
import os
import pandas as pd
# Directory containing the signature tables to be tagged with an 'id' column.
DATA_PATH = '../../data/gesgnext/demo'
# (file name, id prefix) pairs: every row gets the id '<prefix>:<row position>'.
files_ids = [('disease_signature.csv', 'dz'), ('drug_perturbation.csv', 'drug'), ('gene_perturbation.csv', 'gene')]
for fname, idlb in files_ids:
    src_path = os.path.join(DATA_PATH, fname)
    # The output is written beside the input, with 'mdf_' prepended to its name.
    dst_path = os.path.join(DATA_PATH, 'mdf_'+fname)
    df = pd.read_csv(src_path)
    # Ids are numbered by row position, starting from 0.
    row_count = df.shape[0]
    df['id'] = ['%s:%i' % (idlb, pos) for pos in range(row_count)]
    # index=None is falsy, so the DataFrame index is not written to the file.
    df.to_csv(dst_path, index=None)