In [5]:
import numpy as np
import pandas as pd
import glob
import pickle
from scipy.stats import rankdata, norm
import os
In [2]:
# Patients with WGS data
pats_15_ncm=set()
pats_12_ncm=set()
for line in open('../autoAnno_GeneHancer/TCGA2pat.txt').read().rstrip().split('\n'):
a=line.split('\t')
if a[-1]!='0':
pats_15_ncm.add(a[0][:15])
pats_12_ncm.add(a[0][:12])
In [3]:
hugo = set()
entrez2hugo = {}
with open('HUGO2entrez.txt') as f:
for line in f.read().rstrip().splitlines()[1:]:
row = line.split('\t')
hugo.add(row[0])
if len(row)==2 and row[1]:
entrez2hugo[int(row[1])] = row[0]
In [4]:
# Parse gene expression
fns = glob.glob('/cellar/data/users/wzhang1984/cBioPortal/*/tcga/data_RNA_Seq_v2_expression_median.txt')
dfs = []
pat2disease_dfs = []
for fn in fns:
disease = fn.split('/')[-3]
df = pd.read_table(fn, index_col=0)
entrez_list = df['Entrez_Gene_Id']
df.columns = df.columns.str[:15]
df = df.loc[:,df.columns.isin(pats_15_ncm)]
npats = len(df.columns)
if npats > 1:
pat2disease_df = pd.DataFrame(np.ones((1,npats)), index=[disease], columns=df.columns)
pat2disease_dfs.append(pat2disease_df)
print(disease, npats)
new_index = []
for i in range(len(df.index)):
hugo_i = df.index[i]
entrez_i = entrez_list[i]
if entrez_i in entrez2hugo:
new_index.append(entrez2hugo[entrez_i])
elif hugo_i in hugo:
new_index.append(hugo_i)
else:
new_index.append('null')
df.index = new_index
dfs.append(df)
df_exp = pd.concat(dfs, axis=1)
df_exp = df_exp.groupby(df_exp.index).median()
df_exp.drop('null', axis=0, inplace=True)
pat2disease_df = pd.concat(pat2disease_dfs, axis=1)
pat2disease_df.fillna(0, inplace=True)
In [6]:
# Parse clinical to get patients' gender
fns = glob.glob('/cellar/data/users/wzhang1984/cBioPortal/*/tcga/data_bcr_clinical_data_patient.txt')
dfs = []
for fn in fns:
disease = fn.split('/')[-3]
df = pd.read_table(fn, index_col=1, skiprows=4)
df.index = df.index.str[:12]
df = df.loc[df.index.isin(pats_12_ncm),:]
npats = len(df.index)
if npats > 0:
dfs.append(df)
df_clin = pd.concat(dfs, axis=0)
In [7]:
# Create a file conatining global covariates (cancer types, race, gender)
pat2disease_df.columns = pat2disease_df.columns.str[:12]
pat2covarites_df = pd.concat([pat2disease_df.iloc[:-1,:], pd.DataFrame(df_clin.loc[:,'GENDER']).T])
pat2covarites_df = pat2covarites_df.loc[:,pat2covarites_df.columns.isin(pat2disease_df.columns)]
pat2covarites_df.loc['GENDER',pat2covarites_df.loc['GENDER',:]=='FEMALE'] = 1
pat2covarites_df.loc['GENDER',pat2covarites_df.loc['GENDER',:]=='MALE'] = 0
pat2covarites_df.fillna(0.5, inplace=True)
pat2covarites_df = pat2covarites_df.reindex_axis(sorted(pat2covarites_df.columns), axis=1)
df_race = pd.DataFrame(df_clin.loc[df_clin.index.isin(pat2disease_df.columns), 'RACE'])
df_race = df_race.loc[pat2covarites_df.columns,:]
df_race['BLACK'] = (df_race.RACE=='BLACK OR AFRICAN AMERICAN').astype(int)
df_race['ASIAN'] = (df_race.RACE=='ASIAN').astype(int)
df_race.drop('RACE', axis=1, inplace=True)
pat2covarites_df = pd.concat([pat2covarites_df, df_race.T])
pat2covarites_df = pat2covarites_df.reindex_axis(sorted(pat2covarites_df.columns), axis=1)
pat2covarites_df.to_csv('covariates.txt', sep='\t')
In [8]:
# Sort index and columns
pats_15 = sorted(df_exp.columns)
df_exp.columns = df_exp.columns.str[:12]
df_exp = df_exp.reindex_axis(sorted(df_exp.index), axis=0)
df_exp = df_exp.reindex_axis(sorted(df_exp.columns), axis=1)
df_exp.to_csv('pat2exp.txt', sep='\t')
# Filter the expression data
df_exp = df_exp.loc[df_exp.T.median()>1,:]
df_exp_log2 = np.log2(df_exp.clip(lower=1./8))
df_exp_log2.to_csv('pat2exp_log2.txt', sep='\t')
df_exp_log2_zscore = ((df_exp_log2.T - df_exp_log2.T.mean())/df_exp_log2.T.std()).T
In [9]:
def rank_based_inv_norm(x, c=3./8):
return norm.ppf((rankdata(x) - c) / (x.size - 2*c + 1))
In [11]:
with open('whitelist.txt', 'w') as f:
f.write('\n'.join(df_exp.index))
In [12]:
# Parse CNA
fns = glob.glob('/cellar/data/users/wzhang1984/cBioPortal/*/tcga/data_linear_CNA.txt')
dfs = []
for fn in fns:
disease = fn.split('/')[-3]
df = pd.read_table(fn, index_col=0)
df.columns = df.columns.str[:15]
df = df.loc[:,df.columns.isin(pats_15_ncm)]
npats = len(df.columns)
if npats > 0:
print(disease, npats)
dfs.append(df)
df_cna = pd.concat(dfs, axis=1)
In [13]:
# Finalize CNA features
df_cna.columns = df_cna.columns.str[:12]
# *Check the number of overlapped patients
print len(df_exp.columns & df_cna.columns)
df_cna = df_cna.loc[df_exp.index,df_exp.columns]
df_cna.fillna(0, inplace=True)
In [ ]:
In [14]:
# Parse methylation from probes
tcga_solid_02 = np.load('/cellar/data/users/wzhang1984/TCGA_methylation/tcga_solid_0.2.npy')
tcga_solid_02_index = pd.read_pickle('/cellar/data/users/wzhang1984/TCGA_methylation/tcga_solid_0.2.index.pickle')
with open('/cellar/data/users/wzhang1984/TCGA_methylation/tcga_solid_0.2.row.txt') as f:
tcga_solid_02_row = f.read().rstrip().splitlines()
with open('/cellar/data/users/wzhang1984/TCGA_methylation/tcga_solid_0.2.column.txt') as f:
tcga_solid_02_column = f.read().rstrip().splitlines()
df_me = pd.DataFrame(tcga_solid_02, index=tcga_solid_02_column, columns=tcga_solid_02_row)
df_me.columns = df_me.columns.str[:15]
df_me = df_me.loc[:,df_me.columns.isin(pats_15_ncm)]
In [15]:
# Load methylation probes-promoter mapping
me2gene = pd.read_table('/cellar/data/users/wzhang1984/TCGA_methylation/methyl_probes_ann.txt', index_col=0)
me2gene_dict = me2gene.loc[me2gene.loc[:,'Distance to TSS'].abs()<=1000,'Gene Name'].to_dict()
In [16]:
# Map methylation to promoters
# Take mean values if there is a multiple match
df_me = df_me.loc[df_me.index.isin(me2gene_dict.keys()),:]
df_me.index = df_me.index.to_series().map(me2gene_dict)
df_me = df_me.groupby(df_me.index).mean()
df_me = df_me.groupby(df_me.columns, axis=1).mean()
In [17]:
df_me.columns = df_me.columns.str[:12]
pat2disease_df.loc[:,set(df_exp.columns) - set(df_me.columns)].T.sum()
Out[17]:
In [18]:
# Finalize methylation features
df_me.columns = df_me.columns.str[:12]
# *Check the number of overlapped patients
print len(df_exp.columns & df_me.columns)
df_me = df_me.loc[df_exp.index,:]
df_me.fillna(0, inplace=True)
df_me = df_me.loc[df_exp.index,df_exp.columns]
df_me = df_me.T.fillna(df_me.T.mean()).T
In [27]:
gene2loci = {}
loci2pats = {}
loci2concentrate = {}
for line in open('../autoAnno_GeneHancer/TCGA_snv_mnv_merged_-1_anno_promoter_genehancer.txt'):
row = line.split('\n')[0].split('\t')
loci = '{}:{}-{}'.format(row[0], row[1], row[2])
pats = set()
mut2pats = {}
for mut in row[4].split(','):
[pat, Chr_StartEnd, source, alt] = mut.split("__")
pat = pat[:12]
pats.add(pat)
if Chr_StartEnd not in mut2pats:
mut2pats[Chr_StartEnd] = set()
mut2pats[Chr_StartEnd].add(pat)
loci2pats[loci] = pats
loci2concentrate[loci] = 0
for mut in mut2pats:
loci2concentrate[loci] = max(loci2concentrate[loci], len(mut2pats[mut])/float(len(pats)))
for g in row[-1].split(','):
[gene, tp] = g.split('|')
if gene not in gene2loci:
gene2loci[gene] = set()
gene2loci[gene].add(loci)
In [28]:
line_factors = ''
with open('PEER/log2_factors_20.txt') as f:
for line in f.read().splitlines():
row = line.split('\t')
if row[0] == 'mean':
continue
# try:
# if row[0][0] == 'h' and int(row[0][1:])>20:
# continue
# except:
# pass
line_factors += line + '\n'
In [29]:
os.system('mkdir -p data4lm_peer lm_coef_p lm_model_p log')
os.system('rm data4lm_peer/* lm_coef_p/* lm_model_p/* log/*')
Out[29]:
In [ ]:
cut_npats = 5
line_index2gene = ''
i = 0
line_header = '\t{}\n'.format('\t'.join(df_exp.columns))
pats_set = set(df_exp.columns)
for gene, row in df_exp.iterrows():
if gene in gene2loci:
line_loci = ''
for locus in gene2loci[gene]:
if len(set(loci2pats[locus])&pats_set) < cut_npats:
continue
if loci2concentrate[locus]<0.35:
continue
line_loci += locus
for pat in df_exp.columns:
if pat in loci2pats[locus]:
line_loci += '\t1'
else:
line_loci += '\t0'
line_loci += '\n'
if line_loci:
i += 1
print(i, gene)
line_index2gene += '{}\t{}\n'.format(i, gene)
line_out = line_header + line_loci
line_out += 'CNA\t{}\n'.format('\t'.join(df_cna.loc[gene,:].astype(str)))
line_out += 'methylation\t{}\n'.format('\t'.join(df_me.loc[gene,:].astype(str)))
line_out += line_factors
line_out += 'exp\t{}\n'.format('\t'.join(df_exp_log2_zscore.loc[gene,:].astype(str)))
with open('archive/merge_-1_log2_PEER_20/data4lm_peer/{}.txt'.format(i), 'w') as f:
f.write(line_out)
with open('archive/merge_-1_log2_PEER_20/index2gene4lm.txt', 'w') as f:
f.write(line_index2gene)
In [ ]:
In [ ]:
In [ ]:
In [ ]: