In [5]:
import numpy as np
import pandas as pd
import glob
import pickle
from scipy.stats import rankdata, norm
import os

Parse expression and clinical data


In [2]:
# Patients with WGS data
pats_15_ncm=set()
pats_12_ncm=set()
for line in open('../autoAnno_GeneHancer/TCGA2pat.txt').read().rstrip().split('\n'):
    a=line.split('\t')
    if a[-1]!='0':
        pats_15_ncm.add(a[0][:15])
        pats_12_ncm.add(a[0][:12])

In [3]:
hugo = set()
entrez2hugo = {}
with open('HUGO2entrez.txt') as f:
    for line in f.read().rstrip().splitlines()[1:]:
        row = line.split('\t')
        hugo.add(row[0])
        if len(row)==2 and row[1]:
            entrez2hugo[int(row[1])] = row[0]

In [4]:
# Parse gene expression
fns = glob.glob('/cellar/data/users/wzhang1984/cBioPortal/*/tcga/data_RNA_Seq_v2_expression_median.txt')
dfs = []
pat2disease_dfs = []
for fn in fns:
    disease = fn.split('/')[-3]
    df = pd.read_table(fn, index_col=0)
    entrez_list = df['Entrez_Gene_Id']
    df.columns = df.columns.str[:15]
    df = df.loc[:,df.columns.isin(pats_15_ncm)]
    npats = len(df.columns)
    if npats > 1:
        pat2disease_df = pd.DataFrame(np.ones((1,npats)), index=[disease], columns=df.columns)
        pat2disease_dfs.append(pat2disease_df)
        print(disease, npats)
        new_index = []
        for i in range(len(df.index)):
            hugo_i = df.index[i]
            entrez_i = entrez_list[i]
            if entrez_i in entrez2hugo:
                new_index.append(entrez2hugo[entrez_i])
            elif hugo_i in hugo:
                new_index.append(hugo_i)
            else:
                new_index.append('null')
        df.index = new_index
        dfs.append(df)
df_exp = pd.concat(dfs, axis=1)
df_exp = df_exp.groupby(df_exp.index).median()
df_exp.drop('null', axis=0, inplace=True)
pat2disease_df = pd.concat(pat2disease_dfs, axis=1)
pat2disease_df.fillna(0, inplace=True)


('brca', 99)
('prad', 20)
('coadread', 27)
('blca', 23)
('hnsc', 50)
('sarc', 34)
('lihc', 52)
('kich', 49)
('skcm', 38)
('gbm', 34)
('laml', 27)
('luad', 50)
('kirc', 41)
('cesc', 20)
('stad', 35)
('kirp', 36)
('thca', 48)
('lusc', 50)
('dlbc', 7)
('ov', 24)
('lgg', 19)

In [6]:
# Parse clinical to get patients' gender
fns = glob.glob('/cellar/data/users/wzhang1984/cBioPortal/*/tcga/data_bcr_clinical_data_patient.txt')
dfs = []
for fn in fns:
    disease = fn.split('/')[-3]
    df = pd.read_table(fn, index_col=1, skiprows=4)
    df.index = df.index.str[:12]
    df = df.loc[df.index.isin(pats_12_ncm),:]
    npats = len(df.index)
    if npats > 0:
        dfs.append(df)
df_clin = pd.concat(dfs, axis=0)

In [7]:
# Create a file conatining global covariates (cancer types, race, gender)
pat2disease_df.columns = pat2disease_df.columns.str[:12]
pat2covarites_df = pd.concat([pat2disease_df.iloc[:-1,:], pd.DataFrame(df_clin.loc[:,'GENDER']).T])
pat2covarites_df = pat2covarites_df.loc[:,pat2covarites_df.columns.isin(pat2disease_df.columns)]
pat2covarites_df.loc['GENDER',pat2covarites_df.loc['GENDER',:]=='FEMALE'] = 1
pat2covarites_df.loc['GENDER',pat2covarites_df.loc['GENDER',:]=='MALE'] = 0
pat2covarites_df.fillna(0.5, inplace=True)
pat2covarites_df = pat2covarites_df.reindex_axis(sorted(pat2covarites_df.columns), axis=1)

df_race = pd.DataFrame(df_clin.loc[df_clin.index.isin(pat2disease_df.columns), 'RACE'])
df_race = df_race.loc[pat2covarites_df.columns,:]
df_race['BLACK'] = (df_race.RACE=='BLACK OR AFRICAN AMERICAN').astype(int)
df_race['ASIAN'] = (df_race.RACE=='ASIAN').astype(int)
df_race.drop('RACE', axis=1, inplace=True)

pat2covarites_df = pd.concat([pat2covarites_df, df_race.T])
pat2covarites_df = pat2covarites_df.reindex_axis(sorted(pat2covarites_df.columns), axis=1)
pat2covarites_df.to_csv('covariates.txt', sep='\t')

In [8]:
# Sort index and columns
pats_15 = sorted(df_exp.columns)
df_exp.columns = df_exp.columns.str[:12]
df_exp = df_exp.reindex_axis(sorted(df_exp.index), axis=0)
df_exp = df_exp.reindex_axis(sorted(df_exp.columns), axis=1)
df_exp.to_csv('pat2exp.txt', sep='\t')
# Filter the expression data
df_exp = df_exp.loc[df_exp.T.median()>1,:]
df_exp_log2 = np.log2(df_exp.clip(lower=1./8))
df_exp_log2.to_csv('pat2exp_log2.txt', sep='\t')
df_exp_log2_zscore = ((df_exp_log2.T - df_exp_log2.T.mean())/df_exp_log2.T.std()).T

In [9]:
def rank_based_inv_norm(x, c=3./8):
    return norm.ppf((rankdata(x) - c) / (x.size - 2*c + 1))

In [11]:
with open('whitelist.txt', 'w') as f:
    f.write('\n'.join(df_exp.index))

Parse copy number


In [12]:
# Parse CNA
fns = glob.glob('/cellar/data/users/wzhang1984/cBioPortal/*/tcga/data_linear_CNA.txt')
dfs = []
for fn in fns:
    disease = fn.split('/')[-3]
    df = pd.read_table(fn, index_col=0)
    df.columns = df.columns.str[:15]
    df = df.loc[:,df.columns.isin(pats_15_ncm)]
    npats = len(df.columns)
    if npats > 0:
        print(disease, npats)
        dfs.append(df)
df_cna = pd.concat(dfs, axis=1)


('brca', 95)
('prad', 20)
('coadread', 65)
('blca', 22)
('hnsc', 50)
('sarc', 34)
('lihc', 52)
('kich', 49)
('skcm', 36)
('gbm', 42)
('ucec', 51)
('laml', 30)
('luad', 50)
('kirc', 41)
('cesc', 19)
('stad', 40)
('kirp', 36)
('thca', 50)
('lusc', 50)
('dlbc', 7)
('ov', 50)
('lgg', 19)

In [13]:
# Finalize CNA features
df_cna.columns = df_cna.columns.str[:12]
# *Check the number of overlapped patients
print len(df_exp.columns & df_cna.columns)
df_cna = df_cna.loc[df_exp.index,df_exp.columns]
df_cna.fillna(0, inplace=True)


761

In [ ]:

Parse methylation


In [14]:
# Parse methylation from probes
tcga_solid_02 = np.load('/cellar/data/users/wzhang1984/TCGA_methylation/tcga_solid_0.2.npy')

tcga_solid_02_index = pd.read_pickle('/cellar/data/users/wzhang1984/TCGA_methylation/tcga_solid_0.2.index.pickle')

with open('/cellar/data/users/wzhang1984/TCGA_methylation/tcga_solid_0.2.row.txt') as f:
    tcga_solid_02_row = f.read().rstrip().splitlines()

with open('/cellar/data/users/wzhang1984/TCGA_methylation/tcga_solid_0.2.column.txt') as f:
    tcga_solid_02_column = f.read().rstrip().splitlines()

df_me = pd.DataFrame(tcga_solid_02, index=tcga_solid_02_column, columns=tcga_solid_02_row)

df_me.columns = df_me.columns.str[:15]
df_me = df_me.loc[:,df_me.columns.isin(pats_15_ncm)]

In [15]:
# Load methylation probes-promoter mapping
me2gene = pd.read_table('/cellar/data/users/wzhang1984/TCGA_methylation/methyl_probes_ann.txt', index_col=0)
me2gene_dict = me2gene.loc[me2gene.loc[:,'Distance to TSS'].abs()<=1000,'Gene Name'].to_dict()

In [16]:
# Map methylation to promoters
# Take mean values if there is a multiple match
df_me = df_me.loc[df_me.index.isin(me2gene_dict.keys()),:]
df_me.index = df_me.index.to_series().map(me2gene_dict)
df_me = df_me.groupby(df_me.index).mean()
df_me = df_me.groupby(df_me.columns, axis=1).mean()

In [17]:
df_me.columns = df_me.columns.str[:12]
pat2disease_df.loc[:,set(df_exp.columns) - set(df_me.columns)].T.sum()


Out[17]:
blca         0.0
brca        47.0
cesc         0.0
coadread     3.0
dlbc         0.0
gbm         17.0
hnsc         0.0
kich         0.0
kirc        19.0
kirp        14.0
laml         0.0
lgg          0.0
lihc         0.0
luad        20.0
lusc        29.0
ov          24.0
prad         0.0
sarc         0.0
skcm         0.0
stad         5.0
thca         0.0
dtype: float64

In [18]:
# Finalize methylation features
df_me.columns = df_me.columns.str[:12]
# *Check the number of overlapped patients
print len(df_exp.columns & df_me.columns)
df_me = df_me.loc[df_exp.index,:]
df_me.fillna(0, inplace=True)
df_me = df_me.loc[df_exp.index,df_exp.columns]
df_me = df_me.T.fillna(df_me.T.mean()).T


605

Parse noncoding mutations


In [27]:
gene2loci = {}
loci2pats = {}
loci2concentrate = {}
for line in open('../autoAnno_GeneHancer/TCGA_snv_mnv_merged_-1_anno_promoter_genehancer.txt'):
    row = line.split('\n')[0].split('\t')
    loci = '{}:{}-{}'.format(row[0], row[1], row[2])
    pats = set()
    mut2pats = {}
    for mut in row[4].split(','):
        [pat, Chr_StartEnd, source, alt] = mut.split("__")
        pat = pat[:12]
        pats.add(pat)
        if Chr_StartEnd not in mut2pats:
            mut2pats[Chr_StartEnd] = set()
        mut2pats[Chr_StartEnd].add(pat)
    loci2pats[loci] = pats
    loci2concentrate[loci] = 0
    for mut in mut2pats:
        loci2concentrate[loci] = max(loci2concentrate[loci], len(mut2pats[mut])/float(len(pats)))

    for g in row[-1].split(','):
        [gene, tp] = g.split('|')
        if gene not in gene2loci:
            gene2loci[gene] = set()
        gene2loci[gene].add(loci)

Load covariates


In [28]:
line_factors = ''
with open('PEER/log2_factors_20.txt') as f:
    for line in f.read().splitlines():
        row = line.split('\t')
        if row[0] == 'mean':
            continue
#         try:
#             if row[0][0] == 'h' and int(row[0][1:])>20:
#                 continue
#         except:
#             pass
        line_factors += line + '\n'

Finalize data for linear regression


In [29]:
os.system('mkdir -p data4lm_peer lm_coef_p lm_model_p log')
os.system('rm data4lm_peer/* lm_coef_p/* lm_model_p/* log/*')


Out[29]:
512

In [ ]:
cut_npats = 5

line_index2gene = ''
i = 0
line_header = '\t{}\n'.format('\t'.join(df_exp.columns))
pats_set = set(df_exp.columns)
for gene, row in df_exp.iterrows():
    if gene in gene2loci:
        line_loci = ''
        for locus in gene2loci[gene]:
            if len(set(loci2pats[locus])&pats_set) < cut_npats:
                continue
            if loci2concentrate[locus]<0.35:
                continue
            line_loci += locus
            for pat in df_exp.columns:
                if pat in loci2pats[locus]:
                    line_loci += '\t1'
                else:
                    line_loci += '\t0'
            line_loci += '\n'
        if line_loci:
            i += 1
            print(i, gene)
            line_index2gene += '{}\t{}\n'.format(i, gene)
            line_out = line_header + line_loci
            line_out += 'CNA\t{}\n'.format('\t'.join(df_cna.loc[gene,:].astype(str)))
            line_out += 'methylation\t{}\n'.format('\t'.join(df_me.loc[gene,:].astype(str)))
            line_out += line_factors
            line_out += 'exp\t{}\n'.format('\t'.join(df_exp_log2_zscore.loc[gene,:].astype(str)))
            with open('archive/merge_-1_log2_PEER_20/data4lm_peer/{}.txt'.format(i), 'w') as f:
                f.write(line_out)

with open('archive/merge_-1_log2_PEER_20/index2gene4lm.txt', 'w') as f:
    f.write(line_index2gene)

In [ ]:


In [ ]:


In [ ]:


In [ ]: