In [6]:
import os
import gzip

#data_dir = '../../data/TissueMap'
data_dir = '../../data/Flounder'

## RPKM * (library size in Million) / read_len == 1 for 1x coverage
lib_size = 10
read_len = 200
#out_type = 'count'
#out_format = '%d'
out_type = 'rpkm'
out_format = '%.2f'
query_word = 'Liver'
filename_out = 'Kwon201608_KoreanFlounder_%s.%s.txt'%(query_word, out_type)

def read_exp(filename):
    #SeqID	ReadCount	PairCount	ReadCPM	PairCPM	SeqLen	ReadRPKM	PairRPKM
    rv = dict()
    
    f = open(filename,'r')
    if filename.endswith('.gz'):
        f = gzip.open(filename,'rt')
        
    for line in f:
        if line.startswith('#'):
            continue
        tokens = line.strip().split("\t")
        #print(tokens)
        seq_id = tokens[0]
        total_count = int(tokens[1])
        pair_count = int(tokens[2])
        total_cpm = float(tokens[3])
        pair_cpm = float(tokens[4])
        seq_len = int(tokens[5])
        total_rpkm = float(tokens[6])
        pair_rpkm = float(tokens[7])
        #rv[seq_id] = {'count':pair_count, 'cpm':pair_cpm, 'rpkm':pair_rpkm, 'len':seq_len}
        rv[seq_id] = {'count':total_count, 'cpm':total_cpm, 'rpkm':total_rpkm, 'len':seq_len}
    f.close()
    return rv

exp_list = dict()
sample_list = []
seq_list = []
for filename in os.listdir(data_dir):
    if filename.find('.rpkm+cpm') < 0:
        continue
    if filename.find(query_word) < 0:
        continue
    sample_name = filename.split('.')[0]
    exp_list[sample_name] = read_exp(os.path.join(data_dir, filename))
    seq_list += exp_list[sample_name].keys()
    sample_list.append(sample_name)

selected_seq_list = []

print(sample_list)
f_out = open(os.path.join(data_dir, filename_out),'w')
#f_out.write("SeqID\t%s\n"%("\t".join(['%s_%s'%(x.split('_')[2], x.split('_')[5]) for x in sample_list])))
f_out.write("SeqID\t%s\n"%("\t".join([x.split('_')[2] for x in sample_list])))

#print("SeqID\t%s"%("\t".join(['%s_%s'%(x.split('_')[2], x.split('_')[5]) for x in sample_list])))
for tmp_seq_id in sorted(list(set(seq_list))):
    out_str = [tmp_seq_id]
    is_low = 0
    for tmp_sample_id in sample_list:
        if not tmp_seq_id in exp_list[tmp_sample_id]:
            is_low += 1
            exp_list[tmp_sample_id][tmp_seq_id] = {'rpkm': 0.0}
            out_str.append(out_format%(0.0))
            continue
            
        tmp_rpkm = exp_list[tmp_sample_id][tmp_seq_id]['rpkm']
        tmp_exp = exp_list[tmp_sample_id][tmp_seq_id][out_type]
        if tmp_rpkm * read_len / lib_size < 1:
            is_low += 1
            out_str.append(out_format%(0.0))
        else:
            out_str.append(out_format%(tmp_exp))
    
    if is_low >= len(sample_list)*2/3:
        continue
    
    selected_seq_list.append(tmp_seq_id)
    #print("\t".join(out_str))
    f_out.write("\t".join(out_str) + "\n")


['Kwon201608_KoreanFlounder_LiverA', 'Kwon201608_KoreanFlounder_LiverB', 'Kwon201608_KoreanFlounder_LiverC']

In [7]:
import scipy.stats as stat

seq_list = sorted(list(set(seq_list)))

for i in range(0,len(sample_list)):
    sample_i = sample_list[i]
    exp_i_list = [exp_list[sample_i][x]['rpkm'] for x in selected_seq_list]
    for j in range(i+1,len(sample_list)):
        sample_j = sample_list[j]
        exp_j_list = [exp_list[sample_j][x]['rpkm'] for x in selected_seq_list]
        print( "Pearson", sample_i, sample_j )
        print( stat.pearsonr(exp_i_list, exp_j_list) )
        print( "Spearman", sample_i, sample_j )
        print( stat.spearmanr(exp_i_list, exp_j_list), "\n" )


Pearson Kwon201608_KoreanFlounder_LiverA Kwon201608_KoreanFlounder_LiverB
(0.95324617948840995, 0.0)
Spearman Kwon201608_KoreanFlounder_LiverA Kwon201608_KoreanFlounder_LiverB
SpearmanrResult(correlation=0.89871853414530556, pvalue=0.0) 

Pearson Kwon201608_KoreanFlounder_LiverA Kwon201608_KoreanFlounder_LiverC
(0.98830340000999595, 0.0)
Spearman Kwon201608_KoreanFlounder_LiverA Kwon201608_KoreanFlounder_LiverC
SpearmanrResult(correlation=0.81583877705416608, pvalue=0.0) 

Pearson Kwon201608_KoreanFlounder_LiverB Kwon201608_KoreanFlounder_LiverC
(0.97809553820690609, 0.0)
Spearman Kwon201608_KoreanFlounder_LiverB Kwon201608_KoreanFlounder_LiverC
SpearmanrResult(correlation=0.80906969741772305, pvalue=0.0) 


In [ ]: