In [6]:
import os
import gzip
#data_dir = '../../data/TissueMap'
data_dir = '../../data/Flounder'
## RPKM * (library size in Million) / read_len == 1 for 1x coverage
lib_size = 10
read_len = 200
#out_type = 'count'
#out_format = '%d'
out_type = 'rpkm'
out_format = '%.2f'
query_word = 'Liver'
filename_out = 'Kwon201608_KoreanFlounder_%s.%s.txt'%(query_word, out_type)
def read_exp(filename):
#SeqID ReadCount PairCount ReadCPM PairCPM SeqLen ReadRPKM PairRPKM
rv = dict()
f = open(filename,'r')
if filename.endswith('.gz'):
f = gzip.open(filename,'rt')
for line in f:
if line.startswith('#'):
continue
tokens = line.strip().split("\t")
#print(tokens)
seq_id = tokens[0]
total_count = int(tokens[1])
pair_count = int(tokens[2])
total_cpm = float(tokens[3])
pair_cpm = float(tokens[4])
seq_len = int(tokens[5])
total_rpkm = float(tokens[6])
pair_rpkm = float(tokens[7])
#rv[seq_id] = {'count':pair_count, 'cpm':pair_cpm, 'rpkm':pair_rpkm, 'len':seq_len}
rv[seq_id] = {'count':total_count, 'cpm':total_cpm, 'rpkm':total_rpkm, 'len':seq_len}
f.close()
return rv
exp_list = dict()
sample_list = []
seq_list = []
for filename in os.listdir(data_dir):
if filename.find('.rpkm+cpm') < 0:
continue
if filename.find(query_word) < 0:
continue
sample_name = filename.split('.')[0]
exp_list[sample_name] = read_exp(os.path.join(data_dir, filename))
seq_list += exp_list[sample_name].keys()
sample_list.append(sample_name)
selected_seq_list = []
print(sample_list)
f_out = open(os.path.join(data_dir, filename_out),'w')
#f_out.write("SeqID\t%s\n"%("\t".join(['%s_%s'%(x.split('_')[2], x.split('_')[5]) for x in sample_list])))
f_out.write("SeqID\t%s\n"%("\t".join([x.split('_')[2] for x in sample_list])))
#print("SeqID\t%s"%("\t".join(['%s_%s'%(x.split('_')[2], x.split('_')[5]) for x in sample_list])))
for tmp_seq_id in sorted(list(set(seq_list))):
out_str = [tmp_seq_id]
is_low = 0
for tmp_sample_id in sample_list:
if not tmp_seq_id in exp_list[tmp_sample_id]:
is_low += 1
exp_list[tmp_sample_id][tmp_seq_id] = {'rpkm': 0.0}
out_str.append(out_format%(0.0))
continue
tmp_rpkm = exp_list[tmp_sample_id][tmp_seq_id]['rpkm']
tmp_exp = exp_list[tmp_sample_id][tmp_seq_id][out_type]
if tmp_rpkm * read_len / lib_size < 1:
is_low += 1
out_str.append(out_format%(0.0))
else:
out_str.append(out_format%(tmp_exp))
if is_low >= len(sample_list)*2/3:
continue
selected_seq_list.append(tmp_seq_id)
#print("\t".join(out_str))
f_out.write("\t".join(out_str) + "\n")
In [7]:
import scipy.stats as stat
seq_list = sorted(list(set(seq_list)))
for i in range(0,len(sample_list)):
sample_i = sample_list[i]
exp_i_list = [exp_list[sample_i][x]['rpkm'] for x in selected_seq_list]
for j in range(i+1,len(sample_list)):
sample_j = sample_list[j]
exp_j_list = [exp_list[sample_j][x]['rpkm'] for x in selected_seq_list]
print( "Pearson", sample_i, sample_j )
print( stat.pearsonr(exp_i_list, exp_j_list) )
print( "Spearman", sample_i, sample_j )
print( stat.spearmanr(exp_i_list, exp_j_list), "\n" )
In [ ]: