In [8]:
import sys
## 201612 data
#filename_raw = '../../data/03_ShawnJe/Je201710_Lipidomics/201612_OrganoidLipidomics.raw.txt'
#filename_out = '../../data/03_ShawnJe/Je201710_Lipidomics/201612_OrganoidLipidomics.refined.txt'
## 201710 data
filename_raw = '../../data/03_ShawnJe/Je201710_Lipidomics/201708_OrganoidLipidomics.raw.txt'
filename_out = '../../data/03_ShawnJe/Je201710_Lipidomics/201708_OrganoidLipidomics.refined.txt'
f_raw = open(filename_raw,'r')
target_list = [x.split(',')[0].replace(' ','').replace('"','').replace(' ','_').strip() for x in f_raw.readline().split("\t")[1:]]
#print("\n".join(sorted(target_list)))
t2s = dict()
for tmp_t in target_list:
t2s[tmp_t] = dict()
sample_list = []
for line in f_raw:
tmp_tokens = line.strip().split("\t")
if len(tmp_tokens) <= 2:
continue
tmp_sample_name = tmp_tokens[0].replace(' ','_').replace('Hg','').replace('Lipo','Div')
sample_list.append(tmp_sample_name)
is_missed = 0
for i in range(1,len(tmp_tokens)):
if tmp_tokens[i] != '_' and tmp_tokens[i] != '-' :
t2s[target_list[i-1]][tmp_sample_name] = float(tmp_tokens[i])
else:
t2s[target_list[i-1]][tmp_sample_name] = 0.0
f_raw.close()
count = 0
f_out = open(filename_out,'w')
f_out.write("TargetName\t%s\n"%("\t".join(sorted(sample_list))))
for tmp_t in sorted(target_list):
out_str_list = []
count_zero = 0
for tmp_s in sorted(sample_list):
if tmp_s in t2s[tmp_t]:
out_str_list.append( '%.05f'%t2s[tmp_t][tmp_s] )
if t2s[tmp_t][tmp_s] == 0:
count_zero += 1
else:
count_zero += 1
if len(out_str_list) != len(sample_list):
sys.stderr.write('Skip: %s\n'%tmp_t)
continue
if count_zero > len(sample_list)*0.2:
sys.stderr.write('Skip due to zeros: %s\n'%tmp_t)
continue
count += 1
f_out.write( "%s\t%s\n"%(tmp_t, "\t".join(out_str_list)) )
f_out.close()
In [10]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,6))
ax1 = fig.add_subplot(1,1,1)
missed_values = []
for tmp_t in target_list:
tmp_values = [t2s[tmp_t][tmp_s] for tmp_s in sample_list]
tmp_missed_fraction = len([x for x in tmp_values if x == 0])/len(tmp_values)*100.0
missed_values.append( tmp_missed_fraction )
ax1.bar(range(0,len(missed_values)), sorted(missed_values))
ax1.grid()
ax1.set_xlabel("Lipid components")
ax1.set_ylabel("Fraction of missing and zero values")
ax1.set_xticks(range(0,len(target_list), 10))
ax1.set_yticks(range(0,101,10))
plt.show()
print("< 20 pct missing values: %d"%len([x for x in missed_values if x < 20]))
In [109]:
import math
filename_limma = '../../data/03_ShawnJe/Je201710_Lipidomics/201612_OrganoidLipidomics.GBA-WT.limma_out.txt'
f_limma = open(filename_limma,'r')
headers = f_limma.readline().replace('"','').strip().split()
#print(headers)
idx_FDR = headers.index('adj.P.Val') + 1
idx_CIL = headers.index('CI.L') + 1
idx_CIR = headers.index('CI.R') + 1
idx_AveExpr = headers.index('AveExpr') + 1
idx_logFC = headers.index('logFC') + 1
M_list = []
A_list = []
M_DE_list = []
A_DE_list = []
M_Glc_list = []
A_Glc_list = []
M_SM_list = []
A_SM_list = []
print("Compound\tMeanSignal\tFDR\tlog2FC\t95pctCI.L\t95pctCI.R")
for line in f_limma:
tokens = line.strip().split()
tmp_id = tokens[0].replace('"','')
tmp_FDR = float(tokens[idx_FDR])
tmp_CIL = float(tokens[idx_CIL])
tmp_CIR = float(tokens[idx_CIR])
tmp_logFC = float(tokens[idx_logFC])
tmp_avg = float(tokens[idx_AveExpr])
#print(tmp_id.split('(')[0])
M_list.append(tmp_avg)
A_list.append(tmp_logFC)
if tmp_id.startswith('SM'):
M_SM_list.append(tmp_avg)
A_SM_list.append(tmp_logFC)
if tmp_id.startswith('Glc'):
M_Glc_list.append(tmp_avg)
A_Glc_list.append(tmp_logFC)
if tmp_FDR < 0.05 and abs(tmp_logFC) > math.log(1.5)/math.log(2):
#if tmp_id.startswith('DHSM'):
M_DE_list.append(tmp_avg)
A_DE_list.append(tmp_logFC)
if tmp_logFC > 0:
print("%s\t%.2f\t%.2e\t%.2f\t%.2f\t%.2f"%(tmp_id, tmp_avg, tmp_FDR, tmp_logFC, tmp_CIL, tmp_CIR))
else:
print("%s\t%.2f\t%.2e\t%.2f\t%.2f\t%.2f"%(tmp_id, tmp_avg, tmp_FDR, tmp_logFC, tmp_CIR, tmp_CIL))
f_limma.close()
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6,6))
ax1 = fig.add_subplot(1,1,1)
ax1.plot(M_list, A_list, 'k.')
ax1.plot(M_SM_list, A_SM_list, 'c*', ms=9, label='Sphingomyelin(SM)')
ax1.plot(M_Glc_list, A_Glc_list, 'm*', ms=9, label='Glucosylceramide(GlcCer)')
ax1.plot(M_PC_list, A_PC_list, 'b*', ms=9, label='Phosphatydilcholine(PC)')
ax1.plot(M_DE_list, A_DE_list, 'o', ms=8, markerfacecolor="None", markeredgecolor='red', markeredgewidth=1, label='Significantly different')
ax1.grid()
ax1.legend()
ax1.set_xlabel("Mean Signal")
ax1.set_ylabel("log2(fold change)")
plt.show()
In [127]:
filename_limma = '../../data/03_ShawnJe/Je201710_Lipidomics/201612_OrganoidLipidomics.GBA_Div90-Div30.limma_out.txt'
f_limma = open(filename_limma,'r')
headers = f_limma.readline().replace('"','').strip().split()
#print(headers)
idx_FDR = headers.index('adj.P.Val') + 1
idx_CIL = headers.index('CI.L') + 1
idx_CIR = headers.index('CI.R') + 1
idx_AveExpr = headers.index('AveExpr') + 1
idx_logFC = headers.index('logFC') + 1
M_list = []
A_list = []
M_DE_list = []
A_DE_list = []
M_Glc_list = []
A_Glc_list = []
M_SM_list = []
A_SM_list = []
M_PC_list = []
A_PC_list = []
print("Compound\tMeanSignal\tFDR\tlog2FC\t95pctCI.L\t95pctCI.R")
for line in f_limma:
tokens = line.strip().split()
tmp_id = tokens[0].replace('"','')
tmp_FDR = float(tokens[idx_FDR])
tmp_CIL = float(tokens[idx_CIL])
tmp_CIR = float(tokens[idx_CIR])
tmp_logFC = float(tokens[idx_logFC])
tmp_avg = float(tokens[idx_AveExpr])
M_list.append(tmp_avg)
A_list.append(tmp_logFC)
if tmp_id.startswith('SM'):
M_SM_list.append(tmp_avg)
A_SM_list.append(tmp_logFC)
if tmp_id.startswith('Glc'):
M_Glc_list.append(tmp_avg)
A_Glc_list.append(tmp_logFC)
if tmp_id.startswith('PC'):
M_PC_list.append(tmp_avg)
A_PC_list.append(tmp_logFC)
if tmp_FDR < 0.05 and abs(tmp_logFC) > math.log(1.5)/math.log(2):
#if tmp_id.startswith('DHSM'):
M_DE_list.append(tmp_avg)
A_DE_list.append(tmp_logFC)
if tmp_logFC > 0:
print("%s\t%.2f\t%.2e\t%.2f\t%.2f\t%.2f"%(tmp_id, tmp_avg, tmp_FDR, tmp_logFC, tmp_CIL, tmp_CIR))
else:
print("%s\t%.2f\t%.2e\t%.2f\t%.2f\t%.2f"%(tmp_id, tmp_avg, tmp_FDR, tmp_logFC, tmp_CIR, tmp_CIL))
f_limma.close()
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6,6))
ax1 = fig.add_subplot(1,1,1)
ax1.plot(M_list, A_list, 'k.')
ax1.plot(M_SM_list, A_SM_list, 'c*', ms=9, label='Sphingomyelin(SM)')
ax1.plot(M_Glc_list, A_Glc_list, 'm*', ms=9, label='Glucosylceramide(GlcCer)')
ax1.plot(M_PC_list, A_PC_list, 'b*', ms=9, label='Phosphatydilcholine(PC)')
ax1.plot(M_DE_list, A_DE_list, 'o', ms=8, markerfacecolor="None", markeredgecolor='red', markeredgewidth=1, label='Significantly different')
ax1.grid()
ax1.legend(loc='upper right')
ax1.set_xlabel("Mean Signal")
ax1.set_ylabel("log2(fold change)")
ax1.set_ylim( -2, 2 )
plt.show()
In [ ]: