Test the fast cosine
In [78]:
import sys
import math
sys.path.append('/home/simon/git/lda/code/')
import glob
input_files = glob.glob('/home/simon/Dropbox/BioResearch/Meta_clustering/MS2LDA/fingerid-104-traindata/spectra_massbank/*.ms')
In [131]:
from ms2lda_feature_extraction import LoadGNPS
l = LoadGNPS()
ms1,ms2,metadata = l.load_spectra(input_files[:2000])
In [132]:
spec_dict = {}
for m in ms2:
tempms1 = m[3]
if not tempms1 in spec_dict:
spec_dict[tempms1] = []
spec_dict[tempms1].append((m[0],m[2]))
In [133]:
for tempms1,spec in spec_dict.items():
spec = sorted(spec,key = lambda x: x[0])
spec_dict[tempms1] = spec
In [134]:
def sqrt_normalise(spec):
# this is weird
intermediate_spec = []
total = 0.0
for mz,intensity in spec:
total += intensity
intermediate_spec.append((mz,math.sqrt(intensity)))
output_spec = []
norm_fac = math.sqrt(total)
for mz,intensity in intermediate_spec:
output_spec.append((mz,intensity/norm_fac))
return output_spec
def fast_cosine(spec1,spec2,tol = 0.5):
# spec 1 and spec 2 have to be sorted by mz
if len(spec1) == 0 or len(spec2) == 0:
return 0.0
# find all the matching pairs
spec1 = sqrt_normalise(spec1)
spec2 = sqrt_normalise(spec2)
matching_pairs = []
spec2lowpos = 0
spec2length = len(spec2)
# for mz,i in spec1:
# print mz
# print
# print
# for mz,i in spec2:
# print mz
for idx,(mz,intensity) in enumerate(spec1):
# do we need to increase the lower idx?
while spec2lowpos < spec2length and spec2[spec2lowpos][0]<mz-tol:
spec2lowpos += 1
if spec2lowpos == spec2length:
break
# print "mz = {}, current low = {}".format(mz,spec2[spec2lowpos][0])
spec2pos = spec2lowpos
while(spec2pos < spec2length and spec2[spec2pos][0] < mz + tol):
matching_pairs.append((idx,spec2pos,intensity*spec2[spec2pos][1]))
# print "added {} and {}".format(mz,spec2[spec2pos][0])
spec2pos += 1
matching_pairs = sorted(matching_pairs,key = lambda x:x[2],reverse = True)
used1 = set()
used2 = set()
score = 0.0
used_matches = []
for m in matching_pairs:
if not m[0] in used1 and not m[1] in used2:
score += m[2]
used1.add(m[0])
used2.add(m[1])
used_matches.append(m)
return score,used_matches
In [137]:
score_list = []
for i in range(len(spec_dict)-1):
if i%100 == 0:
print i
for k in range(i+1,len(spec_dict)):
spec1 = spec_dict.keys()[i]
spec2 = spec_dict.keys()[k]
score,match = fast_cosine(spec_dict[spec1],spec_dict[spec2])
score_list.append((spec1,spec2,score,match))
In [138]:
import numpy as np
score_list = sorted(score_list,key = lambda x: x[2],reverse = True)
for s in score_list[:10]:
print s[2]
In [153]:
s1 = spec_dict[score_list[2][0]]
In [154]:
s2 = spec_dict[score_list[2][1]]
In [155]:
for i,(mz,inte) in enumerate(s1):
print i,mz,inte
print metadata[score_list[2][1].name]
In [156]:
for i,(mz,inte) in enumerate(s2):
print i,mz,inte
print metadata[score_list[2][0].name]
In [149]:
print dir(ms1[0])
print ms1[0].file_name
In [ ]: