Test the fast cosine


In [78]:
import sys
import math
sys.path.append('/home/simon/git/lda/code/')
import glob
input_files = glob.glob('/home/simon/Dropbox/BioResearch/Meta_clustering/MS2LDA/fingerid-104-traindata/spectra_massbank/*.ms')

In [131]:
from ms2lda_feature_extraction import LoadGNPS
l = LoadGNPS()
ms1,ms2,metadata = l.load_spectra(input_files[:2000])


Processed 100 spectra
Processed 200 spectra
Processed 300 spectra
Processed 400 spectra
Processed 500 spectra
Processed 600 spectra
Processed 700 spectra
Processed 800 spectra
Processed 900 spectra
Processed 1000 spectra
Processed 1100 spectra
Processed 1200 spectra
Processed 1300 spectra
Processed 1400 spectra
Processed 1500 spectra
Processed 1600 spectra
Processed 1700 spectra
Processed 1800 spectra
Processed 1900 spectra
Processed 2000 spectra

In [132]:
spec_dict = {}
for m in ms2:
    tempms1 = m[3]
    if not tempms1 in spec_dict:
        spec_dict[tempms1] = []
    spec_dict[tempms1].append((m[0],m[2]))

In [133]:
for tempms1,spec in spec_dict.items():
    spec = sorted(spec,key = lambda x: x[0])
    spec_dict[tempms1] = spec

In [134]:
def sqrt_normalise(spec):
    # this is weird
    intermediate_spec = []
    total = 0.0
    for mz,intensity in spec:
        total += intensity
        intermediate_spec.append((mz,math.sqrt(intensity)))
    output_spec = []
    norm_fac = math.sqrt(total)
    for mz,intensity in intermediate_spec:
        output_spec.append((mz,intensity/norm_fac))
    return output_spec

def fast_cosine(spec1,spec2,tol = 0.5):
    # spec 1 and spec 2 have to be sorted by mz
    if len(spec1) == 0 or len(spec2) == 0:
        return 0.0
    # find all the matching pairs
    
    spec1 = sqrt_normalise(spec1)
    spec2 = sqrt_normalise(spec2)
    
    matching_pairs = []
    spec2lowpos = 0
    spec2length = len(spec2)
    
#     for mz,i in spec1:
#         print mz
#     print
#     print
#     for mz,i in spec2:
#         print mz
    
    for idx,(mz,intensity) in enumerate(spec1):
        # do we need to increase the lower idx?
        while spec2lowpos < spec2length and spec2[spec2lowpos][0]<mz-tol:
            spec2lowpos += 1
        if spec2lowpos == spec2length:
            break
#         print "mz = {}, current low = {}".format(mz,spec2[spec2lowpos][0])
        spec2pos = spec2lowpos
        while(spec2pos < spec2length and spec2[spec2pos][0] < mz + tol):
            matching_pairs.append((idx,spec2pos,intensity*spec2[spec2pos][1]))
#             print "added {} and {}".format(mz,spec2[spec2pos][0])
            spec2pos += 1
        
        
    matching_pairs = sorted(matching_pairs,key = lambda x:x[2],reverse = True)
    used1 = set()
    used2 = set()
    score = 0.0
    used_matches = []
    for m in matching_pairs:
        if not m[0] in used1 and not m[1] in used2:
            score += m[2]
            used1.add(m[0])
            used2.add(m[1])
            used_matches.append(m)
    return score,used_matches

In [137]:
score_list = []
for i in range(len(spec_dict)-1):
    if i%100 == 0:
        print i
    for k in range(i+1,len(spec_dict)):
        spec1 = spec_dict.keys()[i]
        spec2 = spec_dict.keys()[k]

        score,match = fast_cosine(spec_dict[spec1],spec_dict[spec2])
        score_list.append((spec1,spec2,score,match))


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900

In [138]:
import numpy as np
score_list = sorted(score_list,key = lambda x: x[2],reverse = True)
for s in score_list[:10]:
    print s[2]


0.998742677557
0.998566366824
0.998360302483
0.997830760216
0.997730310821
0.997698848171
0.997466375006
0.997457402543
0.997002342294
0.9962959127

In [153]:
s1 = spec_dict[score_list[2][0]]

In [154]:
s2 = spec_dict[score_list[2][1]]

In [155]:
for i,(mz,inte) in enumerate(s1):
    print i,mz,inte
print metadata[score_list[2][1].name]


0 67.0542 2888.7
1 75.0229 102349.8
2 76.0307 5352.6
3 77.0385 6545281.8
4 81.0335 83150.0
5 89.0385 10566.8
6 91.0542 10377.5
7 94.0413 49069.1
8 95.049 3629064.3
9 101.0387 19379.6
10 102.0464 812258.5
11 103.0541 19458714.9
12 104.0494 16721.8
13 105.0446 2642234.2
14 106.0416 5325.6
15 119.0491 23268.8
16 128.0495 667393.7
17 129.0448 227387.6
18 129.0573 49215.1
19 130.0649 150136344.5
{'smiles': 'N1=CC=C2C=CC=CC2=C1', 'InChIKey': 'AWJUIBRHMBBTKR-UHFFFAOYSA-N', 'ionization': '[M + H]+', 'collision': '180', 'compound': 'ufz_0066', 'InChI': 'InChI=1S/C9H7N/c1-2-4-9-7-10-6-5-8(9)3-1/h1-7H', 'formula': 'C9H7N', 'parentmass': '130.0651', 'annotation': 'ufz_0066'}

In [156]:
for i,(mz,inte) in enumerate(s2):
    print i,mz,inte
print metadata[score_list[2][0].name]


0 77.0383 277874.4
1 95.0488 199224.2
2 102.0459 31261.3
3 103.0539 1038691.0
4 105.0443 148718.5
5 115.0539 3104.5
6 128.0489 26143.6
7 129.0442 10391.1
8 130.0645 6980513.1
{'smiles': 'N1=CC=C2C=CC=CC2=C1', 'InChIKey': 'AWJUIBRHMBBTKR-UHFFFAOYSA-N', 'ionization': '[M + H]+', 'collision': '180', 'compound': 'ufz_0006', 'InChI': 'InChI=1S/C9H7N/c1-2-4-9-7-10-6-5-8(9)3-1/h1-7H', 'formula': 'C9H7N', 'parentmass': '130.0651', 'annotation': 'ufz_0006'}

In [149]:
print dir(ms1[0])
print ms1[0].file_name


['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'file_name', 'id', 'intensity', 'mz', 'name', 'rt', 'scan_number']
gnps

In [ ]: