This script calculates Morgan similarity features.

Test compounds are compared to themselves and to a number of other known odorants.


In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors

In [2]:
# Load the train, leaderboard and test CIDs (one integer per line).
# Blank lines are skipped so a stray empty line does not make int() fail.
with open('CIDs.txt') as f:
    CIDs = [int(line) for line in f if line.strip()]

# Load the SMILES table; the first CSV column becomes the index, which
# calulate_similarities uses to look up each CID via smiles.loc[cid].
smiles = pd.read_csv('all_smiles.csv', index_col=0)

In [3]:
def _mol_from_smiles(label, smi):
    """Parse one SMILES string, raising a clear error if RDKit rejects it."""
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with the offending id instead of deep inside the fingerprint call.
        raise ValueError(
            'could not parse SMILES {!r} for id {!r}'.format(smi, label))
    return mol


def calulate_similarities(ids, radius):
    """Calculate similarity features from Morgan fingerprints.

    Every molecule in the module-level ``smiles`` table is fingerprinted once
    with ``AllChem.GetMorganFingerprint``. Each id in ``ids`` is then looked
    up in that table, fingerprinted with the same radius, and compared to all
    reference fingerprints with ``DataStructs.DiceSimilarity``.

    Args:
        ids: iterable of ids; each must be a label of ``smiles.index``.
        radius: Morgan fingerprint radius.

    Returns:
        DataFrame of size ``len(ids)`` x ``len(smiles)``. Rows are indexed by
        the given ids (index named ``0``, as in the original implementation)
        and the columns are ``smiles.index``. Empty ``ids`` yields an empty
        frame with the same columns.

    Raises:
        KeyError: if an id is not present in ``smiles.index``.
        ValueError: if a SMILES string cannot be parsed.
    """
    ids = list(ids)

    # Reference fingerprints: one per row of the SMILES table, computed once.
    reference_fps = [
        AllChem.GetMorganFingerprint(_mol_from_smiles(label, smi), radius)
        for label, smi in zip(smiles.index, smiles.smiles)
    ]

    rows = []
    for cid in ids:
        sample_mol = _mol_from_smiles(cid, smiles.loc[cid].smiles)
        sample_fp = AllChem.GetMorganFingerprint(sample_mol, radius)
        rows.append(
            [DataStructs.DiceSimilarity(fp, sample_fp) for fp in reference_fps])

    # Build the frame directly with its final labels; the index keeps the
    # name 0 that the former set_index(0) produced, so saved CSVs are unchanged.
    return pd.DataFrame(rows, index=pd.Index(ids, name=0), columns=smiles.index)

In [4]:
# Build the similarity feature matrix for all CIDs with a Morgan radius of 5.
features_sim = calulate_similarities(ids=CIDs, radius=5)

# Show (number of rows, number of columns) of the feature matrix.
print(features_sim.shape)


(476, 2437)

In [5]:
# Preview the first five rows of the similarity matrix.
features_sim.head(n=5)


Out[5]:
CID 58 102 107 125 126 174 176 177 178 179 ... 91305518 91411526 91541756 91552833 91563027 91595028 91614181 91617014 91617930 91618238
0
126 0.086957 0.379310 0.268657 0.586207 1.000000 0.052632 0.108108 0.171429 0.054054 0.095238 ... 0.003578 0.018182 0.118421 0.017192 0.145251 0.013652 0.056206 0.047945 0.011730 0.105263
176 0.480000 0.054054 0.217391 0.054054 0.108108 0.117647 1.000000 0.285714 0.625000 0.571429 ... 0.011152 0.014320 0.091603 0.030488 0.037975 0.009695 0.024631 0.036900 0.015129 0.064516
177 0.173913 0.000000 0.045455 0.000000 0.171429 0.000000 0.285714 1.000000 0.285714 0.210526 ... 0.003731 0.009592 0.031008 0.018405 0.025641 0.002774 0.009901 0.014870 0.006070 0.043956
180 0.320000 0.000000 0.130435 0.000000 0.054054 0.000000 0.625000 0.285714 0.625000 0.571429 ... 0.011152 0.014320 0.061069 0.030488 0.025316 0.009695 0.024631 0.044280 0.015129 0.043011
196 0.375000 0.100000 0.289855 0.100000 0.066667 0.200000 0.256410 0.054054 0.153846 0.181818 ... 0.032086 0.027149 0.116883 0.096866 0.044199 0.028630 0.055944 0.074830 0.046784 0.103448

5 rows × 2437 columns


In [11]:
# Write the similarity matrix (including its index) to a CSV file.
features_sim.to_csv(path_or_buf='morgan_sim.csv')

In [ ]: