In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors
In [2]:
# Load the train, leaderboard and test CIDs: one integer per line of CIDs.txt.
with open('CIDs.txt') as f:
    content = f.readlines()
# Skip blank lines (e.g. a stray empty line at the end of the file), which
# int() would otherwise reject with a ValueError.
CIDs = [int(line) for line in content if line.strip()]

# Load the SMILES table from the previously generated CSV; the first column
# becomes the index, which is later looked up by CID.
smiles = pd.read_csv('all_smiles.csv', index_col=0)
In [3]:
def calulate_similarities(ids, radius):
    """Build Dice-similarity features from Morgan fingerprints.

    Every molecule in the notebook-level ``smiles`` table (a DataFrame with a
    ``smiles`` column, indexed by the same labels that ``ids`` uses) is
    fingerprinted once. Each requested id is then compared against all of
    those fingerprints.

    Parameters
    ----------
    ids : iterable
        Labels to create rows for; each must be present in ``smiles.index``.
    radius : int
        Morgan fingerprint radius.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``ids`` (in the given order) and one column per
        entry of ``smiles.index``. The row index is named ``0``, matching the
        layout this function has always produced. An empty ``ids`` yields a
        frame with zero rows.

    Raises
    ------
    KeyError
        If an id is not present in ``smiles.index``.
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    ids = list(ids)

    mols = [Chem.MolFromSmiles(s) for s in smiles.smiles]
    # MolFromSmiles signals a parse failure by returning None; report the
    # offending labels instead of failing later inside the fingerprint call.
    unparsable = [label for label, mol in zip(smiles.index, mols) if mol is None]
    if unparsable:
        raise ValueError(
            "could not parse SMILES for: {}".format(unparsable)
        )

    fps = [AllChem.GetMorganFingerprint(mol, radius) for mol in mols]
    # Every requested id is a row of `smiles`, so its fingerprint already
    # exists in `fps`; look it up instead of recomputing it per id.
    fp_by_label = dict(zip(smiles.index, fps))

    # One bulk call per id replaces the Python-level inner loop.
    rows = [DataStructs.BulkDiceSimilarity(fp_by_label[cid], fps) for cid in ids]

    return pd.DataFrame(rows, index=pd.Index(ids, name=0), columns=smiles.index)
In [4]:
# build the similarity feature table for all CIDs (Morgan radius 5)
# and report its dimensions
features_sim = calulate_similarities(ids=CIDs, radius=5)
print(features_sim.shape)
In [5]:
# preview the first five rows of the similarity features
features_sim.head(n=5)
Out[5]:
In [11]:
# write the similarity features to disk as CSV
features_sim.to_csv(path_or_buf='morgan_sim.csv')
In [ ]: