In [150]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# from rdkit.Chem import Draw
# from rdkit.Chem import rdMolDescriptors
import sys
sys.path.append('/global/project/projectdirs/openmsi/jupyterhub_libs/anaconda/lib/python2.7/site-packages')
from rdkit import Chem
# from rdkit.Chem import AllChem
# Path of the tab-delimited compound table. The name shadows the Python 2
# builtin `file`, but it is kept because the export cell further down reads it.
file = '/global/homes/b/bpb/notebooks/meta-iq/midas_lbl/MetaCyc.mdb'
import csv

# Parallel lists: mols[i] is the RDKit molecule parsed from row i's InChI and
# name[i] is that row's 'Name' column.
mols = []
name = []
# Python 2's csv module expects the file opened in binary mode; the `with`
# block guarantees the handle is closed even if a row fails to parse.
with open(file, 'rb') as handle:
    reader = csv.DictReader(handle, delimiter='\t')
    for row in reader:
        # NOTE(review): MolFromInchi presumably yields None for an InChI it
        # cannot parse -- confirm the fingerprinting cell tolerates that.
        # Entries are deliberately not filtered so indices stay aligned with
        # the rows of the input file.
        mols.append(Chem.MolFromInchi(row['InChI']))
        name.append(row['Name'])
In [138]:
# Show the column names of the input table, using the last row left bound by
# the loading loop. Written in call form to match the other print(...) calls
# in this notebook; with a single argument the output is unchanged.
print(row.keys())
In [151]:
from rdkit.Chem.Fingerprints import FingerprintMols

# Reference notes on the fingerprint options (pasted from the fingerprinter's
# documentation):
#
# - mol: the molecule to use
# - minPath: (optional) minimum number of bonds to include in the subgraphs
#   Defaults to 1.
# - maxPath: (optional) maximum number of bonds to include in the subgraphs
#   Defaults to 7.
# - fpSize: (optional) number of bits in the fingerprint
#   Defaults to 2048.
# - nBitsPerHash: (optional) number of bits to set per path
#   Defaults to 2.
# - useHs: (optional) include paths involving Hs in the fingerprint if the molecule
#   has explicit Hs.
#   Defaults to True.
# - tgtDensity: (optional) fold the fingerprint until this minimum density has
#   been reached
#   Defaults to 0.
# - minSize: (optional) the minimum size the fingerprint will be folded to when
#   trying to reach tgtDensity
#   Defaults to 128.
# - branchedPaths: (optional) if set both branched and unbranched paths will be
#   used in the fingerprint.
#   Defaults to True.
# - useBondOrder: (optional) if set both bond orders will be used in the path hashes
#   Defaults to True.
# - atomInvariants: (optional) a sequence of atom invariants to use in the path hashes
#   Defaults to empty.
# - fromAtoms: (optional) a sequence of atom indices. If provided, only paths/subgraphs
#   starting from these atoms will be used.
#   Defaults to empty.
# - atomBits: (optional) an empty list. If provided, the result will contain a list
#   containing the bits each atom sets.
#   Defaults to empty.

# Fingerprint length in bits; also passed as minSize below.
N = 1024

# One fingerprint per molecule, in the same order as `mols`.
fps = []
for mol in mols:
    fingerprint = FingerprintMols.FingerprintMol(
        mol,
        minPath=1,
        maxPath=7,
        fpSize=N,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0,
        minSize=N,
        branchedPaths=True,
        useBondOrder=True,
        atomInvariants=[],
        fromAtoms=[],
        atomBits=[],
    )
    fps.append(fingerprint)
In [152]:
#alanine = 7604
# 6674 (<i>S</i>)-lactate
import numpy as np
from rdkit import DataStructs
# for i in range(9000):
# if 'lactate' in name[i]:
# print i,name[i]

# Dense float matrix with one row per fingerprint and one column per bit.
fp_mat = np.zeros((len(fps), N))
for i, f in enumerate(fps):
    # Convert the bit vector to its text form once per fingerprint. The
    # previous version repeated this conversion for every one of the N bits,
    # which made the cell quadratic in N for no benefit.
    bits = DataStructs.BitVectToText(f)
    # Each character is parsed with float(), exactly as before.
    fp_mat[i, :] = [float(bits[j]) for j in range(N)]
In [114]:
from minisom import MiniSom
from numpy import genfromtxt,array,linalg,zeros,mean,std,apply_along_axis

# Set up a self-organising map over the fingerprint matrix.
# This cell was adapted from the MiniSom Iris example; the earlier header text
# still claimed the data came from the Iris CSV, which is not what happens
# here: the training data is the fingerprint matrix `fp_mat`.
# pylab/matplotlib is required for the visualisation further down.
data = fp_mat
# data = apply_along_axis(lambda x: x/linalg.norm(x),1,data) # data normalization

### Initialization ###
# 100 x 100 map whose nodes have N-dimensional weights (one per fingerprint bit).
# NOTE(review): no random seed is supplied, so runs are presumably not
# reproducible -- confirm whether that matters for this analysis.
som = MiniSom(100,100,N,sigma=1.0,learning_rate=0.5)
# som.random_weights_init(data)
In [169]:
# Scratch arithmetic: 8*60*60 is the number of seconds in 8 hours, divided by
# 8.702/200 -- presumably a measured 8.702 s for 200 training iterations, which
# would make the result the number of iterations that fit in 8 hours.
# TODO confirm what the 8.702 and 200 figures refer to.
8*60*60/(8.702/200)
Out[169]:
In [ ]:
import time

# Train the map on `data` for 100000 iterations and report the elapsed
# wall-clock time in seconds.
print("Training...")
t0 = time.time()
som.train_batch(data, 100000)  # random training
t1 = time.time()
# Call-form print matches the other print(...) calls in this cell; with a
# single argument the printed text is unchanged.
print(t1 - t0)
print("\n...ready!")
In [171]:
print("Getting Coordinates")

# Look up the winning map node for each compound, walking the rows of `data`
# by the indices of `name` so both stay aligned.
winners = [som.winner(data[idx, :]) for idx in range(len(name))]

# Split each winner into its two grid coordinates, shifted by 0.5
# (presumably so markers sit in the middle of a grid cell when plotted).
x = [node[0] + 0.5 for node in winners]
y = [node[1] + 0.5 for node in winners]
In [174]:
# pyplot is never imported elsewhere in this notebook (%matplotlib inline does
# not bind `plt`), so import it here to keep the cell runnable on a fresh kernel.
import matplotlib.pyplot as plt

# Extent of the map grid, taken from the weight array instead of a hard-coded
# 50 that no longer matched the 100 x 100 map and cropped the figure.
n_x = som.weights.shape[0]
n_y = som.weights.shape[1]

plt.bone()
plt.pcolor(som.distance_map().T) # plotting the distance map as background
plt.scatter(x, y, c='r', alpha=0.33)  # one translucent marker per compound
plt.xlim(0, n_x)
plt.ylim(0, n_y)
# gca() returns the current Axes; a bare plt.axes() call is documented as
# adding an Axes, which is not what is wanted here.
plt.gca().set_aspect('equal')
f = plt.gcf()
f.set_size_inches(12, 12)
plt.show() # show the figure
In [173]:
# Re-read the input table and write each row back out with its map
# coordinates appended as two extra columns.
#
# `mols` and `name` are not rebuilt here: the loading cell already filled them
# from this same file, and re-parsing every InChI only cost time.
with open(file, 'rb') as src, open('50_50_100k_training_out.tab', 'w') as fid:
    reader = csv.DictReader(src, delimiter='\t')
    # Use the input file's own column order for both the header and the
    # values. Iterating row.keys() gave an arbitrary dict order that did not
    # have to match the hard-coded header line.
    columns = list(reader.fieldnames or [])
    fid.write('\t'.join(columns + ['x', 'y']) + '\n')
    for i, row in enumerate(reader):
        for k in columns:
            fid.write('%s\t' % row[k])
        # %d truncates the +0.5 offset added when x and y were built, so the
        # file holds integer grid indices.
        fid.write('%d\t%d\n' % (x[i], y[i]))
In [149]:
# uniform_data = np.random.rand(10, 12)
# ax = sns.heatmap(som.distance_map().T)
# ax = sns.kdeplot(np.asarray(x), np.asarray(y), n_levels=100)#, cmap="Purples_d")
In [147]:
# norm = plt.normalize(0, 10)
# hb1 = plt.hexbin(x, y, norm=norm)
# # hb2 = plt.hexbin(x2, y2, norm=norm)
In [ ]: