In [150]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg' 
# from rdkit.Chem import Draw
# from rdkit.Chem import rdMolDescriptors
import sys
# NOTE(review): hard-coded, site-specific Python 2.7 site-packages directory.
# Presumably this is where the third-party packages imported below live on
# this host -- confirm, and adjust when running the notebook elsewhere.
sys.path.append('/global/project/projectdirs/openmsi/jupyterhub_libs/anaconda/lib/python2.7/site-packages')

from rdkit import Chem
# from rdkit.Chem import AllChem
# Tab-separated input table; the 'InChI' and 'Name' columns are read below.
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
# The name `file` is kept because a later cell re-opens the same path through it.
file = '/global/homes/b/bpb/notebooks/meta-iq/midas_lbl/MetaCyc.mdb'
import csv

# Parallel lists: mols[i] and name[i] always come from the same input row.
mols = []
name = []

# 'rb' is the Python 2 convention for csv input. The with-block guarantees the
# handle is closed once every row has been consumed (or if parsing raises).
with open(file, 'rb') as handle:
    reader = csv.DictReader(handle, delimiter='\t')
    for row in reader:
        # NOTE(review): the MolFromInchi result is appended unchecked so the
        # two lists stay index-aligned; confirm how unparsable InChI strings
        # are represented before relying on every entry being a molecule.
        mols.append(Chem.MolFromInchi(row['InChI']))
        name.append(row['Name'])

In [138]:
# Show the column names of the input table. This uses `row`, which is still
# bound to the last record read by the loading loop.
print(list(row))


['InChI', 'Identifier', 'Name', 'Links']

In [151]:
from rdkit.Chem.Fingerprints import FingerprintMols
"""
- mol: the molecule to use

- minPath: (optional) minimum number of bonds to include in the subgraphs
  Defaults to 1.

- maxPath: (optional) maximum number of bonds to include in the subgraphs
  Defaults to 7.

- fpSize: (optional) number of bits in the fingerprint
  Defaults to 2048.

- nBitsPerHash: (optional) number of bits to set per path
  Defaults to 2.

- useHs: (optional) include paths involving Hs in the fingerprint if the molecule
  has explicit Hs.
  Defaults to True.

- tgtDensity: (optional) fold the fingerprint until this minimum density has
  been reached
  Defaults to 0.

- minSize: (optional) the minimum size the fingerprint will be folded to when
  trying to reach tgtDensity
  Defaults to 128.

- branchedPaths: (optional) if set both branched and unbranched paths will be
  used in the fingerprint.
  Defaults to True.

- useBondOrder: (optional) if set both bond orders will be used in the path hashes
  Defaults to True.

- atomInvariants: (optional) a sequence of atom invariants to use in the path hashes
  Defaults to empty.

- fromAtoms: (optional) a sequence of atom indices. If provided, only paths/subgraphs 
  starting from these atoms will be used.
  Defaults to empty.

- atomBits: (optional) an empty list. If provided, the result will contain a list 
  containing the bits each atom sets.
  Defaults to empty."""
# Fingerprint length in bits; also used as the number of columns of the
# feature matrix and as the SOM input length further down.
N = 1024

# One fingerprint per molecule, in the same order as `mols`.
fps = []
for mol in mols:
    # The options are rebuilt for every molecule so that each call receives
    # its own fresh list objects for the sequence-valued arguments.
    fp_options = dict(
        minPath=1,
        maxPath=7,
        fpSize=N,
        bitsPerHash=2,
        useHs=True,
        tgtDensity=0,
        minSize=N,
        branchedPaths=True,
        useBondOrder=True,
        atomInvariants=[],
        fromAtoms=[],
        atomBits=[],
    )
    fps.append(FingerprintMols.FingerprintMol(mol, **fp_options))

In [152]:
#alanine = 7604
# 6674 (<i>S</i>)-lactate
import numpy as np
from rdkit import DataStructs
# for i in range(9000):
#     if 'lactate' in name[i]:
#         print i,name[i]
# Dense float matrix of fingerprint bits: one row per fingerprint in `fps`,
# one column per bit position (N columns).
fp_mat = np.zeros((len(fps), N))
for i, f in enumerate(fps):
    # Convert the bit vector to its '0'/'1' text form once per fingerprint;
    # doing the conversion inside the per-bit loop repeats the same work N times.
    bits = DataStructs.BitVectToText(f)
    # Only the first N characters are used, one per matrix column.
    fp_mat[i, :] = [float(bit) for bit in bits[:N]]

In [114]:
from minisom import MiniSom
from numpy import genfromtxt,array,linalg,zeros,mean,std,apply_along_axis

"""
    This script shows how to use MiniSom on the Iris dataset.
    In particular it shows how to train MiniSom and how to visualize the result.
    ATTENTION: pylab is required for the visualization.        
"""

# Training input for the SOM: the fingerprint bit matrix built earlier
# (one row per fingerprint, N columns).
data = fp_mat
# data = apply_along_axis(lambda x: x/linalg.norm(x),1,data) # data normalization

### Initialization and training ###
# NOTE(review): the grid is 100 x 100 here, while the plot limits (0..50) and
# the output file name ('50_50_...') used later suggest a 50 x 50 map --
# confirm which size the saved results were produced with.
grid_shape = (100, 100)
som = MiniSom(grid_shape[0], grid_shape[1], N, sigma=1.0, learning_rate=0.5)
# som.random_weights_init(data)

In [169]:
# Scratch estimate. Presumably: how many training iterations fit into
# 8 hours if 200 iterations took 8.702 seconds -- TODO confirm.
total_seconds = 8 * 60 * 60
seconds_per_unit = 8.702 / 200
total_seconds / seconds_per_unit


Out[169]:
661916.8007354632

In [ ]:
import time

# Train the SOM on the fingerprint matrix and report the wall-clock duration
# in seconds.
print("Training...")
t0 = time.time()
# NOTE(review): this was labelled "random training" but calls train_batch;
# confirm which training mode is intended.
som.train_batch(data, 100000)
t1 = time.time()
elapsed = t1 - t0
print(elapsed)

print("\n...ready!")

In [171]:
print("Getting Coordinates")
# Winning SOM node for every row of `data`, in the same order as `name`.
winners = [som.winner(data[idx, :]) for idx in range(len(name))]
# Shift by 0.5 so each point is drawn at the centre of its grid cell.
x = [w[0] + 0.5 for w in winners]
y = [w[1] + 0.5 for w in winners]


Getting Coordinates

In [174]:
# pyplot is imported here because no other cell of this notebook binds `plt`;
# without it this cell fails with NameError on a fresh kernel.
import matplotlib.pyplot as plt

plt.bone()
plt.pcolor(som.distance_map().T) # plotting the distance map as background

plt.axis([0,som.weights.shape[0],0,som.weights.shape[1]])

# One translucent red marker per molecule at its winning node.
plt.scatter(x,y,c='r',alpha=0.33)
# NOTE(review): these limits override the full-map limits set by plt.axis()
# above and only show the 0..50 corner; confirm this matches the SOM size.
plt.xlim(0,50)
plt.ylim(0,50)
# Operate on the axes that were just drawn on. plt.axes() without arguments
# can create a new, empty Axes instead of returning the current one.
plt.gca().set_aspect('equal')

f = plt.gcf()
f.set_size_inches(12, 12)
plt.show() # show the figure



In [173]:
# Re-read the input table and write it back out with the SOM coordinates of
# each row appended as two extra columns (x, y).
# Explicit column order: the data columns are written in exactly the order the
# header announces, rather than in whatever order the row dict yields its keys.
columns = ['InChI', 'Identifier', 'Name', 'Links']

# The parallel lists are rebuilt from the same file, as before.
mols = []
name = []

# NOTE(review): the output name says '50_50_100k'; confirm it matches the SOM
# size and iteration count actually used.
# Both handles are closed by the with-block, also when a row raises.
with open(file, 'rb') as src, open('50_50_100k_training_out.tab', 'w') as fid:
    reader = csv.DictReader(src, delimiter='\t')
    fid.write('\t'.join(columns + ['x', 'y']) + '\n')
    for i, row in enumerate(reader):
        mols.append(Chem.MolFromInchi(row['InChI']))
        name.append(row['Name'])
        # Every data field is followed by a tab, then the coordinates.
        fid.write(''.join('%s\t' % row[col] for col in columns))
        # %d truncates the +0.5 cell-centre offset, giving integer grid indices.
        fid.write('%d\t%d\n' % (x[i], y[i]))

In [149]:
# uniform_data = np.random.rand(10, 12)
# ax = sns.heatmap(som.distance_map().T)

# ax = sns.kdeplot(np.asarray(x), np.asarray(y), n_levels=100)#, cmap="Purples_d")

In [147]:
# norm = plt.normalize(0, 10)
# hb1 = plt.hexbin(x, y, norm=norm)
# # hb2 = plt.hexbin(x2, y2, norm=norm)

In [ ]: