Jim's MetaD convergence script, also shows fast file readin using streaming vs slow file readin vs. np.getfromtext



In [ ]:

    
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from matplotlib.patches import Rectangle



In [ ]:

    
# define all variables for convergence script
# these will pass to the bash magic below used to call plumed sum_hills 
dir="MetaD_converge" #where the intermediate fes will be stored
hills="MetaD/HILLS"  #your HILLS file from the simulation
finalfes='MetaD/fes.dat' #the final fes.dat file 
stride=1000
kT=8.314e-3*300 #throughout we convert to kcal, but the HILLS are assumed to be in GROMACS units (kJ)


## here is where you set the boxes to define convergence regions
C1=[-1.5,1.0] #center of box 1
C2=[1.0,-.5]
edge1=1.0 #edge of box1 
edge2=1.0



In [ ]:

    
%%bash -s "$dir" "$hills" "$stride" "$kT"
# calling sum hills and output to devnul 
HILLSFILE=HILLS
rm -rf $1
mkdir $1
cp $2 $1 
cd $1 
plumed sum_hills --hills $HILLSFILE --kt $4 --stride $3  >& /dev/null

Graph the final FES and plot the two squares on top of it



In [ ]:

    
%matplotlib inline

#read the data in from a text file 
fesdata = np.genfromtxt(finalfes,comments='#');

fesdata = fesdata[:,0:3]
#what was your grid size? this calculates it 
dim=int(np.sqrt(np.size(fesdata)/3))

#some post-processing to be compatible with contourf 
X=np.reshape(fesdata[:,0],[dim,dim],order="F")  #order F was 20% faster than A/C
Y=np.reshape(fesdata[:,1],[dim,dim],order="F") 
Z=np.reshape((fesdata[:,2]-np.min(fesdata[:,2]))/4.184,[dim,dim],order="F")  #convert to kcal/mol

#what spacing do you want?  assume units are in kJ/mol
spacer=1  #this means 1kcal/mol spacing
lines=20
levels=np.linspace(0,lines*spacer,num=(lines+1),endpoint=True)

fig=plt.figure(figsize=(8,6)) 
axes = fig.add_subplot(111)

xlabel='$\Phi$'
ylabel='$\Psi$'
plt.contourf(X, Y, Z, levels, cmap=plt.cm.bone,)
plt.colorbar()
axes.set_xlabel(xlabel, fontsize=20)
axes.set_ylabel(ylabel, fontsize=20)

currentAxis = plt.gca()
currentAxis.add_patch(Rectangle((C1[0]-edge1/2, C1[1]-edge1/2), edge1, edge1,facecolor='none',edgecolor='yellow',linewidth='3'))
currentAxis.add_patch(Rectangle((C2[0]-edge2/2, C2[1]-edge2/2), edge2, edge2,facecolor='none',edgecolor='yellow',linewidth='3'))

plt.show()

The two functions below calculate the average free energy of a region by integrating over whichever boxes you defined above. Since the FES is discrete and points are equally spaced, this is trivially taken as a summation:

$F_A = \sum_A exp\left(-F_{Ai}/k_BT\right) $

Don't forget that this is formally a free-energy plus some trivial constant but that the constant is equal for both regions $A$ and $B$ so that you will obtain the same free-energy difference irrespective of the reference point.

On the other hand, it doesn't make much sense to just use the arbitrary nubmers coming from sum_hills, which are related only to the amount of aggregate bias produced in your simulation. This is why we reference the lowest point to zero on the contour plots.

I left both functions in as a teaching tool to show how slow np.genfromtext is



In [ ]:

    
def diffNP(file):
    #read the data in from a text file 
    # note - this is very slow 
    fesdata = np.genfromtxt(file,comments='#');
    A=0.0
    B=0.0

    dim=np.shape(fesdata)[0]
    for i in range(0, dim):
        x=fesdata[i][0]
        y=fesdata[i][1]
        z=fesdata[i][2]
        if  x < C1[0]+edge1/2 and x > C1[0]-edge1/2 and y > C1[1]-edge1/2 and y < C1[1]+edge1/2:
           A+=np.exp(-z/kT)
        if  x < C2[0]+edge2/2 and x > C2[0]-edge2/2 and y > C2[1]-edge2/2 and y < C2[1]+edge2/2:
           B+=np.exp(-z/kT)

    A=-kT*np.log(A)
    B=-kT*np.log(B)
    diff=(A-B)/4.184 #output in kcal
    return diff



In [ ]:

    
def diff(file):

    kT=8.314e-3*300
    A=0.0
    B=0.0
    f = open(file, 'r')
    for line in f:
        if line[:1] != '#':
            line=line.strip()
            if line:
                columns = line.split()
                x=float(columns[0])
                y=float(columns[1])
                z=float(columns[2])
                if  x < C1[0]+edge1/2 and x > C1[0]-edge1/2 and y > C1[1]-edge1/2 and y < C1[1]+edge1/2:
                    A+=np.exp(-z/kT)
                if  x < C2[0]+edge2/2 and x > C2[0]-edge2/2 and y > C2[1]-edge2/2 and y < C2[1]+edge2/2:
                    B+=np.exp(-z/kT)

    f.close

    A=-kT*np.log(A)
    B=-kT*np.log(B)
    diff=(A-B)/4.184
    return diff



In [ ]:

    
diffvec=None
rootdir = '/Users/jpfaendt/Learning/Python/ALA2_MetaD/MetaD_converge'
i=0
diffvec=np.zeros((1,2))

#the variable func defines which function you are going to call to read in your data files fes_*.dat
#func=diffNP uses the numpy read in (SLOW)
#func=diff streams in data from a text file
#to experience the differnece , uncomment out the print statements and run each way

func=diff
for infile in glob.glob( os.path.join(rootdir, 'fes_?.dat') ):
    if i >= i:
            diffvec.resize((i+1,2))
    #print "current file is: " + infile
    diffvec[i][0]=i*1.0
    diffvec[i][1]=func(infile)
    i+=1

for infile in glob.glob( os.path.join(rootdir, 'fes_??.dat') ):
    if i >= i:
            diffvec.resize((i+1,2))
    #print "current file is: " + infile
    diffvec[i][0]=i*1.0
    diffvec[i][1]=func(infile)
    i+=1

for infile in glob.glob( os.path.join(rootdir, 'fes_???.dat') ):
    if i >= i:
            diffvec.resize((i+1,2))
    #print "current file is: " + infile
    diffvec[i][0]=i*1.0
    diffvec[i][1]=func(infile)
    i+=1
    
fig = plt.figure(figsize=(6,6)) 
axes = fig.add_subplot(111)
xlabel='time (generic)'
ylabel='diff (A-B) (kcal/mol)'
axes.plot(diffvec[:,0],diffvec[:,1])
axes.set_xlabel(xlabel, fontsize=20)
axes.set_ylabel(ylabel, fontsize=20)

plt.show()

Below this is all testing of different read-in options:



In [ ]:

    
## 
#read the data in from a text file using genfrom txt
fesdata = np.genfromtxt('MetaD_converge/fes_1.dat',comments='#');
kT=8.314e-3*300
A=0.0
B=0.0
                   
dim=np.shape(fesdata)[0]
for i in range(0, dim):
    x=fesdata[i][0]
    y=fesdata[i][1]
    z=fesdata[i][2]
    if  x < C1[0]+edge1/2 and x > C1[0]-edge1/2 and y > C1[1]-edge1/2 and y < C1[1]+edge1/2:
       A+=np.exp(-z/kT)
    if  x < C2[0]+edge2/2 and x > C2[0]-edge2/2 and y > C2[1]-edge2/2 and y < C2[1]+edge2/2:
       B+=np.exp(-z/kT)
   
A=-kT*np.log(A)
B=-kT*np.log(B)
diff=(A-B)/4.184
diff



In [ ]:

    
## 
#read the data in from a text file using read in commands
kT=8.314e-3*300
A=0.0
B=0.0


f = open('MetaD_converge/fes_1.dat', 'r')
for line in f:
    if line[:1] != '#':
        line=line.strip()
        if line:
            columns = line.split()
            x=float(columns[0])
            y=float(columns[1])
            z=float(columns[2])
            if  x < C1[0]+edge1/2 and x > C1[0]-edge1/2 and y > C1[1]-edge1/2 and y < C1[1]+edge1/2:
                A+=np.exp(-z/kT)
            if  x < C2[0]+edge2/2 and x > C2[0]-edge2/2 and y > C2[1]-edge2/2 and y < C2[1]+edge2/2:
                B+=np.exp(-z/kT)

f.close
            

A=-kT*np.log(A)
B=-kT*np.log(B)
diff=(A-B)/4.184
diff

Profiling speed of different read in options:



In [ ]:

    
file='MetaD/fes.dat'



In [ ]:

    
%timeit diffNP(file)



In [ ]:

    
%timeit diff(file)



In [ ]: