In [ ]:
# As before...
import numpy as np
import scipy as sp
import pandas as pd
nat = 195 # Number of atoms
a = 12.55 # Cell size
In [ ]:
xyz = pd.read_hdf('xyz.hdf5', 'xyz')
xyz.head()
This is arguably the most difficult part of this tutorial. How do we account for periodicity?
Lets start by considering free boundary conditions first!
Computing all atom to atom distances (per frame) increases factorially,
\begin{equation} \frac{nat!}{2!\left(nat - 2\right)!} \left(= \frac{1}{2}\left(nat * \left(nat - 1\right)\right)\right) \end{equation}in computations (where nat is the number of atoms). Fortunately for us, computing the distances can be passed off to scipy's pdist.
CODING TIME: Write a function to compute all of the atom to atom distances in every frame assuming free boundary conditions
In [ ]:
from scipy.spatial.distance import pdist
def skeleton_free_boundary_distances(frame): # Note that this is frame, not DataFrame
'''
Compute all of the atom to atom distances with free boundary conditions
'''
# Compute distances
xyz = frame.loc[:, ['x', 'y', 'z']]
distances = pdist(xyz)
# Compute the symbols
# ...
symbols = None
return pd.DataFrame.from_dict({'distances': distances, 'symbols': symbols})
In [ ]:
twobody = xyz.groupby(level='frame').apply(skeleton_free_boundary_distances)
twobody.head()
Distances are no good to us, unless we know where they came from (or at least what two symbols they represent)...
HINT: Check out the "combinations" function in the itertools library (part of the Python standard library)
In [ ]:
from itertools import combinations
In [ ]:
%load -s free_boundary_distances, snippets/distances.py
In [ ]:
twobody = xyz.groupby(level='frame').apply(free_boundary_distances)
twobody.head()
In [ ]:
twobody.loc[0].head()
In [ ]:
first = xyz.loc[0, ['x', 'y', 'z']].values
for i in range(1, 6):
print(((first[0, :] - first[i, :])**2).sum()**0.5)
That was fun but it doesn't do what we need it to! Periodic boundaries can be handled a number of ways, here is one algorithm:
Since this is complicated, we will walk through the pieces of the code individually (applying them to a single frame) before putting it all together.
In [ ]:
frame = xyz.loc[0]
CODING TIME: Put all atoms back into the unit cell
In [ ]:
def skeleton_create_unit(df, a):
'''
Put all atoms back into the cubic unit cell
'''
#...
In [ ]:
unit_frame = skeleton_create_unit(frame, a) # a is defined above
unit_frame
The % (modulo) operator is nice for such tasks...
In [ ]:
%load -s create_unit, snippets/distances.py
In [ ]:
unit_frame = create_unit(frame, a)
unit_frame.shape
In [ ]:
print(unit_frame is not frame) # True if objects are not the same in memory
print(np.all(unit_frame.values == frame.values)) # True if objects' xyz positions are identical
CODING TIME: Generate the 3x3x3 superframe from the unit_frame
In [ ]:
def skeleton_superframe(frame, a):
'''
Generate a 3x3x3 supercell of a given frame.
'''
v = [-1, 0, 1]
n = len(frame)
unit = frame.loc[:, ['x', 'y', 'z']].values
coords = np.empty((n * 27, 3))
h = 0
for i in v:
for j in v:
for k in v:
#for ...
return coords
In [ ]:
big_frame = skeleton_superframe(unit_frame, a)
big_frame.shape
One solution...or if you want to have some fun...
In [ ]:
%load -s superframe, snippets/distances.py
In [ ]:
big_frame = superframe(unit_frame, a)
big_frame
For a single frame's supercell (to start with) we need to compute the distances from the central frame.
Let's use a nice feature of scipy/scikit-learn (and of course the mathematicians that developed it): the KDTree
See also: wiki
We are going to use the Cythonized version of the KDTree implementation.
In [ ]:
from scipy.spatial import cKDTree
In [ ]:
kd = cKDTree(big_frame)
k = 194 # Number of distances to compute
distances, indices = kd.query(unit_frame.loc[:, ['x', 'y', 'z']], k=k)
distances.shape
In [ ]:
indices.shape
We have the distances but we need to shape them into a DataFrame and figure out what symbol pair each distance belongs too (that last part is critical for the third task).
The first column in the indices are the indices of the source atom from which we are looking.
The rest of the columns contain the indices of the paired atom to which we are computing the distances.
We map superframe indices back onto the unit_frame indices:
In [ ]:
def map_x_to_y(x, y):
'''
Using the indices in x, generate an array of the same
length populated by values from y.
'''
mapped = np.empty((len(x), ), dtype=np.int)
for i, index in enumerate(x):
mapped[i] = y[index]
return mapped
In [ ]:
unit_frame_indices = unit_frame.index.get_level_values('atom').tolist() * 27
repeated_source = np.repeat(indices[:, 0], k)
atom1_indices = pd.Series(map_x_to_y(repeated_source, unit_frame_indices))
atom2_indices = pd.Series(map_x_to_y(indices.flatten(), unit_frame_indices))
Now let's convert these Series (a pandas Series is simply a column in a DataFrame) to symbols using the map function.
In [ ]:
symbols = unit_frame['symbol'].to_dict()
In [ ]:
atom1_symbols = atom1_indices.map(symbols)
atom2_symbols = atom2_indices.map(symbols)
In [ ]:
symbols = [''.join((first, atom2_symbols[i])) for i, first in enumerate(atom1_symbols)]
Now let's finish this by generating our (periodic) two body DataFrame for this first frame..
In [ ]:
frame_twobody = pd.DataFrame.from_dict({'distance': distances.flatten(),
'symbols': symbols})
frame_twobody = frame_twobody.loc[(frame_twobody['distance'] > 0.3) & (frame_twobody['distance'] < 8.3)]
frame_twobody.head()
We should probably do a check here, but in the interest of time, and because, with our current implementation this is not trivial, let's just skip this...
In [ ]:
from scipy.spatial import cKDTree
from snippets.distances import superframe_numba, map_x_to_y_numba, create_unit
def cubic_periodic_distances(xyz, a, nat, k=None):
'''
Computes atom to atom distances for a periodic cubic cell.
Args:
xyz: Properly indexed pandas DataFrame
a: Cubic cell dimension
Returns:
twobody: DataFrame of distances
'''
k = nat - 1 if k is None else k
# Since the unit cell size doesn't change between frames,
# let's put all of the atoms (in every frame) back in the
# unit cell at the same time.
unit_xyz = create_unit(xyz, a)
# Now we will define another function which will do the
# steps we outlined above (see below) and apply this
# function to every frame of the unit_xyz
twobody = unit_xyz.groupby(level='frame').apply(_compute, k=k) # <== This is the meat and potatoes
# Filter the meaningful distances
# ... # <== EDIT THIS LINE!
# Pair the symbols
twobody.loc[:, 'symbols'] = twobody['atom1'] + twobody['atom2']
# Name the indices
twobody.index.names = ['frame', 'two']
return twobody
In [ ]:
#%load -s cubic_periodic_distances, snippets/distances.py
What should the function _compute look like/do?
NOT QUITE CODING TIME: Just load the function
In [ ]:
%load -s _compute, snippets/distances.py
In [ ]:
# WARNING: On a fast machine, this operation takes ~10 s
%time twobody = cubic_periodic_distances(xyz, a, nat)
twobody.head()
What can we do with twobody data??
In [ ]:
twobody.loc[((twobody.symbols == 'HO') | (twobody.symbols == 'OH')) & (twobody.distance < 1.2), 'distance'].describe()
In [ ]:
twobody.loc[((twobody.symbols == 'HO') | (twobody.symbols == 'OH')) & (twobody.distance < 1.2), 'distance'].median()
In [ ]:
# WARNING: On a fast machine, this operation takes ~10 s
store = pd.HDFStore('twobody.hdf5', mode='w')
%time store.put('twobody', twobody)
store.close()
Again, though there are a bunch of improvements/features we could make, let's move on...
Python has a way to optimize big loops.
This is for learning and fun, remember the first rule of optimization: optimize the slowest step first!
numba is a beautiful and powerful way to "just-in-time" compile python code into native machine code...
In [ ]:
from numba import jit, float64
In [ ]:
%load -s superframe, snippets/distances.py
In [ ]:
%load -s superframe_numba, snippets/distances.py
In [ ]:
n = 100
%timeit -n $n superframe(unit_frame, a)
%timeit -n $n superframe_numba(unit_frame.loc[:, ['x', 'y', 'z']].values, a)
~10x speedup (for 1ish lines of code)