In [13]:
import pandas as pd

In [66]:
# Read the three PBE input tables in one pass. header=None tells pandas the
# files carry no header row, so columns get integer labels (0, 1, 2, ...).
x_pbe, y_pbe, z_pbe = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('X_pbe.csv', 'Y_pbe.csv', 'q_pbe.csv')
)

In [67]:
# Show (rows, columns) of the raw coordinate and charge tables, one per line.
print('\n'.join(str(table.shape) for table in (x_pbe, z_pbe)))


(47733, 29)
(47733, 15)

In [68]:
# Preview the first five raw rows before any columns are removed or renamed.
x_pbe.head(n=5)


Out[68]:
0 1 2 3 4 5 6 7 8 9 ... 19 20 21 22 23 24 25 26 27 28
0 C 1.350508 0.779024 0.663087 H 1.825709 1.257877 -0.189170 H 1.848891 ... 0.559417 C -1.832366 1.293790 -2.492349 N -1.370433 2.318893 -2.582710 NaN
1 C 1.366716 0.804290 0.682895 H 1.759809 1.361483 -0.178094 H 1.814160 ... 0.594701 C -1.830842 1.305028 -2.473094 N -1.354430 2.319260 -2.596931 NaN
2 C 1.438315 1.060388 0.832979 H 1.571861 1.944143 0.147314 H 1.577013 ... 0.736083 C -1.806108 1.415889 -2.311002 N -1.218195 2.315162 -2.715039 NaN
3 C 1.948153 3.622904 2.081111 H 2.049626 3.765616 1.000288 H 2.693285 ... 2.399583 C -0.414736 2.940226 -2.453864 N -0.876473 1.892105 -2.385047 NaN
4 C 2.649527 -0.695892 -3.065414 H 2.440014 -0.515189 -4.127145 H -0.446906 ... -2.347429 C 0.249546 -3.627168 -3.224564 N 0.433013 -4.760249 -3.264235 NaN

5 rows × 29 columns


In [69]:
# Remove column labels 0, 4, 8, ..., 28. In the raw preview these positions
# hold the element symbols (C/H/N) plus a trailing column that is NaN in the
# displayed rows; what remains are the numeric coordinate columns.
# The stop value 32 overshoots the last label (28), which is harmless.
rm_col = range(0, 32, 4)
x_pbe = x_pbe.drop(labels=rm_col, axis=1)
x_pbe.shape


Out[69]:
(47733, 21)

In [70]:
# Label the 21 remaining columns as <atom><axis>, one atom per line below.
x_pbe.columns = [
    'C1x', 'C1y', 'C1z',
    'H1x', 'H1y', 'H1z',
    'H2x', 'H2y', 'H2z',
    'H3x', 'H3y', 'H3z',
    'H4x', 'H4y', 'H4z',
    'C2x', 'C2y', 'C2z',
    'Nx', 'Ny', 'Nz',
]

In [71]:
# Give the single column of the PBE target table a descriptive name.
y_pbe.columns = ['energy_pbe']

In [72]:
# Even column labels (0, 2, ..., 14) to be removed from the charge table.
rm_col = range(0, 16, 2)

In [73]:
# Drop the labels listed in rm_col from the charge table, then preview
# the columns that remain.
z_pbe = z_pbe.drop(labels=rm_col, axis=1)
z_pbe.head(n=5)


Out[73]:
1 3 5 7 9 11 13
0 6.55438 0.86062 0.86105 0.86199 0.86194 6.18917 6.81086
1 6.54250 0.86282 0.86075 0.86827 0.86548 6.18986 6.81033
2 5.93576 0.71892 0.83033 0.69595 0.81927 6.66135 7.33841
3 6.54921 0.86531 0.85578 0.87014 0.85956 6.17456 6.82544
4 6.45450 0.84487 0.82359 0.84965 0.85046 6.04506 7.13187

In [74]:
# Label the seven remaining charge columns as <atom>q, using the same atom
# order as the coordinate columns.
z_pbe.columns = [
    'C1q',
    'H1q', 'H2q', 'H3q', 'H4q',
    'C2q',
    'Nq',
]

In [75]:
# Place coordinates, charges and energy side by side (column-wise concat,
# aligned on the row index), then preview the combined table.
xzy_pbe = pd.concat(objs=[x_pbe, z_pbe, y_pbe], axis=1)
xzy_pbe.head(n=5)


Out[75]:
C1x C1y C1z H1x H1y H1z H2x H2y H2z H3x ... Ny Nz C1q H1q H2q H3q H4q C2q Nq energy_pbe
0 1.350508 0.779024 0.663087 1.825709 1.257877 -0.189170 1.848891 1.089646 1.577782 0.303730 ... 2.318893 -2.582710 6.55438 0.86062 0.86105 0.86199 0.86194 6.18917 6.81086 -131.356978
1 1.366716 0.804290 0.682895 1.759809 1.361483 -0.178094 1.814160 1.125417 1.600315 0.283706 ... 2.319260 -2.596931 6.54250 0.86282 0.86075 0.86827 0.86548 6.18986 6.81033 -131.353254
2 1.438315 1.060388 0.832979 1.571861 1.944143 0.147314 1.577013 1.523847 1.780924 0.419598 ... 2.315162 -2.715039 5.93576 0.71892 0.83033 0.69595 0.81927 6.66135 7.33841 -131.317326
3 1.948153 3.622904 2.081111 2.049626 3.765616 1.000288 2.693285 2.923731 2.344320 2.305255 ... 1.892105 -2.385047 6.54921 0.86531 0.85578 0.87014 0.85956 6.17456 6.82544 -131.353906
4 2.649527 -0.695892 -3.065414 2.440014 -0.515189 -4.127145 -0.446906 -2.739519 -2.943782 3.260095 ... -4.760249 -3.264235 6.45450 0.84487 0.82359 0.84965 0.85046 6.04506 7.13187 -131.348082

5 rows × 29 columns


In [76]:
# Keep the first occurrence of each fully identical row. drop_duplicates
# returns a new frame by default, so xzy_pbe itself is left unchanged.
xzy_pbe_u = xzy_pbe.drop_duplicates()

In [77]:
# Compare row counts before and after de-duplication.
# print() calls replace the Python 2 print statements: they run on Python 3
# and match the print(...) style used in the other cells of this notebook.
print(xzy_pbe.shape)
print(xzy_pbe_u.shape)


(47733, 29)
(47717, 29)

In [78]:
# Read the two B3LYP input tables. header=None tells pandas the files carry
# no header row, so columns get integer labels.
x_b3lyp, y_b3lyp = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('X_b3lyp.csv', 'Y_b3lyp.csv')
)

In [79]:
# Show (rows, columns) of the raw B3LYP feature and target tables, one per line.
print('\n'.join(str(table.shape) for table in (x_b3lyp, y_b3lyp)))


(17756, 29)
(17756, 1)

In [80]:
# Remove column labels 0, 4, 8, ..., 28 from the B3LYP table, the same
# selection that was applied to the PBE table.
# The stop value 32 overshoots the last label (28), which is harmless.
rm_col = range(0, 32, 4)
x_b3lyp = x_b3lyp.drop(labels=rm_col, axis=1)
x_b3lyp.shape


Out[80]:
(17756, 21)

In [81]:
# Label the 21 remaining columns as <atom><axis>, one atom per line below.
# These names must stay identical to the PBE coordinate names, because the
# later merge joins the two tables on them.
x_b3lyp.columns = [
    'C1x', 'C1y', 'C1z',
    'H1x', 'H1y', 'H1z',
    'H2x', 'H2y', 'H2z',
    'H3x', 'H3y', 'H3z',
    'H4x', 'H4y', 'H4z',
    'C2x', 'C2y', 'C2z',
    'Nx', 'Ny', 'Nz',
]

In [82]:
# Give the single column of the B3LYP target table a descriptive name.
y_b3lyp.columns = ['energy_b3lyp']

In [83]:
# Place B3LYP coordinates and energy side by side (column-wise concat,
# aligned on the row index).
xy_b3lyp = pd.concat(objs=[x_b3lyp, y_b3lyp], axis=1)

In [84]:
# Preview the first five rows of the combined B3LYP table.
xy_b3lyp.head(n=5)


Out[84]:
C1x C1y C1z H1x H1y H1z H2x H2y H2z H3x ... H4x H4y H4z C2x C2y C2z Nx Ny Nz energy_b3lyp
0 1.350508 0.779024 0.663087 1.825709 1.257877 -0.189170 1.848891 1.089646 1.577782 0.303730 ... 1.423537 -0.300512 0.559417 -1.832366 1.293790 -2.492349 -1.370433 2.318893 -2.582710 -133.198709
1 1.366716 0.804290 0.682895 1.759809 1.361483 -0.178094 1.814160 1.125417 1.600315 0.283706 ... 1.449777 -0.299335 0.594701 -1.830842 1.305028 -2.473094 -1.354430 2.319260 -2.596931 -133.195259
2 -3.803611 -2.267309 0.764900 -3.796598 -3.150483 1.493531 -4.393698 -1.399793 1.007952 -4.217465 ... -2.655651 -0.866691 1.937266 -2.613919 0.179199 2.330572 -2.536911 1.241336 2.765962 -133.215182
3 -2.927686 0.153773 -2.512119 -2.469427 0.822793 -1.821670 -2.475899 -0.852233 -2.570246 -3.057747 ... -4.018957 0.011604 -1.970984 -3.879463 -0.205050 0.063446 -4.423705 -0.138430 1.097049 -133.191188
4 -3.450782 3.334512 4.114650 -4.447155 3.741390 4.341071 -4.552629 0.118057 4.181421 -3.188349 ... -2.697286 3.171041 4.908800 -4.440723 -0.970431 3.946023 -4.649342 -2.117911 3.776332 -133.232016

5 rows × 22 columns


In [87]:
# Keep the first occurrence of each fully identical row (a new frame is
# returned; xy_b3lyp is left unchanged), then print the shape before and
# after so the number of removed rows is visible.
xy_b3lyp_u = xy_b3lyp.drop_duplicates()
print('\n'.join(str(table.shape) for table in (xy_b3lyp, xy_b3lyp_u)))


(17756, 22)
(17751, 22)

In [94]:
# Join keys for the merge: every column except the last eight of the PBE
# table (the seven charge columns and energy_pbe), i.e. the coordinates.
jcols = xzy_pbe_u.columns[:-8].tolist()
print(jcols)


['C1x', 'C1y', 'C1z', 'H1x', 'H1y', 'H1z', 'H2x', 'H2y', 'H2z', 'H3x', 'H3y', 'H3z', 'H4x', 'H4y', 'H4z', 'C2x', 'C2y', 'C2z', 'Nx', 'Ny', 'Nz']

In [97]:
# Attach the PBE charges and energy to every de-duplicated B3LYP geometry.
#   * how='right' keeps all rows of xy_b3lyp_u; a B3LYP geometry with no
#     matching row in xzy_pbe_u gets NaN in the PBE columns.
#   * The join keys are float coordinates, so rows match only when all 21
#     values compare exactly equal in both tables.
#   * suffixes are given in (left, right) order. The left frame is PBE and
#     the right frame is B3LYP, hence ('_pbe', '_b3lyp'). They only affect
#     non-key columns that exist in both frames (none at present).
xy = xzy_pbe_u.merge(
    right=xy_b3lyp_u,
    how='right',
    on=jcols,
    suffixes=('_pbe', '_b3lyp'),
)
print(xy.shape)
xy.head()


(17751, 30)
Out[97]:
C1x C1y C1z H1x H1y H1z H2x H2y H2z H3x ... Nz C1q H1q H2q H3q H4q C2q Nq energy_pbe energy_b3lyp
0 1.350508 0.779024 0.663087 1.825709 1.257877 -0.189170 1.848891 1.089646 1.577782 0.303730 ... -2.582710 6.55438 0.86062 0.86105 0.86199 0.86194 6.18917 6.81086 -131.356978 -133.198709
1 1.366716 0.804290 0.682895 1.759809 1.361483 -0.178094 1.814160 1.125417 1.600315 0.283706 ... -2.596931 6.54250 0.86282 0.86075 0.86827 0.86548 6.18986 6.81033 -131.353254 -133.195259
2 1.948153 3.622904 2.081111 2.049626 3.765616 1.000288 2.693285 2.923731 2.344320 2.305255 ... -2.385047 6.54921 0.86531 0.85578 0.87014 0.85956 6.17456 6.82544 -131.353906 -133.191297
3 -4.295067 -3.402007 -0.431639 -3.743290 -3.885034 0.405351 -4.703227 -2.395147 -0.357631 -4.912492 ... 2.771718 6.46360 0.84451 0.84585 0.84602 0.82700 6.04259 7.13044 -131.361634 -133.232957
4 -4.329216 -3.413882 -0.419582 -3.788901 -3.958677 0.381595 -4.796726 -2.417300 -0.361777 -4.924910 ... 2.769733 6.46663 0.84255 0.84259 0.84821 0.82560 6.04077 7.13365 -131.359263 -133.231602

5 rows × 30 columns


In [98]:
print(list(xy.columns))


['C1x', 'C1y', 'C1z', 'H1x', 'H1y', 'H1z', 'H2x', 'H2y', 'H2z', 'H3x', 'H3y', 'H3z', 'H4x', 'H4y', 'H4z', 'C2x', 'C2y', 'C2z', 'Nx', 'Ny', 'Nz', 'C1q', 'H1q', 'H2q', 'H3q', 'H4q', 'C2q', 'Nq', 'energy_pbe', 'energy_b3lyp']

In [99]:
# Persist the merged table. index=True is the pandas default, spelled out
# here because it writes the row index as the first, unnamed CSV column.
xy.to_csv('pbe_b3lyp_partQ.csv', index=True)

In [ ]: