In [3]:
import os

from mdf_forge import Forge
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Helper Functions


In [2]:
import collections

def flatten(d, parent_key='', sep='_'):
    """Flatten a nested mapping into a single-level dict.

    Nested mappings are walked recursively; each leaf value is stored under
    a key formed by joining the keys along its path with ``sep``.

    Args:
        d: Mapping to flatten. Values that are mutable mappings are
            descended into; every other value is treated as a leaf.
        parent_key: Prefix prepended (followed by ``sep``) to every key.
            An empty prefix leaves top-level keys unchanged.
        sep: Separator placed between path components.

    Returns:
        dict: A new flat dictionary. If two paths yield the same joined
        key, the one encountered later wins.
    """
    # The ABC aliases on the top-level ``collections`` module were removed in
    # Python 3.10, so ``collections.MutableMapping`` raises AttributeError
    # there. ``collections.abc`` works on every Python 3 version.
    from collections.abc import MutableMapping

    flat = {}
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            flat.update(flatten(v, new_key, sep=sep))
        else:
            flat[new_key] = v
    return flat

def shape_data(res):
    """Turn a list of search records into a flat DataFrame, one row each.

    For every record, the entries under ``projects -> verde`` and the
    entries under ``molecule`` are merged into one dict (molecule values
    take precedence on a key clash), flattened with ``flatten``, and used
    as a row.

    Args:
        res: Iterable of records, each providing ``['projects']['verde']``
            and ``['molecule']``.

    Returns:
        pandas.DataFrame: One row per record.
    """
    def _row(record):
        # Copy first so the source record is never mutated.
        merged = dict(record['projects']['verde'])
        merged.update(record['molecule'])
        return flatten(merged)

    return pd.DataFrame([_row(record) for record in res])

Instantiate Forge Client


In [9]:
# Create the Forge client with all default arguments. With the default
# anonymous=False, Forge requires authentication with Globus Auth.
mdf = Forge()

In [10]:
# IPython help syntax: show the signature and docstring of Forge.
?Forge


Init signature: Forge(index='mdf', local_ep=None, anonymous=False, clear_old_tokens=False, **kwargs)
Docstring:     
Forge fetches metadata and files from the Materials Data Facility.
Forge is intended to be the best way to access MDF data for all users.
An internal Query object is used to make queries. From the user's perspective,
an instantiation of Forge will black-box searching.
Init docstring:
Create an MDF Forge Client.

Arguments:
    index (str): The Search index to search on. **Default:** ``"mdf"``.
    local_ep (str): The endpoint ID of the local Globus Connect Personal endpoint.
            If needed but not provided, the local endpoint will be autodetected
            if possible.
    anonymous (bool): If ``True``, will not authenticate with Globus Auth.
            If ``False``, will require authentication.
            **Default:** ``False``.

            Caution:
                Authentication is required for some Forge functionality,
                including viewing private datasets and using Globus Transfer.

    clear_old_tokens (bool): If ``True``, will force reauthentication.
            If ``False``, will use existing tokens if possible.
            Has no effect if ``anonymous`` is ``True``.
            **Default:** ``False``.

Keyword Arguments:
    services (list of str): *Advanced users only.* The services to authenticate with,
            using Toolbox. An empty list will disable authenticating with Toolbox.
            Note that even overwriting clients (with other keyword arguments)
            does not stop Toolbox authentication. Only a blank ``services`` argument
            will disable Toolbox authentication.
    search_client (globus_sdk.SearchClient): An authenticated SearchClient
            to overwrite the default.
    transfer_client (globus_sdk.TransferClient): An authenticated TransferClient
            to override the default.
    data_mdf_authorizer (globus_sdk.GlobusAuthorizer): An authenticated GlobusAuthorizer
            to overwrite the default for accessing the MDF NCSA endpoint.
    petrel_authorizer (globus_sdk.GlobusAuthorizer): An authenticated GlobusAuthorizer
            to override the default.
File:           ~/miniconda3/envs/rdkit/lib/python3.6/site-packages/mdf_forge/forge.py
Type:           type

Match All VERDE Records


In [11]:
# Build the query as one chain and run it: resource type "record",
# restricted to the "abrehabiruk_virtual_db" source.
res = (
    mdf.match_resource_types("record")
    .match_source_names("abrehabiruk_virtual_db")
    .search()
)

Inspect the Metadata for One Record


In [12]:
# Display the complete metadata of the first returned record.
res[0]


Out[12]:
{'files': [{'data_type': 'ASCII text',
   'filename': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N_cat-rad_vac_freq.log',
   'globus': 'globus://82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_cat-rad_vac_freq.log',
   'length': 62551,
   'mime_type': 'text/plain',
   'sha512': '412cb4474716f7c2364e3b88f9605c8c04c9d942ff26ca14e98443b9e56e383dbe15fa4aa800ed650c89d8ce4eddd129a370f1a9bd35b07300a21e0da21a35fb',
   'url': 'https://data.materialsdatafacility.org/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_cat-rad_vac_freq.log'},
  {'data_type': 'ASCII text',
   'filename': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N_S1_solv_freq.log',
   'globus': 'globus://82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_S1_solv_freq.log',
   'length': 84937,
   'mime_type': 'text/plain',
   'sha512': '6a786868ff2a6671683b7bcc97b26c82c654772258ea7150d33a238ba26363cab1267c25a77d2932931876bcd3c1d90f2ce5578f138d978b33e2eb2a36c9f1f5',
   'url': 'https://data.materialsdatafacility.org/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_S1_solv_freq.log'},
  {'data_type': 'ASCII text',
   'filename': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N_S0_solv_freq.log',
   'globus': 'globus://82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_S0_solv_freq.log',
   'length': 57818,
   'mime_type': 'text/plain',
   'sha512': 'f15f9abea04bf3db986280c40c0bdab6cc981859d0205757fed1a21f173e1ea89eeacd61f9232e1777d16c91704dd812a9cbd6aeb5d5bd808c2382b9e2cbfad2',
   'url': 'https://data.materialsdatafacility.org/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_S0_solv_freq.log'},
  {'data_type': 'ASCII text',
   'filename': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N_S0_vac_freq.log',
   'globus': 'globus://82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_S0_vac_freq.log',
   'length': 54120,
   'mime_type': 'text/plain',
   'sha512': '15e7feb5e0c18eec8ffb8ef77bddc2298b909f60abab34275a1d50d5f9b72c3bcc91befe709341ff3d277dc7aeeae0f83748772c29621db90fbeb3df37565524',
   'url': 'https://data.materialsdatafacility.org/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_S0_vac_freq.log'},
  {'data_type': 'ASCII text',
   'filename': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N_cat-rad_solv_freq.log',
   'globus': 'globus://82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_cat-rad_solv_freq.log',
   'length': 66305,
   'mime_type': 'text/plain',
   'sha512': 'f2c81eac36141f86b890bcab54a3f7aa1a0d640e7955acdc8b0f43dd2710d2cf457e1f89c4938b079391e201442cd3ae9dfe65f46890416d4c68c9da7d7504b2',
   'url': 'https://data.materialsdatafacility.org/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_cat-rad_solv_freq.log'},
  {'data_type': 'ASCII text',
   'filename': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N.json',
   'globus': 'globus://82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N.json',
   'length': 579,
   'mime_type': 'text/plain',
   'sha512': '4adb1c347069d975c914eaa42ffd4c1b6a7474903c34d6e0b06061aa883d522724d064566635e8d573c36d6b4942270ba83ed951970490601a71c8f209b5f5ff',
   'url': 'https://data.materialsdatafacility.org/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N.json'},
  {'data_type': 'ASCII text',
   'filename': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N_T1_solv_freq.log',
   'globus': 'globus://82f1b5c6-6e9b-11e5-ba47-22000b92c6ec/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_T1_solv_freq.log',
   'length': 66369,
   'mime_type': 'text/plain',
   'sha512': '2b6c1ead705f5c6fa6f44605185c8c82a3826ae0ba60c77933351366e43e7e9c974a8002cd0c8da7092d1fd4a1d54ec72f8fce142aeb5618cb393a2c154d07e4',
   'url': 'https://data.materialsdatafacility.org/verde/abrehabiruk_virtual_db_v1.1/verde_calcs.tar/GKVDXUXIAHWQIK-UHFFFAOYSA-N/GKVDXUXIAHWQIK-UHFFFAOYSA-N_T1_solv_freq.log'}],
 'mdf': {'ingest_date': '2019-09-13T18:15:15.069466Z',
  'organizations': ['Virtual Excited State Reference for the Discovery of Electronic Materials Database'],
  'resource_type': 'record',
  'scroll_id': 1,
  'source_id': 'abrehabiruk_virtual_db_v1.1',
  'source_name': 'abrehabiruk_virtual_db',
  'version': 1},
 'molecule': {'homo': -8.165865,
  'inchi_key': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N',
  'lumo': -1.967927,
  'smiles': 'N1=NC1'},
 'projects': {'verde': {'0_0': {'S1': 3.53, 'T1': 3.01},
   'dipole_moment': {'S0': 2.2142, 'S1': 2.2142, 'T1': 3.3892},
   'ionization_potential': 10.57,
   'redox_potential': {'S0': 3.38, 'S1': -0.15, 'T1': 0.37},
   'vertical_excitation_energy': 3.51}}}

VERDE-specific Metadata


In [13]:
# Display only the VERDE project section of the first record.
res[0]['projects']['verde']


Out[13]:
{'0_0': {'S1': 3.53, 'T1': 3.01},
 'dipole_moment': {'S0': 2.2142, 'S1': 2.2142, 'T1': 3.3892},
 'ionization_potential': 10.57,
 'redox_potential': {'S0': 3.38, 'S1': -0.15, 'T1': 0.37},
 'vertical_excitation_energy': 3.51}

Information on the Molecule


In [14]:
# Display only the molecule section of the first record.
res[0]['molecule']


Out[14]:
{'homo': -8.165865,
 'inchi_key': 'GKVDXUXIAHWQIK-UHFFFAOYSA-N',
 'lumo': -1.967927,
 'smiles': 'N1=NC1'}

In [15]:
# Flatten every record's VERDE and molecule fields into one DataFrame row,
# then preview the first five rows.
df = shape_data(res)
df.head()


Out[15]:
0_0_S1 0_0_T1 dipole_moment_S0 dipole_moment_S1 dipole_moment_T1 homo inchi_key ionization_potential lumo redox_potential_S0 redox_potential_S1 redox_potential_T1 smiles vertical_excitation_energy
0 3.53 3.01 2.21420 2.2142 3.38920 -8.165865 GKVDXUXIAHWQIK-UHFFFAOYSA-N 10.57 -1.967927 3.38 -0.15 0.37 N1=NC1 3.51
1 NaN 0.76 0.69833 NaN 5.91455 -5.916844 BVBWGQQLKITHLL-FOCLMDBBSA-N 6.64 -2.936653 1.01 NaN 0.25 CC#Cc1ccc(o1)/N=N/c1ccc(o1)C#CC NaN
2 NaN 1.07 4.70836 NaN 6.89634 -5.982151 KRUMPIYRKNRXGF-SZWZUZNMSA-N 6.83 -2.584265 1.14 NaN 0.07 O/C=C\c1ccc(cc1)/N=N/c1ccc(cc1)/C=C\O NaN
3 NaN 1.06 0.42030 NaN 3.72330 -6.095350 RYSCLVZXACDRPK-QURGRASLSA-N 6.82 -2.741547 1.15 NaN 0.09 c1csc(c1)c1ccc(cc1)/N=N/c1ccc(cc1)c1cccs1 NaN
4 NaN 0.20 3.13260 NaN 2.78480 -5.793304 IIEZTXLLWBCGON-DQSJHHFOSA-N 6.65 -2.994069 0.83 NaN 0.63 FC(c1ccc([nH]1)c1ccc(s1)/N=N\c1ccc(s1)c1ccc([n... NaN

In [16]:
# Label both axes with the plotted column names so the figure can be read on
# its own. The labels are set first so that scatter() remains the cell's last
# expression and the displayed value is unchanged.
plt.xlabel("ionization_potential")
plt.ylabel("homo")
plt.scatter(df.ionization_potential, df.homo)


Out[16]:
<matplotlib.collections.PathCollection at 0x1240a4e48>

Matching Ranges

Here we match only the records where 8.5 < ionization_potential < 11.0.


In [17]:
# Same record/source filters as before, plus a range filter on the VERDE
# ionization potential; run the query and report how many records matched.
res = (
    mdf.match_resource_types("record")
    .match_source_names("abrehabiruk_virtual_db")
    .match_range('projects.verde.ionization_potential', start=8.5, stop=11.0)
    .search()
)
len(res)


Out[17]:
155

In [18]:
# Rebuild the DataFrame from the range-filtered results and redraw the plot.
df = shape_data(res)
# Label both axes with the plotted column names so the figure can be read on
# its own. The labels are set first so that scatter() remains the cell's last
# expression and the displayed value is unchanged.
plt.xlabel("ionization_potential")
plt.ylabel("homo")
plt.scatter(df.ionization_potential, df.homo)


Out[18]:
<matplotlib.collections.PathCollection at 0x1234280f0>

Pull the Raw Calculations for Analysis


In [19]:
# Narrow the ionization-potential window to 10.0-11.0 so that only a small
# set of records is selected for download; report how many matched.
res = (
    mdf.match_resource_types("record")
    .match_source_names("abrehabiruk_virtual_db")
    .match_range('projects.verde.ionization_potential', start=10.0, stop=11.0)
    .search()
)
len(res)


Out[19]:
18

In [22]:
# NBVAL_SKIP
# Transfer the files of the matched records with Globus, keeping the source
# directory layout (preserve_dir=True). The destination is resolved relative
# to the current user's home directory instead of being hardcoded to one
# machine; expanduser turns "~" into an absolute path.
download_dest = os.path.expanduser("~/Desktop/globus/verde")
status = mdf.globus_download(res, dest=download_dest, preserve_dir=True)


Processing records:   0%|          | 0/18 [00:00<?, ?it/s]
Processing records: 100%|██████████| 18/18 [00:00<00:00, 1486.46it/s]
Transferring data:   0%|          | 0/1 [00:00<?, ?it/s]
Transferring data: 100%|██████████| 1/1 [02:58<00:00, 178.61s/it]
All transfers processed
1 transfers succeeded
0 transfers failed