In [1]:
import requests
import json
import numpy as np
import pandas as pd
from pandas import DataFrame

US EPA ChemView web services

The documentation lists several ways of accessing data in ChemView.


In [2]:
# Root URL of the ChemView web services; endpoint names such as 'chemicals'
# are appended to it to build each request URI.
URIBASE = 'http://java.epa.gov/chemview/'

Getting 'chemicals' data from ChemView

As a start, this downloads the data for all chemicals. Let's see what we get.


In [3]:
# Ask the 'chemicals' endpoint for JSON and parse the response body.
uri = URIBASE + 'chemicals'
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(uri, headers={'Accept': 'application/json, */*'}, timeout=60)
# Fail loudly on an HTTP error status instead of handing an error page to the
# JSON parser further down.
r.raise_for_status()
j = r.json()

In [4]:
# How many top-level entries the parsed 'chemicals' response contains.
print(len(j))


9976

In [5]:
# Load the parsed records into a DataFrame and peek at its final rows.
df = pd.DataFrame(j)
df.tail()


Out[5]:
accessionNo casNo epaId id pmnNo synonyms
9971 None 998-30-1 None 4572865 None [{'isWorkPlan': False, 'id': 4572866, 'isTscaI...
9972 None 998-40-3 None 5280997 None [{'isWorkPlan': False, 'id': 5280999, 'isTscaI...
9973 None 99811-86-6 None 4472271 None [{'isWorkPlan': False, 'id': 4472272, 'isTscaI...
9974 None 999-21-3 None 5117441 None [{'isWorkPlan': False, 'id': 5117446, 'isTscaI...
9975 None 999-97-3 None 4468897 None [{'isWorkPlan': False, 'id': 4468902, 'isTscaI...

In [6]:
# Cache the downloaded dataset on disk so that it does not have to be
# requested from the web service again later.
df.to_pickle('../data/chemicals.pickle')

In [14]:
# Reload the frame from the on-disk pickle cache, so the cells below can run
# without contacting the web service.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself.
df = pd.read_pickle('../data/chemicals.pickle')

Data wrangling


In [15]:
# want to interpret 'None' as NaN
def scrub_None(x):
    """Normalise a cell value to a stripped string, or NaN when it is missing.

    Parameters
    ----------
    x : object
        Any scalar cell value (string, number, None, NaN, ...).

    Returns
    -------
    str or float
        ``np.nan`` when ``x`` is ``None``, is already a float NaN, or when its
        stripped string form is ``'None'`` or empty; otherwise
        ``str(x).strip()``.
    """
    # Recognise values that are already missing *before* stringifying them:
    # str(np.nan) is 'nan', which would otherwise be returned as ordinary text
    # (e.g. when the scrubbing cell is run a second time).
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan
    s = str(x).strip()
    if s in ('None', ''):
        return np.nan
    return s

# Scrub every column except 'synonyms', whose cells are lists of dicts and
# must not be turned into strings. The column is excluded by name rather than
# by position, so this stays correct whatever order the columns come in.
for c in [col for col in df.columns if col != 'synonyms']:
    df[c] = df[c].apply(scrub_None)

In [16]:
# Spot-check the last rows after scrubbing: former 'None' cells should now
# display as NaN.
df.tail()


Out[16]:
accessionNo casNo epaId id pmnNo synonyms
9971 NaN 998-30-1 NaN 4572865 NaN [{'isUnregistered': False, 'isIupac': True, 'i...
9972 NaN 998-40-3 NaN 5280997 NaN [{'isUnregistered': True, 'isIupac': True, 'id...
9973 NaN 99811-86-6 NaN 4472271 NaN [{'isUnregistered': False, 'isIupac': True, 'i...
9974 NaN 999-21-3 NaN 5117441 NaN [{'isUnregistered': False, 'isIupac': True, 'i...
9975 NaN 999-97-3 NaN 4468897 NaN [{'isUnregistered': False, 'isIupac': True, 'i...

How many unique CASRNs, PMN numbers?


In [17]:
# Number of distinct CASRNs (missing values are not counted)
df['casNo'].nunique()


Out[17]:
9123

In [18]:
# Number of distinct PMN numbers (missing values are not counted)
df['pmnNo'].nunique()


Out[18]:
518

What's in 'synonyms'?


In [19]:
# Expand the synonyms list of one entry (row label 4) into its own table.
pd.DataFrame(df.at[4, 'synonyms'])


Out[19]:
chemicalName id isIupac isRegistry isSystematic isTscaInv isUnregistered isWorkPlan sortOrder
0 2-methyl-3-phenylpropanal 3510038 False False False False False False 5

How many 'synonyms' for each entry?


In [20]:
# Summary statistics of the number of synonym objects per entry.
df['synonyms'].map(len).describe()


Out[20]:
count    9976.000000
mean        4.837310
std         3.525957
min         1.000000
25%         3.000000
50%         4.000000
75%         6.000000
max        63.000000
Name: synonyms, dtype: float64

Do the data objects in synonyms all have the same attributes?


In [22]:
def getfields(x):
    """Return the attribute names used across a list of dicts.

    The keys of every dict in ``x`` are pooled, de-duplicated, sorted and
    joined into a single comma-separated string; an empty ``x`` gives ``''``.
    """
    names = set().union(*(record.keys() for record in x))
    return ','.join(sorted(names))

# Field-name signature of the first few entries.
df['synonyms'].map(getfields).head()


Out[22]:
0    chemicalName,id,isIupac,isRegistry,isSystemati...
1    chemicalName,id,isIupac,isRegistry,isSystemati...
2    chemicalName,id,isIupac,isRegistry,isSystemati...
3    chemicalName,id,isIupac,isRegistry,isSystemati...
4    chemicalName,id,isIupac,isRegistry,isSystemati...
Name: synonyms, dtype: object

In [23]:
# Number of distinct field-name signatures across all entries.
df['synonyms'].map(getfields).nunique()


Out[23]:
1

Every synonyms field contains a variable number of objects, and all of those objects share one uniform set of fields.

Tell me more about those items with PMN numbers...


In [24]:
# Keep only the rows that carry a PMN number, then preview them.
pmns = df[df['pmnNo'].notna()]
pmns.head()


Out[24]:
accessionNo casNo epaId id pmnNo synonyms
4 NaN NaN NaN 3510037 P-05-0055 [{'isUnregistered': False, 'isIupac': False, '...
5 NaN NaN NaN 3510068 P-05-0057 [{'isUnregistered': False, 'isIupac': True, 'i...
7 NaN NaN NaN 3510134 P-05-0059 [{'isUnregistered': False, 'isIupac': True, 'i...
8 NaN NaN NaN 3510165 P-05-0060 [{'isUnregistered': False, 'isIupac': True, 'i...
9 NaN NaN NaN 3510196 P-05-0061 [{'isUnregistered': False, 'isIupac': True, 'i...

Are there any that have CASRN too? ... No.


In [25]:
# Count the PMN rows whose CASRN is present.
int(pmns['casNo'].notna().sum())


Out[25]:
0