In [1]:
import requests
import json
import numpy as np
import pandas as pd
from pandas import DataFrame
The documentation lists several ways of accessing data in ChemView.
In [2]:
# Base URL of the ChemView service; endpoint names are appended to it.
URIBASE = 'http://java.epa.gov/chemview/'
In [3]:
# Request the 'chemicals' endpoint and ask for a JSON response.
uri = URIBASE + 'chemicals'
# requests applies no timeout by default, so without one this cell could hang
# indefinitely if the server stops responding.
r = requests.get(uri, headers={'Accept': 'application/json, */*'}, timeout=120)
# Stop here on an HTTP error status instead of trying to parse an error page.
r.raise_for_status()
j = r.json()
In [4]:
# Number of top-level items in the parsed response.
print(len(j))
In [5]:
# Load the parsed JSON into a DataFrame and show its last rows.
df = pd.DataFrame(j)
df.tail()
Out[5]:
In [6]:
# Cache the table on disk so that later sessions can reload it instead of
# requesting it from the server again.
df.to_pickle('../data/chemicals.pickle')
In [14]:
# Reload the table from the pickle cache written by the earlier to_pickle cell.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file that
# this notebook produced itself.
df = pd.read_pickle('../data/chemicals.pickle')
In [15]:
# want to interpret 'None' as NaN
def scrub_None(x):
    """Normalise one cell value: missing markers become NaN, anything else a stripped string.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    float or str
        ``np.nan`` when ``x`` is already a float NaN, or when ``str(x).strip()``
        equals ``'None'`` or is empty; otherwise the stripped string form of ``x``.
    """
    # An existing NaN must stay NaN. Without this guard str(nan) would produce
    # the literal text 'nan', so running the cleaning step a second time on an
    # already-scrubbed column would turn its missing values into strings.
    if isinstance(x, float) and np.isnan(x):
        return np.nan
    s = str(x).strip()
    if s == 'None' or s == '':
        return np.nan
    return s
# Scrub every column except 'synonyms'. Later cells iterate over each synonyms
# value and read dict keys from its items, and scrub_None would flatten such a
# value into a single string. The column is excluded by name rather than by
# position so the result does not depend on the column order of the frame.
for c in [col for col in df.columns if col != 'synonyms']:
    df[c] = df[c].apply(scrub_None)
In [16]:
# Inspect the last rows again now that the columns have been scrubbed.
df.tail()
Out[16]:
In [17]:
# Number of distinct CASRNs (missing values are not counted).
df['casNo'].nunique()
Out[17]:
In [18]:
# Number of distinct PMN numbers (missing values are not counted).
df['pmnNo'].nunique()
Out[18]:
In [19]:
# Expand the synonyms entry of the row labelled 4 into a table of its own.
pd.DataFrame(df.loc[4, 'synonyms'])
Out[19]:
In [20]:
# Summary statistics for the number of items held in each synonyms value.
df['synonyms'].map(len).describe()
Out[20]:
Do the data objects in synonyms all have the same attributes?
In [22]:
def getfields(x):
    """Return the sorted, comma-joined union of the keys of every mapping in ``x``.

    ``x`` is an iterable of dict-like objects; an empty ``x`` gives ``''``.
    """
    all_keys = set().union(*(record.keys() for record in x))
    return ','.join(sorted(all_keys))
# Field-name signature of the synonyms value in each of the first rows.
df['synonyms'].map(getfields).head()
Out[22]:
In [23]:
# Number of distinct field-name signatures across all synonyms values.
df['synonyms'].map(getfields).nunique()
Out[23]:
All of the synonyms fields contain a variable number of objects with a uniform set of fields.
In [24]:
# Keep only the rows that carry a PMN number, then preview them.
pmns = df[df['pmnNo'].notnull()]
pmns.head()
Out[24]:
Do any of the PMN records also have a CASRN? No.
In [25]:
# Count the PMN rows whose CASRN is present.
int(pmns['casNo'].notnull().sum())
Out[25]: