In [236]:
import asyncio
import aiohttp
import json
import operator
import os
import numpy as np
import pandas as pd
import requests
from functools import wraps
from itertools import chain
from urllib.parse import urljoin
In [2]:
# Copied from pyencoded-tools/encodedcc.py to avoid dependency.
class ENC_Key:
    def __init__(self, keyfile, keyname):
        if os.path.isfile(str(keyfile)):
            keys_f = open(keyfile, 'r')
            keys_json_string = keys_f.read()
            keys_f.close()
            keys = json.loads(keys_json_string)
        else:
            keys = keyfile
        key_dict = keys[keyname]
        self.authid = key_dict['key']
        self.authpw = key_dict['secret']
        self.server = key_dict['server']
        if not self.server.endswith("/"):
            self.server += "/"


class ENC_Connection(object):
    def __init__(self, key):
        self.headers = {'content-type': 'application/json', 'accept': 'application/json'}
        self.server = key.server
        self.auth = (key.authid, key.authpw)
In [3]:
# Define key if private data desired.
key = ENC_Key(os.path.expanduser("~/keypairs.json"), 'prod')
auth = (key.authid, key.authpw)
base_url = 'https://www.encodeproject.org'
associated_search = urljoin(base_url, '/search/?type={}&{}={}&{}')
json_all = 'limit=all&format=json'
json_only = 'format=json'
request_auth = aiohttp.BasicAuth(key.authid, key.authpw)
loop = asyncio.get_event_loop()
In [4]:
def create_session():
    connector = aiohttp.TCPConnector(keepalive_timeout=100, limit=100)
    return aiohttp.ClientSession(connector=connector)
In [5]:
session = create_session()
In [93]:
# Utils.
def get_data(url):
    r = requests.get(url, auth=auth)
    try:
        assert r.status_code == 200
    except AssertionError as e:
        raise Exception(url, r.text) from e
    try:
        return r.json()['@graph']
    except KeyError:
        return r.json()


async def async_get_data(url, session, request_auth=request_auth):
    r = await session.get(url, auth=request_auth)
    try:
        assert r.status == 200
    except AssertionError as e:
        raise Exception(url, await r.text()) from e
    return await r.json()


def quick_grab_data(urls, session=session, loop=loop):
    f = [async_get_data(url, session) for url in urls]
    results = loop.run_until_complete(asyncio.gather(*f))
    try:
        return [subobject for item in results for subobject in item['@graph']]
    except KeyError:
        return results


def get_associated(item_type, related_field, related_ids):
    urls = [associated_search.format(item_type,
                                     related_field,
                                     related_id,
                                     json_all)
            for related_id in related_ids]
    return quick_grab_data(urls)


def print_relation(relation):
    for k, v in sorted(relation.items()):
        print('tech_rep', k,
              'in:', [(p[0], p[1]) for p in v['parents']],
              'out:', [(c[0], c[1]) for c in v['children']], end='\n\n')
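A quick illustration of how the helpers compose (the accession below is a placeholder, not a real lookup): get_associated simply expands the associated_search template once per related id and hands the resulting URLs to quick_grab_data.
In [ ]:
# Hypothetical usage sketch with a placeholder accession.
demo_url = associated_search.format('File', 'dataset', '/experiments/ENCSR000AAA/', json_all)
print(demo_url)
# -> https://www.encodeproject.org/search/?type=File&dataset=/experiments/ENCSR000AAA/&limit=all&format=json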
In [7]:
# session.close()  # Uncomment to close the aiohttp session when finished.
In [235]:
operator_map = {'equals': operator.eq,
                'not_equals': operator.ne,
                'contains': operator.contains}


def process_stream(processors):
    """
    Apply all processors to stream. Requires yield_files(files) to be passed
    in as first item in list.
    """
    stream = ()
    for processor in processors:
        stream = processor(stream)
    return stream


def processor(f):
    """
    Return processor function applied to stream.
    """
    @wraps(f)
    def new_func(*args, **kwargs):
        def processor(stream):
            return f(stream, *args, **kwargs)
        return processor
    return new_func


def generator(f):
    """
    Return function that provides original data to stream.
    """
    @wraps(f)
    @processor
    def new_func(stream, *args, **kwargs):
        yield from f(*args, **kwargs)
    return new_func


@generator
def yield_files(files):
    """
    Initiate processing stream with files.
    """
    for file in files:
        yield file
@processor
def filter_field_by_comparison(stream, field=None, value=None, comparison='equals'):
    """
    Filter list of dictionaries based on field value, filter value, and comparison.

    Parameters
    ----------
    stream : generator
        Original data plus applied processing steps passed in by @processor decorator.
    field : string
        Name of field in dictionary.
    value : string or list (for contains)
        Filter value to compare to field value.
    comparison : {'equals' | 'not_equals' | 'contains'}
        Operator used for comparing values. Default is 'equals'.
    """
    if operator_map.get(comparison) is None:
        raise ValueError('Comparison must be one of: {}'.format(list(operator_map.keys())))
    # Make sure filter value exists.
    if value is None:
        raise ValueError('Must specify value')
    for file in stream:
        # Skip file if key not in it.
        if file.get(field) is None:
            continue
        # Order of arguments matters for contains: operator.contains(a, b) tests b in a.
        left, right = value, file.get(field)
        if comparison == 'contains' and isinstance(value, str):
            left, right = file.get(field), value
        # Yield only files that match filter.
        if operator_map[comparison](left, right):
            yield file
def match(data, *args):
    """
    Pass in data and filters.
    """
    yield from process_stream([yield_files(data), *args])
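
# Minimal smoke test of the stream machinery on toy dicts (illustrative only, not
# ENCODE data): yield_files seeds the stream and each filter processor narrows it.
_demo = list(match([{'file_type': 'bam'}, {'file_type': 'fastq'}],
                   filter_field_by_comparison(field='file_type', value='bam')))
assert _demo == [{'file_type': 'bam'}]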

def _find_relation(data, in_type, out_type, experiment):
    parents = list(match(data,
                         filter_field_by_comparison(field='output_type',
                                                    value=in_type['output_type']),
                         filter_field_by_comparison(field='file_type',
                                                    value=in_type['file_type']),
                         filter_field_by_comparison(field='status',
                                                    value=in_type['status'],
                                                    comparison='contains')))
    children = list(match(data,
                          filter_field_by_comparison(field='output_type',
                                                     value=out_type['output_type']),
                          filter_field_by_comparison(field='file_type',
                                                     value=out_type['file_type']),
                          filter_field_by_comparison(field='status',
                                                     value=out_type['status'],
                                                     comparison='contains')))
    return {'parents': [(p.get('accession', p.get('uuid')),
                         p.get('file_type'),
                         p.get('output_type'),
                         p.get('status'),
                         experiment) for p in parents],
            'children': [(c.get('accession', c.get('uuid')),
                          c.get('file_type'),
                          c.get('output_type'),
                          c.get('status'),
                          experiment) for c in children]}

def _extract_values_from_pattern(field, in_type, out_type):
    """
    Return set of values in both in_type and out_type for given field.
    """
    values = []
    for pattern in (in_type, out_type):
        # Flatten if the field's value is a list.
        if isinstance(pattern.get(field), list):
            values.extend(pattern.get(field))
        else:
            values.append(pattern.get(field))
    if None in values:
        raise ValueError('Must specify {} in pattern.'.format(field))
    return set(values)
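
# Illustrative check of the helper above: list and scalar values are flattened
# into a single set (toy patterns, not real ones).
assert _extract_values_from_pattern('status',
                                    {'status': ['released', 'revoked']},
                                    {'status': 'released'}) == {'released', 'revoked'}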

def basic_pattern(experiment, in_type, out_type, **kwargs):
    """
    Return relationship of Files in Experiment given in_type and out_type.

    Parameters
    ----------
    experiment : string
        Accession of the Experiment of interest.
    in_type : dict
        Keys: file_type, output_type, status, match_rep (True/False).
    out_type : dict
        Keys: file_type, output_type, status, match_rep (True/False).
    **kwargs : string
        Field name and filter value for additional filters to apply to all associated files.
    """
    additional_filters = []
    calculated_relationships = {}
    if in_type['match_rep'] != out_type['match_rep']:
        raise ValueError('Match_rep mismatch between in and out pattern.')
    # Pull pattern data.
    filter_statuses = _extract_values_from_pattern('status', in_type, out_type)
    filter_types = _extract_values_from_pattern('file_type', in_type, out_type)
    filter_replicate = _extract_values_from_pattern('match_rep', in_type, out_type).pop()
    # Optional filtering by fields passed to kwargs.
    for k, v in kwargs.items():
        additional_filters.append(filter_field_by_comparison(field=k,
                                                             value=v,
                                                             comparison='equals'))
    # Get all files associated with experiment.
    associated_files = get_associated(item_type='File', related_field='dataset', related_ids=[experiment])
    # Filter by status, file_type, and additional_filters.
    by_status_format_additional = list(match(associated_files,
                                             filter_field_by_comparison(field='status',
                                                                        value=list(filter_statuses),
                                                                        comparison='contains'),
                                             filter_field_by_comparison(field='file_type',
                                                                        value=list(filter_types),
                                                                        comparison='contains'),
                                             *additional_filters))
    if filter_replicate:
        # Flatten lists of tech_reps.
        tech_reps = {tech_rep for f in by_status_format_additional
                     for tech_rep in f['technical_replicates']}
        # Match files by tech_rep.
        for rep in tech_reps:
            pairs = list(match(by_status_format_additional,
                               filter_field_by_comparison(field='technical_replicates',
                                                          value=str(rep),
                                                          comparison='contains')))
            calculated_relationships[rep] = _find_relation(pairs, in_type, out_type, experiment)
    else:
        calculated_relationships['None'] = _find_relation(by_status_format_additional, in_type, out_type, experiment)
    return calculated_relationships
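For reference, basic_pattern returns a dict keyed by technical replicate (or the string 'None' when match_rep is False), mapping to the parents/children tuples built by _find_relation; the sketch below uses placeholder accessions to show the shape.
In [ ]:
# Hypothetical return shape of basic_pattern (placeholder accessions):
# {'1_1': {'parents':  [('ENCFF000AAA', 'fastq', 'reads', 'released', experiment_id)],
#          'children': [('ENCFF000BBB', 'bam', 'alignments', 'released', experiment_id)]}}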
In [238]:
# RIP-seq patterns:
in_type1 = dict(file_type='fastq', output_type='reads', status='released', match_rep=True)
out_type1 = dict(file_type='bam', output_type='alignments', status='released', match_rep=True)
in_type2 = dict(file_type='bam', output_type='alignments', status='released', match_rep=True)
out_type2 = dict(file_type='bigWig', output_type='signal', status='released', match_rep=True)
in_type3 = dict(file_type='bam', output_type='alignments', status='released', match_rep=False)
out_type3 = dict(file_type='bed broadPeak', output_type='peaks', status='released', match_rep=False)
in_type4 = dict(file_type='bed broadPeak', output_type='peaks', status='released', match_rep=False)
out_type4 = dict(file_type='bigBed broadPeak', output_type='peaks', status='released', match_rep=False)
# Build list of results (assumes experiment_id, an Experiment @id, was defined in an earlier cell).
relationships = [basic_pattern(experiment_id, in_type1, out_type1),
                 basic_pattern(experiment_id, in_type2, out_type2),
                 basic_pattern(experiment_id, in_type3, out_type3),
                 basic_pattern(experiment_id, in_type4, out_type4)]
# Gingeras RNA microarray patterns:
url = 'https://www.encodeproject.org/search/?type=Experiment&assay_title=RNA+microarray&audit.INTERNAL_ACTION.category=missing+derived_from&award.rfa=ENCODE2&lab.title=Thomas+Gingeras%2C+CSHL&format=json&limit=all'
in_type1 = dict(file_type='bed broadPeak', output_type='transcribed fragments', status=['revoked', 'released'], match_rep=False)
out_type1 = dict(file_type='bigBed broadPeak', output_type='transcribed fragments', status=['revoked', 'released'], match_rep=False)
in_type2 = dict(file_type='bed broadPeak', output_type='filtered transcribed fragments', status=['revoked', 'released'], match_rep=False)
out_type2 = dict(file_type='bigBed broadPeak', output_type='filtered transcribed fragments', status=['revoked', 'released'], match_rep=False)
in_type3 = dict(file_type='bed broadPeak', output_type='transcribed fragments', status=['revoked', 'released'], match_rep=False)
out_type3 = dict(file_type='bed broadPeak', output_type='filtered transcribed fragments', status=['revoked', 'released'], match_rep=False)
# Crawford RNA microarray pattern:
url = 'https://www.encodeproject.org/search/?type=Experiment&assay_title=RNA+microarray&lab.title=Gregory+Crawford%2C+Duke&audit.INTERNAL_ACTION.category=missing+derived_from&format=json&limit=all'
in_type1 = dict(file_type='bed broadPeak', output_type='exon quantifications', status=['released', 'deleted', 'revoked'], match_rep=True)
out_type1 = dict(file_type='bigBed broadPeak', output_type='exon quantifications', status=['released', 'deleted', 'revoked'], match_rep=True)
In [292]:
url = 'https://www.encodeproject.org/search/?type=Experiment&assay_title=RNA+microarray&lab.title=Gregory+Crawford%2C+Duke&audit.INTERNAL_ACTION.category=missing+derived_from&format=json&limit=all'
exp = [f['@id'] for f in quick_grab_data([url])]
len(exp)
In [293]:
results = []
for e in exp:
    experiment_id = e
    relationships = [basic_pattern(experiment_id, in_type1, out_type1)]
    results.append(relationships)
In [294]:
dfpd = []
for x in results:
    for y in x:
        for k, v in y.items():
            d = {'accession': ' '.join([t[0] for t in v['children']]),
                 'derived_from:list': ','.join(['/files/{}/'.format(t[0]) for t in v['parents']])}
            dfpd.append(d)
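Each row pairs the space-joined child accessions with a comma-joined derived_from:list column, matching the patch TSV written (when uncommented) in the next cell; a hypothetical row is sketched below.
In [ ]:
# Hypothetical row shape (placeholder accessions):
# {'accession': 'ENCFF000BBB', 'derived_from:list': '/files/ENCFF000AAA/'}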
In [299]:
df = pd.DataFrame(dfpd).replace('', np.nan).dropna().reset_index(drop=True)
# df.to_csv('../../calculated_derived_from_crawford_rna_microarray_patch_10_20_2017.tsv', sep='\t', index=False)
df
In [144]:
url = 'https://www.encodeproject.org/search/?type=Experiment&assay_title=RNA+microarray&audit.INTERNAL_ACTION.category=missing+derived_from&award.rfa=ENCODE2&lab.title=Thomas+Gingeras%2C+CSHL&format=json&limit=all'
exp = [f['accession'] for f in quick_grab_data([url])]
in_type1 = dict(file_type='bed broadPeak', output_type='transcribed fragments', status=['revoked', 'released'], match_rep=False)
out_type1 = dict(file_type='bigBed broadPeak', output_type='transcribed fragments', status=['revoked', 'released'], match_rep=False)
in_type2 = dict(file_type='bed broadPeak', output_type='filtered transcribed fragments', status=['revoked', 'released'], match_rep=False)
out_type2 = dict(file_type='bigBed broadPeak', output_type='filtered transcribed fragments', status=['revoked', 'released'], match_rep=False)
in_type3 = dict(file_type='bed broadPeak', output_type='transcribed fragments', status=['revoked', 'released'], match_rep=False)
out_type3 = dict(file_type='bed broadPeak', output_type='filtered transcribed fragments', status=['revoked', 'released'], match_rep=False)
In [145]:
len(exp)
In [147]:
for e in exp:
    print()
    experiment_id = '/experiments/{}/'.format(e)
    relationships = [basic_pattern(experiment_id, in_type1, out_type1),
                     basic_pattern(experiment_id, in_type2, out_type2),
                     basic_pattern(experiment_id, in_type3, out_type3)]
    print('For:', experiment_id, end='\n\n')
    for relation in relationships:
        print_relation(relation)