In [84]:
import json

import encode_utils.stream as es
import encode_utils.grab as grab
In [85]:
def find_pair(experiments, filter_dict):
    filters = []
    for k, v in filter_dict.items():
        filters.append(es.filter_field_by_comparison(field=k, value=v[0], comparison=v[1]))
    yield from es.match(experiments, *filters)
def has_pair(parsed_data, get_data=False):
    for d in parsed_data:
        matches = list(find_pair(parsed_data, {'biosample_term_name': (d['biosample_term_name'], 'equals'),
                                               'biosample_term_id': (d['biosample_term_id'], 'equals'),
                                               'biosample_type': (d['biosample_type'], 'equals'),
                                               'biosample_summary': (d['biosample_summary'], 'equals'),
                                               'target': (d['target'], 'equals'),
                                               'lab': (d['lab'], 'equals'),
                                               'assay_title': (d['assay_title'], 'equals')}))
        if len(matches) == 2:
            has_pair = True
        else:
            has_pair = False
        if get_data:
            if has_pair:
                yield matches
        else:
            yield d['accession'], has_pair
def official_accession(value):
    if 'ENC' in value:
        return True
    return False
def get_redirect(x, base_url):
    print('Getting redirect from {}'.format(base_url))
    if official_accession(x):
        return x
    else:
        r = grab.get_data('{}{}?datastore=database'.format(base_url, x))
        accession = r.get('accession', None)
        assert accession is not None
        r = grab.get_data('{}/{}?datastore=database'.format(base_url, accession))
        new_accession = r.get('@id', None)
        assert new_accession is not None
        return new_accession
def collapse_replacements(values, base_url):
    return [get_redirect(x, base_url) for x in values]
def get_unique_pairs(pairs):
    unique_set = set()
    for pair in pairs:
        pair_tup_one = (pair[0]['accession'], pair[1]['accession'])
        pair_tup_two = (pair[1]['accession'], pair[0]['accession'])
        if pair_tup_one in unique_set or pair_tup_two in unique_set:
            continue
        else:
            unique_set.add(pair_tup_one)
    return unique_set
def parse_experiments(experiments):
    for e in experiments:
        yield {'accession': e.get('accession', e.get('uuid')),
               'biosample_summary': e.get('biosample_summary', ''),
               'biosample_term_name': e.get('biosample_term_name', ''),
               'biosample_term_id': e.get('biosample_term_id', ''),
               'biosample_type': e.get('biosample_type', ''),
               'target': e.get('target', {}).get('name', ''),
               'lab': e.get('lab', {}).get('name', ''),
               'assay_title': e.get('assay_title', ''),
               'alternate_accessions': e.get('alternate_accessions', []),
               'submitter_comment': e.get('submitter_comment', ''),
               'possible_controls': [r.get('@id') for r in e.get('possible_controls', [])]}
def get_pair_data(pair, base_url=grab.base_url):
    data = grab.quick_grab_data(['{}/{}/?{}&frame=embedded&datastore=database'.format(base_url,
                                                                                      p,
                                                                                      grab.json_only)
                                 for p in pair])
    return data
def get_merge_info(e):
    return {'reps': [(r.get('uuid'),
                      r.get('experiment'),
                      r.get('biological_replicate_number'),
                      r.get('technical_replicate_number'),
                      r.get('library', {}).get('biosample', {}).get('donor', {}).get('accession'),
                      r.get('library', {}).get('biosample', {}).get('accession'))
                     for r in e.get('replicates', [])],
            'original_files': e.get('original_files', []),
            'dbxrefs': e.get('dbxrefs', []),
            'aliases': e.get('aliases', []),
            'documents': e.get('documents', []),
            'alternate_accessions': e.get('alternate_accessions', []),
            'submitter_comment': e.get('submitter_comment', ''),
            'possible_controls': [r.get('@id') for r in e.get('possible_controls', [])]}
def determine_base_exp(pair_a, pair_b):
    """
    Return True if pair_a should be the base experiment, else False.
    """
    bio_reps_a = [b[2] for b in get_merge_info(pair_a)['reps']]
    bio_reps_b = [b[2] for b in get_merge_info(pair_b)['reps']]
    if min(bio_reps_a) == min(bio_reps_b):
        # If the lowest bio rep number is tied, take the experiment with more replicates.
        if len(bio_reps_b) > len(bio_reps_a):
            return False
        return True
    # Otherwise take the one with the lower bio rep number.
    if min(bio_reps_a) <= min(bio_reps_b):
        return True
    return False
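# Hypothetical worked example (rep numbers made up): if pair_a's replicates have
# bio rep numbers [1, 2] and pair_b's have [2, 2], the minimums differ, so pair_a
# (which contains bio rep 1) becomes the base. If both started at bio rep 1, the
# tie would go to whichever experiment has more replicates (pair_a when equal).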
def rep_patch(base_exp, merge_exp):
    updated_replicates = []
    base_exp_id = base_exp['@id']
    base_donors = donor_set(get_merge_info(base_exp))
    merge_donors = donor_set(get_merge_info(merge_exp))
    next_bio_rep = max([r[2] for r in get_merge_info(base_exp)['reps']]) + 1
    merge_exp_reps_sorted = sorted(get_merge_info(merge_exp)['reps'], key=lambda x: x[2])
    for j, bio_rep in enumerate(sorted(set([r[2] for r in merge_exp_reps_sorted]))):
        for i, y in enumerate(sorted([r for r in merge_exp_reps_sorted if r[2] == bio_rep], key=lambda x: x[3])):
            d = ({'uuid': y[0],
                  'experiment': base_exp_id,
                  'biological_replicate_number': next_bio_rep + j,
                  'technical_replicate_number': i + 1},
                 'REPLICATE')
            updated_replicates.append(d)
    return updated_replicates
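# Hypothetical example of the renumbering above (values made up): if base_exp
# already holds bio reps 1 and 2 and merge_exp contributes bio reps 1 and 2,
# next_bio_rep is 3, so the merged replicates become bio reps 3 and 4, with
# technical replicate numbers restarting at 1 within each biological replicate.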
def original_files_patch(base_exp, merge_exp):
    updated_original_files = []
    base_exp_id = base_exp['@id']
    for file in get_merge_info(merge_exp)['original_files']:
        d = ({'accession': file,
              'dataset': base_exp_id},
             'ORIGINAL FILE')
        updated_original_files.append(d)
    return updated_original_files
def parse_pair(pair):
    if determine_base_exp(pair[0], pair[1]):
        base_exp, merge_exp = pair[0], pair[1]
    else:
        base_exp, merge_exp = pair[1], pair[0]
    return base_exp, merge_exp
def values_from_both(field):
    # Relies on base_exp and merge_exp set at the notebook level by the merge loop below.
    return list(set([*get_merge_info(base_exp)[field],
                     *get_merge_info(merge_exp)[field]]))
def patch_item(url, item, data, auth, show_output=False):
    r = grab.requests.patch(url,
                            auth=auth,
                            json=data)
    print('PATCHING: {}'.format(item), data)
    # print(json.dumps(data, indent=4, sort_keys=True), '\n')
    if show_output or r.status_code != 200:
        print(json.dumps(r.json(), indent=4, sort_keys=True))
    return r.status_code
def parse_patch_set(patch):
    data = patch[0].copy()
    item = data.get('accession', data.get('uuid'))
    for field in ('accession', 'uuid'):
        data.pop(field, None)
    return item, data
def make_patch(patch_set, base_url):
    print('Patching on: {}'.format(base_url))
    for p in patch_set:
        item, data = parse_patch_set(p)
        url = grab.urljoin(base_url, item)
        patch_item(url, item, data, grab.auth)
def donor_set(merge_info):
    donors = [d[4] for d in merge_info['reps']]
    return set(donors)
def biosample_set(merge_info):
    biosamples = [d[5] for d in merge_info['reps']]
    return set(biosamples)
def filter_uuid_duplicates(reps):
    uuids = set()
    new_reps = []
    for r in reps:
        if r[0] in uuids:
            continue
        else:
            new_reps.append(r)
            uuids.add(r[0])
    return new_reps
def biosample_based_rep_patch(base_exp, merge_exp):
    """Tech rep if matching biosample, bio rep if different biosample."""
    updated_replicates = []
    base_exp_id = base_exp['@id']
    base_exp_info = get_merge_info(base_exp)
    merge_exp_info = get_merge_info(merge_exp)
    all_biosamples = biosample_set(base_exp_info).union(biosample_set(merge_exp_info))
    all_reps = [r for r in es.chain(base_exp_info['reps'], merge_exp_info['reps'])]
    for i, biosample in enumerate(sorted(all_biosamples)):
        reps_with_biosample = filter_uuid_duplicates([r for r in all_reps if r[5] == biosample])
        for j, rep in enumerate(reps_with_biosample):
            d = ({'uuid': rep[0],
                  'experiment': base_exp_id,
                  'biological_replicate_number': i + 1,
                  'technical_replicate_number': j + 1},
                 'REPLICATE')
            updated_replicates.append(d)
    return updated_replicates
def donor_based_rep_patch(base_exp, merge_exp):
    """Tech rep if matching donor, bio rep if different donor."""
    updated_replicates = []
    base_exp_id = base_exp['@id']
    base_exp_info = get_merge_info(base_exp)
    merge_exp_info = get_merge_info(merge_exp)
    all_donors = donor_set(base_exp_info).union(donor_set(merge_exp_info))
    all_reps = [r for r in es.chain(base_exp_info['reps'], merge_exp_info['reps'])]
    for i, donor in enumerate(sorted(all_donors)):
        reps_with_donor = filter_uuid_duplicates([r for r in all_reps if r[4] == donor])
        for j, rep in enumerate(reps_with_donor):
            d = ({'uuid': rep[0],
                  'experiment': base_exp_id,
                  'biological_replicate_number': i + 1,
                  'technical_replicate_number': j + 1},
                 'REPLICATE')
            updated_replicates.append(d)
    return updated_replicates
def clear_conflicting_rep_numbers(base_exp, merge_exp):
    # Temporarily push bio rep numbers far out of range (9999+) so the subsequent
    # renumbering does not collide with replicate numbers that already exist.
    updated_replicates = []
    base_exp_id = base_exp['@id']
    base_exp_info = get_merge_info(base_exp)
    merge_exp_info = get_merge_info(merge_exp)
    all_reps = [r for r in es.chain(base_exp_info['reps'], merge_exp_info['reps'])]
    for i, rep in enumerate(filter_uuid_duplicates(all_reps)):
        d = ({'uuid': rep[0],
              'biological_replicate_number': i + 9999},
             'CLEAR REPLICATE')
        updated_replicates.append(d)
    return updated_replicates
In [3]:
# Step 0: Get embedded data.
url = 'https://www.encodeproject.org/search/?type=Experiment'\
'&biosample_term_name=common+myeloid+progenitor%2C+CD34-positive'\
'&replicates.library.biosample.life_stage=adult&frame=embedded'\
'&replicates.library.biosample.donor.age=27&format=json&limit=all'\
'&accession!=ENCSR850RTJ&accession!=ENCSR681HMF'
assert 'embedded' in url
data = grab.quick_grab_data([url])
len(data)
Out[3]:
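As an aside, the same query can be assembled with urllib.parse.urlencode rather than hand-escaping it; a minimal sketch (parameter order differs from the hand-written URL above, and the field names simply mirror the filters already in that URL):
In [ ]:
# Convenience sketch only, not part of the original workflow: build the Step 0
# search URL with urlencode instead of writing the escaped query by hand.
from urllib.parse import urlencode
params = [('type', 'Experiment'),
          ('biosample_term_name', 'common myeloid progenitor, CD34-positive'),
          ('replicates.library.biosample.life_stage', 'adult'),
          ('replicates.library.biosample.donor.age', '27'),
          ('frame', 'embedded'),
          ('format', 'json'),
          ('limit', 'all'),
          ('accession!', 'ENCSR850RTJ'),
          ('accession!', 'ENCSR681HMF')]
url = 'https://www.encodeproject.org/search/?' + urlencode(params, safe='!')
assert 'embedded' in url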
In [608]:
# Step I: Match on
# - biosample_term_name
# - biosample_term_id
# - biosample_type
# - target.name
# - lab.name
parsed_data = list(parse_experiments(data))
pairs = list(has_pair(parsed_data, get_data=True))
unique_pairs = get_unique_pairs(pairs)
In [609]:
len(unique_pairs)
Out[609]:
In [610]:
print(*unique_pairs, sep='\n')
In [611]:
list(has_pair(parsed_data))
Out[611]:
In [612]:
# Print experiments that do not have exactly one matching partner
# (match count != 2, since each experiment also matches itself).
for d in parsed_data:
    matches = list(find_pair(parsed_data, {'biosample_term_name': (d['biosample_term_name'], 'equals'),
                                           'biosample_term_id': (d['biosample_term_id'], 'equals'),
                                           'biosample_type': (d['biosample_type'], 'equals'),
                                           'biosample_summary': (d['biosample_summary'], 'equals'),
                                           'target': (d['target'], 'equals'),
                                           'lab': (d['lab'], 'equals'),
                                           'assay_title': (d['assay_title'], 'equals')}))
    if len(matches) != 2:
        print('\n\n')
        print('SEARCH')
        for k, v in sorted(d.items()):
            print(k, v)
        print()
        print('MATCHES:', len(matches))
        for m in matches:
            for k, v in sorted(m.items()):
                print(k, v)
            print()
In [ ]:
[
'ENCSR522XTC',
'ENCSR858ATS',
'ENCSR838JUD'
]
'ENCSR505TBF', 'ENCSR072QNY'
'ENCSR024OWH', 'ENCSR905SHH'
'ENCSR316YOE', 'ENCSR231FDF'
'ENCSR267TWX', 'ENCSR631BPS'
'ENCSR256FCW', 'ENCSR694CDP'
'ENCSR404BUR', 'ENCSR797GOJ'
'ENCSR853ZXZ', 'ENCSR835OJV'
'ENCSR741MIH', 'ENCSR538NBB'
In [4]:
# Group 1
['ENCSR505TBF', 'ENCSR072QNY']
['ENCSR024OWH', 'ENCSR905SHH']
['ENCSR316YOE', 'ENCSR231FDF']
['ENCSR267TWX', 'ENCSR631BPS']
['ENCSR256FCW', 'ENCSR694CDP']
['ENCSR404BUR', 'ENCSR797GOJ']
['ENCSR853ZXZ', 'ENCSR835OJV']
Out[4]:
In [ ]:
# Group 2
['ENCSR741MIH', 'ENCSR538NBB']
['ENCSR780PDB', 'ENCSR769HUN']
['ENCSR515TBC', 'ENCSR136ZNV']
['ENCSR354LJH', 'ENCSR115VDV']
['ENCSR005YWW', 'ENCSR367VRA']
['ENCSR446FIP', 'ENCSR162VXO']
['ENCSR841ADQ', 'ENCSR201GCJ']
In [56]:
# Group 3
['ENCSR522XTC', 'ENCSR858ATS', 'ENCSR838JUD']
['ENCSR300MDI', 'ENCSR694MZZ', 'ENCSR312CKN']
['ENCSR794QZO', 'ENCSR541PBW', 'ENCSR995UTR']
['ENCSR686BAF', 'ENCSR910LOX', 'ENCSR023MFG']
['ENCSR994YFY', 'ENCSR964LDA', 'ENCSR002EZN']
['ENCSR160JSN', 'ENCSR433VDH', 'ENCSR819HSS']
['ENCSR983SYA', 'ENCSR581BGH', 'ENCSR100HGO']
['ENCSR395BYT', 'ENCSR071TJB', 'ENCSR525XER']
In [ ]:
# Group 4
['ENCSR974EGY', 'ENCSR208IMU']
['ENCSR185IVZ', 'ENCSR902BOX']
['ENCSR718NBZ', 'ENCSR113FMX']
['ENCSR995KPB', 'ENCSR621XWM']
['ENCSR265OGV', 'ENCSR277WTO']
In [31]:
# Group 5
['ENCSR859IKY', 'ENCSR568WPC', 'ENCSR185URR']
['ENCSR264MNL', 'ENCSR262VXI']
['ENCSR947HPP', 'ENCSR691XRZ', 'ENCSR555QHZ']
['ENCSR993MJT', 'ENCSR912CWB', 'ENCSR196LEI']
['ENCSR538WKF', 'ENCSR140JQU', 'ENCSR004AKD']
['ENCSR849ILZ', 'ENCSR629XLG', 'ENCSR006GPM']
['ENCSR866UWZ', 'ENCSR818XLT', 'ENCSR439EHQ']
In [ ]:
# Group 6
['ENCSR055UQS', 'ENCSR726WVB']
In [ ]:
In [142]:
# Using biosample-based replicate matching: replicates with different
# biosamples become separate biological replicates. // biosample_based_rep_patch(base_exp, merge_exp)
# Can change to donor-based replicate matching, in which case replicates
# that share the same donor become technical replicates. // donor_based_rep_patch(base_exp, merge_exp)
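# Hypothetical illustration of the difference (donor/biosample labels made up):
# given one replicate on (donorX, biosampleA) and another on (donorX, biosampleB),
#   biosample_based_rep_patch -> two biological replicates, one per biosample
#   donor_based_rep_patch     -> one biological replicate with two technical
#                                replicates, since both share donorX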
# Experiments to merge.
exps = ['ENCSR055UQS', 'ENCSR726WVB']
# Set server location.
base_url = 'https://www.encodeproject.org'
grab.base_url = base_url
# Set to True to actually send PATCH requests; False only prints the patch set.
update = True
# Must update:
# rep number in EXP_B replicates
# EXP_B.status to replaced
# EXP_A.alternate_accession.append(EXP_B)
# EXP_A.possible_controls.append(EXP_B)
# original_files.dataset in EXP_B to point to EXP_A
# dbxrefs in EXP_A to be list(set(EXP_A.dbxrefs, EXP_B.dbxrefs))
# aliases in EXP_A to be list(set(EXP_A.aliases, EXP_B.aliases))
# remove aliases from EXP_B.
# check for unique documents in EXP_B
# add submitter_comment to EXP_A explaining the merge
parent_base = exps.pop()
for z, exp in enumerate(exps):
    # {'accession': 'UUID', 'field': 'value'} to update.
    pair = (parent_base, exp)
    patch_set = []
    a, b = pair[0], pair[1]
    pair = get_pair_data([a, b], base_url)
    base_exp, merge_exp = parse_pair(pair)
    parent_base = base_exp['accession']
    assert base_exp['uuid'] != merge_exp['uuid'], 'Cannot merge same experiment'
    # Clear rep number conflicts.
    patch_set.extend(clear_conflicting_rep_numbers(base_exp, merge_exp))
    # Calculate new replicate numbers and point to base experiment.
    patch_set.extend(biosample_based_rep_patch(base_exp, merge_exp))
    # Replace EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'status': 'replaced'},
                      'REPLACE'))
    # Remove alternate_accessions from EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'alternate_accessions': []},
                      'CLEAR ALTERNATE ACCESSIONS'))
    # Add EXP_B to EXP_A alternate_accessions.
    patch_set.append(({'accession': base_exp['accession'],
                       'alternate_accessions': list(set(grab.chain(values_from_both('alternate_accessions'),
                                                                   [merge_exp['accession']])))},
                      'ALTERNATE ACCESSIONS'))
    # Remove possible_controls from EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'possible_controls': []},
                      'CLEAR POSSIBLE CONTROLS'))
    # Add EXP_B possible_controls to EXP_A possible_controls.
    patch_set.append(({'accession': base_exp['accession'],
                       'possible_controls': list(set(collapse_replacements(
                           values_from_both('possible_controls'), base_url)))},
                      'POSSIBLE CONTROLS'))
    # Update dataset in original files of EXP_B.
    patch_set.extend(original_files_patch(base_exp, merge_exp))
    # Update dbxrefs in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'dbxrefs': values_from_both('dbxrefs')},
                      'UPDATE DBXREFS'))
    # Remove aliases from EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'aliases': []},
                      'CLEAR ALIASES'))
    # Update aliases in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'aliases': values_from_both('aliases')},
                      'UPDATE ALIASES'))
    # Update documents in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'documents': values_from_both('documents')},
                      'UPDATE DOCUMENTS'))
    comment = 'Experiment {} merged into base.'.format(merge_exp['uuid'])
    sub_comment = ' '.join([get_merge_info(base_exp)['submitter_comment'],
                            get_merge_info(merge_exp)['submitter_comment']]).strip()
    patch_set.append(({'accession': base_exp['accession'],
                       'submitter_comment': comment if len(sub_comment) == 0 else '{} {}'.format(sub_comment,
                                                                                                 comment)},
                      'ADD COMMENT'))
    print('*PAIR {}*\n-----------'.format(z + 1))
    print('Base:', base_exp['accession'],
          '({})\n'.format(base_exp['uuid']),
          json.dumps(get_merge_info(base_exp), indent=4, sort_keys=True),
          '\n\nMerge:', merge_exp['accession'],
          '({})\n'.format(merge_exp['uuid']),
          json.dumps(get_merge_info(merge_exp), indent=4, sort_keys=True), '\n')
    for i, x in enumerate(patch_set):
        print(x[1])
        for k, v in sorted(x[0].items()):
            print('{}:'.format(k), v)
        print()
    if update:
        print('MAKE PATCH')
        make_patch(patch_set, base_url)
    print()
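As an optional follow-up, a minimal sanity-check sketch (not part of the original merge steps): it reuses the grab.get_data call pattern from get_redirect above, and base_exp / merge_exp still refer to the last pair processed by the loop.
In [ ]:
# Hedged sanity check: confirm the merged experiment was replaced and that the
# base experiment now lists it as an alternate accession.
check_base = grab.get_data('{}/{}?datastore=database'.format(base_url, base_exp['accession']))
check_merge = grab.get_data('{}/{}?datastore=database'.format(base_url, merge_exp['accession']))
assert check_merge.get('status') == 'replaced'
assert merge_exp['accession'] in check_base.get('alternate_accessions', [])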