In [84]:
import encode_utils.stream as es
import encode_utils.grab as grab

In [85]:
def find_pair(experiments, filter_dict):
    """Yield the entries of ``experiments`` accepted by ``es.match``.

    ``filter_dict`` maps a field name to a ``(value, comparison)`` pair. Each
    entry becomes one ``es.filter_field_by_comparison`` filter, and all of the
    filters are passed to ``es.match`` together with ``experiments``.
    """
    filters = [
        es.filter_field_by_comparison(field=name, value=spec[0], comparison=spec[1])
        for name, spec in filter_dict.items()
    ]
    yield from es.match(experiments, *filters)
    

def has_pair(parsed_data, get_data=False):
    """Report, for every parsed experiment, whether it has exactly one partner.

    Each entry of ``parsed_data`` is searched for in ``parsed_data`` itself,
    requiring equality on seven fields (see ``match_fields`` below). An entry
    counts as paired when the search returns exactly two matches.

    With ``get_data=False`` (default) a ``(accession, paired)`` tuple is
    yielded for every entry. With ``get_data=True`` only the list of matches
    of paired entries is yielded.

    ``parsed_data`` is iterated again for every entry, so it has to be
    re-iterable (e.g. a list).
    """
    match_fields = ('biosample_term_name',
                    'biosample_term_id',
                    'biosample_type',
                    'biosample_summary',
                    'target',
                    'lab',
                    'assay_title')
    for d in parsed_data:
        criteria = {field: (d[field], 'equals') for field in match_fields}
        matches = list(find_pair(parsed_data, criteria))
        paired = len(matches) == 2
        if not get_data:
            yield d['accession'], paired
        elif paired:
            yield matches


def official_accession(value):
    """Return True when ``value`` contains the substring ``'ENC'``."""
    return 'ENC' in value
            

def get_redirect(x, base_url):
    """Resolve ``x`` to an official identifier.

    A value that ``official_accession`` accepts is returned unchanged.
    Otherwise two lookups are made through ``grab.get_data``: the first
    fetches ``x`` and reads its ``accession``; the second fetches that
    accession and returns the ``@id`` of the response.

    Note that the first URL joins ``base_url`` and ``x`` without a separator
    while the second inserts a ``/`` between ``base_url`` and the accession.

    Raises AssertionError when either response lacks the expected key.
    """
    print('Getting redirect from {}'.format(base_url))
    if not official_accession(x):
        lookup_url = '{}{}?datastore=database'.format(base_url, x)
        accession = grab.get_data(lookup_url).get('accession', None)
        assert accession is not None
        redirect_url = '{}/{}?datastore=database'.format(base_url, accession)
        new_accession = grab.get_data(redirect_url).get('@id', None)
        assert new_accession is not None
        return new_accession
    return x
                      
    
def collapse_replacements(values, base_url):
    """Return a list with ``get_redirect`` applied to each of ``values`` in order."""
    resolved = []
    for value in values:
        resolved.append(get_redirect(value, base_url))
    return resolved


def get_unique_pairs(pairs):
    """Return the set of distinct accession pairs, ignoring orientation.

    Each element of ``pairs`` is indexed at ``[0]`` and ``[1]`` and the
    ``'accession'`` of both is read. A pair is kept, in the orientation in
    which it is first seen, only if neither it nor its reverse was kept before.
    """
    seen = set()
    for pair in pairs:
        left, right = pair[0]['accession'], pair[1]['accession']
        if (left, right) not in seen and (right, left) not in seen:
            seen.add((left, right))
    return seen

    
def parse_experiments(experiments):
    """Yield a flat summary dict for every experiment in ``experiments``.

    ``accession`` falls back to the experiment's ``uuid``. ``target`` and
    ``lab`` are reduced to their ``name``, and ``possible_controls`` to the
    list of each control's ``@id``. Missing text fields default to ``''`` and
    missing lists to ``[]``.
    """
    text_fields = ('biosample_summary',
                   'biosample_term_name',
                   'biosample_term_id',
                   'biosample_type')
    for exp in experiments:
        parsed = {'accession': exp.get('accession', exp.get('uuid'))}
        for field in text_fields:
            parsed[field] = exp.get(field, '')
        parsed['target'] = exp.get('target', {}).get('name', '')
        parsed['lab'] = exp.get('lab', {}).get('name', '')
        parsed['assay_title'] = exp.get('assay_title', '')
        parsed['alternate_accessions'] = exp.get('alternate_accessions', [])
        parsed['submitter_comment'] = exp.get('submitter_comment', '')
        parsed['possible_controls'] = [control.get('@id')
                                       for control in exp.get('possible_controls', [])]
        yield parsed
        
        
def get_pair_data(pair, base_url=grab.base_url):
    """Fetch the embedded, database-backed record of every item in ``pair``.

    One URL per item is built from ``base_url``, the item and
    ``grab.json_only``; the URLs are handed to ``grab.quick_grab_data`` and
    its result is returned unchanged.

    The default for ``base_url`` is the value ``grab.base_url`` had when this
    function was defined; later reassignments of ``grab.base_url`` do not
    change it.
    """
    urls = []
    for p in pair:
        urls.append('{}/{}/?{}&frame=embedded&datastore=database'.format(base_url,
                                                                         p,
                                                                         grab.json_only))
    return grab.quick_grab_data(urls)


def get_merge_info(e):
    """Collect the fields of experiment ``e`` that matter when merging.

    ``reps`` is a list with one tuple per replicate:
    ``(uuid, experiment, biological_replicate_number,
    technical_replicate_number, donor accession, biosample accession)``,
    where the last two are read from ``library.biosample`` and are None when
    any level is missing. The remaining keys are copied from ``e`` with empty
    defaults; ``possible_controls`` is reduced to a list of ``@id`` values.
    """
    reps = []
    for rep in e.get('replicates', {}):
        biosample = rep.get('library', {}).get('biosample', {})
        reps.append((rep.get('uuid'),
                     rep.get('experiment'),
                     rep.get('biological_replicate_number'),
                     rep.get('technical_replicate_number'),
                     biosample.get('donor', {}).get('accession'),
                     biosample.get('accession')))
    controls = [control.get('@id') for control in e.get('possible_controls', [])]
    return {'reps': reps,
            'original_files': e.get('original_files', {}),
            'dbxrefs': e.get('dbxrefs', []),
            'aliases': e.get('aliases', []),
            'documents': e.get('documents', []),
            'alternate_accessions': e.get('alternate_accessions', []),
            'submitter_comment': e.get('submitter_comment', ''),
            'possible_controls': controls}


def determine_base_exp(pair_a, pair_b):
    """
    Return True if pair_a should be base experiment else False.

    The experiment whose lowest biological replicate number is smaller wins.
    On a tie, pair_a wins unless pair_b has more replicate entries (the
    comparison is on the number of replicates, not on technical replicate
    numbers). ``min`` raises ValueError if either experiment has no replicates.
    """
    numbers_a = [rep[2] for rep in get_merge_info(pair_a)['reps']]
    numbers_b = [rep[2] for rep in get_merge_info(pair_b)['reps']]
    lowest_a, lowest_b = min(numbers_a), min(numbers_b)
    if lowest_a == lowest_b:
        # Same starting bio rep: prefer the experiment with more replicates.
        return not len(numbers_b) > len(numbers_a)
    # Otherwise prefer the one with the lower bio rep number.
    return lowest_a <= lowest_b


def rep_patch(base_exp, merge_exp):
    """Build patches that move merge_exp's replicates onto base_exp.

    Every distinct biological replicate number of ``merge_exp`` (ascending)
    is mapped to the next free number after the highest biological replicate
    number of ``base_exp``. Within one biological replicate the technical
    replicates are renumbered from 1, ordered by their current technical
    replicate number.

    Returns a list of ``(patch_dict, 'REPLICATE')`` tuples; each patch_dict
    holds the replicate ``uuid``, the new ``experiment`` (``base_exp['@id']``)
    and the new replicate numbers.

    Raises ValueError (from ``max``) if ``base_exp`` has no replicates.
    """
    base_exp_id = base_exp['@id']
    # Look the base replicates up once; no donor information is needed here.
    base_reps = get_merge_info(base_exp)['reps']
    next_bio_rep = max(rep[2] for rep in base_reps) + 1
    merge_reps = sorted(get_merge_info(merge_exp)['reps'], key=lambda rep: rep[2])
    updated_replicates = []
    for j, bio_rep in enumerate(sorted({rep[2] for rep in merge_reps})):
        same_bio_rep = [rep for rep in merge_reps if rep[2] == bio_rep]
        for i, rep in enumerate(sorted(same_bio_rep, key=lambda rep: rep[3])):
            patch = {'uuid': rep[0],
                     'experiment': base_exp_id,
                     'biological_replicate_number': next_bio_rep + j,
                     'technical_replicate_number': i + 1}
            updated_replicates.append((patch, 'REPLICATE'))
    return updated_replicates

def original_files_patch(base_exp, merg_exp):
    """Build patches that point merg_exp's original files at base_exp.

    Returns a list of ``(patch_dict, 'ORIGINAL FILE')`` tuples, one per entry
    of ``merg_exp``'s ``original_files``; each patch_dict carries the file
    under ``accession`` and ``base_exp['@id']`` under ``dataset``.
    """
    base_exp_id = base_exp['@id']
    return [({'accession': original_file,
              'dataset': base_exp_id},
             'ORIGINAL FILE')
            for original_file in get_merge_info(merg_exp)['original_files']]


def parse_pair(pair):
    """Return ``(base_exp, merge_exp)`` for the two experiments in ``pair``.

    ``determine_base_exp`` decides whether the first or the second element
    becomes the base experiment.
    """
    first, second = pair[0], pair[1]
    if determine_base_exp(first, second):
        return first, second
    return second, first


def values_from_both(field):
    """Return the de-duplicated values of ``field`` from both experiments.

    Reads the module-level names ``base_exp`` and ``merge_exp`` (they are not
    parameters), so both must be defined before calling. The order of the
    returned list is that of a set and therefore not meaningful.
    """
    combined = set(get_merge_info(base_exp)[field])
    combined.update(get_merge_info(merge_exp)[field])
    return list(combined)


def patch_item(url, item, data, auth, show_output=False):
    """Send a PATCH request with ``data`` as JSON body and report the result.

    Args:
        url: Full URL of the object to patch.
        item: Identifier of the object; only used in the log line.
        data: JSON-serialisable patch body.
        auth: Credentials passed to ``requests``. ``None`` falls back to
            ``grab.auth``.
        show_output: When True, always print the response body; otherwise it
            is printed only for a status code other than 200.

    Returns:
        The HTTP status code of the response.
    """
    # Imported locally so the function does not depend on a module-level
    # ``json`` name being available.
    import json

    credentials = grab.auth if auth is None else auth
    r = grab.requests.patch(url,
                            auth=credentials,
                            json=data)
    print('PATCHING: {}'.format(item), data)
    if show_output or r.status_code != 200:
        try:
            body = r.json()
        except ValueError:
            # Error responses are not guaranteed to be JSON; show them raw.
            print(r.text)
        else:
            print(json.dumps(body, indent=4, sort_keys=True))
    return r.status_code


def parse_patch_set(patch):
    """Split a patch tuple into ``(item, data)``.

    ``patch[0]`` is the patch dict. ``item`` is its ``accession``, falling
    back to its ``uuid``. ``data`` is a shallow copy of the dict without the
    ``accession`` and ``uuid`` keys; the original dict is left untouched.
    """
    data = patch[0].copy()
    item = data.get('accession', data.get('uuid'))
    for identifier in ('accession', 'uuid'):
        data.pop(identifier, None)
    return item, data


def make_patch(patch_set, base_url):
    """Apply every patch in ``patch_set`` against ``base_url``.

    Each patch is split with ``parse_patch_set``; the target URL is
    ``grab.urljoin(base_url, item)`` and the request is sent by
    ``patch_item`` with ``grab.auth``. Status codes are not collected.
    """
    print('Patching on: {}'.format(base_url))
    # map() is lazy, so each patch is parsed right before it is sent.
    for item, data in map(parse_patch_set, patch_set):
        patch_item(grab.urljoin(base_url, item), item, data, grab.auth)
        

def donor_set(merge_info):
    """Return the set of donor values (tuple index 4) in ``merge_info['reps']``."""
    return {rep[4] for rep in merge_info['reps']}


def biosample_set(merge_info):
    """Return the set of biosample values (tuple index 5) in ``merge_info['reps']``."""
    return {rep[5] for rep in merge_info['reps']}


def filter_uuid_duplicates(reps):
    """Return ``reps`` without repeats of the same uuid (element 0).

    The first occurrence of each uuid is kept and the input order is
    preserved.
    """
    seen = set()
    unique = []
    for rep in reps:
        uuid = rep[0]
        if uuid not in seen:
            seen.add(uuid)
            unique.append(rep)
    return unique

    
def biosample_based_rep_patch(base_exp, merge_exp):
    """Tech rep if matching biosample, bio rep if different biosample.

    The replicates of both experiments are pooled. Each distinct biosample
    (sorted) gets one biological replicate number starting at 1; replicates
    sharing a biosample are numbered as technical replicates from 1 in pooled
    order, after dropping repeated uuids.

    Returns a list of ``(patch_dict, 'REPLICATE')`` tuples that assign every
    replicate to ``base_exp['@id']``.
    """
    base_exp_id = base_exp['@id']
    base_exp_info = get_merge_info(base_exp)
    merge_exp_info = get_merge_info(merge_exp)
    all_reps = list(es.chain(base_exp_info['reps'], merge_exp_info['reps']))
    all_biosamples = biosample_set(base_exp_info) | biosample_set(merge_exp_info)
    updated_replicates = []
    for bio_number, biosample in enumerate(sorted(all_biosamples), start=1):
        sharing = filter_uuid_duplicates([r for r in all_reps if r[5] == biosample])
        for tech_number, rep in enumerate(sharing, start=1):
            patch = {'uuid': rep[0],
                     'experiment': base_exp_id,
                     'biological_replicate_number': bio_number,
                     'technical_replicate_number': tech_number}
            updated_replicates.append((patch, 'REPLICATE'))
    return updated_replicates
    

def donor_based_rep_patch(base_exp, merge_exp):
    """Tech rep if matching donor, bio rep if different donor.

    The replicates of both experiments are pooled. Each distinct donor
    (sorted) gets one biological replicate number starting at 1; replicates
    sharing a donor are numbered as technical replicates from 1 in pooled
    order, after dropping repeated uuids.

    Returns a list of ``(patch_dict, 'REPLICATE')`` tuples that assign every
    replicate to ``base_exp['@id']``.
    """
    base_exp_id = base_exp['@id']
    base_exp_info = get_merge_info(base_exp)
    merge_exp_info = get_merge_info(merge_exp)
    pooled_reps = list(es.chain(base_exp_info['reps'], merge_exp_info['reps']))
    donors = donor_set(base_exp_info) | donor_set(merge_exp_info)
    updated_replicates = []
    for bio_number, donor in enumerate(sorted(donors), start=1):
        from_donor = filter_uuid_duplicates([r for r in pooled_reps if r[4] == donor])
        for tech_number, rep in enumerate(from_donor, start=1):
            updated_replicates.append(({'uuid': rep[0],
                                        'experiment': base_exp_id,
                                        'biological_replicate_number': bio_number,
                                        'technical_replicate_number': tech_number},
                                       'REPLICATE'))
    return updated_replicates

def clear_conflicting_rep_numbers(base_exp, merge_exp, start=9999):
    """Build patches that give every replicate a distinct placeholder number.

    The replicates of both experiments are pooled, repeated uuids are
    dropped, and the i-th remaining replicate gets
    ``biological_replicate_number = start + i``. Only the biological
    replicate number is touched.

    Args:
        base_exp: Experiment that is kept.
        merge_exp: Experiment that is merged into ``base_exp``.
        start: First placeholder number (default 9999).
            NOTE(review): presumably chosen high enough not to collide with
            real replicate numbers -- confirm.

    Returns:
        A list of ``(patch_dict, 'CLEAR REPLICATE')`` tuples.
    """
    base_exp_info = get_merge_info(base_exp)
    merge_exp_info = get_merge_info(merge_exp)
    all_reps = list(es.chain(base_exp_info['reps'], merge_exp_info['reps']))
    return [({'uuid': rep[0],
              'biological_replicate_number': i + start},
             'CLEAR REPLICATE')
            for i, rep in enumerate(filter_uuid_duplicates(all_reps))]

In [3]:
# Step 0: Get embedded data.
# The search URL is assembled from adjacent string literals; the two
# `accession!=` terms exclude those experiments from the result.
url = ('https://www.encodeproject.org/search/?type=Experiment'
       '&biosample_term_name=common+myeloid+progenitor%2C+CD34-positive'
       '&replicates.library.biosample.life_stage=adult&frame=embedded'
       '&replicates.library.biosample.donor.age=27&format=json&limit=all'
       '&accession!=ENCSR850RTJ&accession!=ENCSR681HMF')
# Guard: the code below reads nested fields, so the embedded frame is required.
assert 'embedded' in url
data = grab.quick_grab_data([url])
len(data)


Out[3]:
9

In [608]:
# Step I: Match on (all compared for equality by has_pair)
#     - biosample_term_name
#     - biosample_term_id
#     - biosample_type
#     - biosample_summary
#     - target.name
#     - lab.name
#     - assay_title

# Flatten the experiments, keep the match lists of experiments that have
# exactly two matches, then collapse (A, B) / (B, A) duplicates.
parsed_data = list(parse_experiments(data))
pairs = list(has_pair(parsed_data, get_data=True))
unique_pairs = get_unique_pairs(pairs)

In [609]:
# Number of distinct accession pairs in unique_pairs.
len(unique_pairs)


Out[609]:
5

In [610]:
# Print one accession pair per line.
print(*unique_pairs, sep='\n')


('ENCSR672EWT', 'ENCSR439WHW')
('ENCSR466AGC', 'ENCSR652CZT')
('ENCSR284HLC', 'ENCSR891KSP')
('ENCSR979YDQ', 'ENCSR340SGE')
('ENCSR122VUW', 'ENCSR603BXE')

In [611]:
# Show (accession, has_pair) for every parsed experiment.
list(has_pair(parsed_data))


Out[611]:
[('ENCSR284HLC', True),
 ('ENCSR908QAN', False),
 ('ENCSR672EWT', True),
 ('ENCSR731OMG', False),
 ('ENCSR337LWR', False),
 ('ENCSR754BBV', False),
 ('ENCSR707TMM', False),
 ('ENCSR122VUW', True),
 ('ENCSR919RJD', False),
 ('ENCSR439WHW', True),
 ('ENCSR942LVG', False),
 ('ENCSR071XJU', False),
 ('ENCSR862NIZ', False),
 ('ENCSR466AGC', True),
 ('ENCSR830PAC', False),
 ('ENCSR979YDQ', True),
 ('ENCSR652CZT', True),
 ('ENCSR340SGE', True),
 ('ENCSR836LAC', False),
 ('ENCSR835DEF', False),
 ('ENCSR891KSP', True),
 ('ENCSR734ESI', False),
 ('ENCSR603BXE', True)]

In [612]:
# Print every experiment whose number of matches is not exactly two.
# NOTE: in the recorded output each experiment also appears among its own
# matches, so a count of two corresponds to exactly one partner.
match_fields = ('biosample_term_name',
                'biosample_term_id',
                'biosample_type',
                'biosample_summary',
                'target',
                'lab',
                'assay_title')
for d in parsed_data:
    matches = list(find_pair(parsed_data,
                             {field: (d[field], 'equals') for field in match_fields}))
    if len(matches) == 2:
        continue
    print('\n\n')
    print('SEARCH')
    for k, v in sorted(d.items()):
        print(k, v)
    print()
    print('MATCHES:', len(matches))
    for m in matches:
        for k, v in sorted(m.items()):
            print(k, v)
        print()




SEARCH
accession ENCSR908QAN
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

MATCHES: 3
accession ENCSR908QAN
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

accession ENCSR731OMG
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

accession ENCSR835DEF
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human




SEARCH
accession ENCSR731OMG
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

MATCHES: 3
accession ENCSR908QAN
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

accession ENCSR731OMG
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

accession ENCSR835DEF
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human




SEARCH
accession ENCSR337LWR
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

MATCHES: 3
accession ENCSR337LWR
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

accession ENCSR942LVG
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

accession ENCSR734ESI
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human




SEARCH
accession ENCSR754BBV
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

MATCHES: 4
accession ENCSR754BBV
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR707TMM
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR919RJD
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR071XJU
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human




SEARCH
accession ENCSR707TMM
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

MATCHES: 4
accession ENCSR754BBV
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR707TMM
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR919RJD
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR071XJU
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human




SEARCH
accession ENCSR919RJD
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

MATCHES: 4
accession ENCSR754BBV
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR707TMM
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR919RJD
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR071XJU
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human




SEARCH
accession ENCSR942LVG
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

MATCHES: 3
accession ENCSR337LWR
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

accession ENCSR942LVG
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

accession ENCSR734ESI
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human




SEARCH
accession ENCSR071XJU
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

MATCHES: 4
accession ENCSR754BBV
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR707TMM
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR919RJD
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human

accession ENCSR071XJU
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target Control-human




SEARCH
accession ENCSR862NIZ
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

MATCHES: 3
accession ENCSR862NIZ
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

accession ENCSR830PAC
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

accession ENCSR836LAC
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human




SEARCH
accession ENCSR830PAC
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

MATCHES: 3
accession ENCSR862NIZ
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

accession ENCSR830PAC
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

accession ENCSR836LAC
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human




SEARCH
accession ENCSR836LAC
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

MATCHES: 3
accession ENCSR862NIZ
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

accession ENCSR830PAC
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human

accession ENCSR836LAC
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K27me3-human




SEARCH
accession ENCSR835DEF
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

MATCHES: 3
accession ENCSR908QAN
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

accession ENCSR731OMG
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human

accession ENCSR835DEF
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K36me3-human




SEARCH
accession ENCSR734ESI
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

MATCHES: 3
accession ENCSR337LWR
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

accession ENCSR942LVG
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human

accession ENCSR734ESI
alternate_accessions []
assay_title ChIP-seq
biosample_summary common myeloid progenitor, CD34-positive female adult (27 years)
biosample_term_id CL:0001059
biosample_term_name common myeloid progenitor, CD34-positive
biosample_type primary cell
lab bradley-bernstein
target H3K9me3-human


In [ ]:
[
    'ENCSR522XTC',
    'ENCSR858ATS',
    'ENCSR838JUD'
]

'ENCSR505TBF', 'ENCSR072QNY'
'ENCSR024OWH', 'ENCSR905SHH'
'ENCSR316YOE', 'ENCSR231FDF'
'ENCSR267TWX', 'ENCSR631BPS'
'ENCSR256FCW', 'ENCSR694CDP'
'ENCSR404BUR', 'ENCSR797GOJ'
'ENCSR853ZXZ', 'ENCSR835OJV'  
'ENCSR741MIH', 'ENCSR538NBB'

In [4]:
# Group 1
['ENCSR505TBF', 'ENCSR072QNY']
['ENCSR024OWH', 'ENCSR905SHH']
['ENCSR316YOE', 'ENCSR231FDF']
['ENCSR267TWX', 'ENCSR631BPS']
['ENCSR256FCW', 'ENCSR694CDP']
['ENCSR404BUR', 'ENCSR797GOJ']
['ENCSR853ZXZ', 'ENCSR835OJV']


Out[4]:
['ENCSR853ZXZ', 'ENCSR835OJV']

In [ ]:
# Group 2
['ENCSR741MIH', 'ENCSR538NBB']
['ENCSR780PDB', 'ENCSR769HUN']
['ENCSR515TBC', 'ENCSR136ZNV']
['ENCSR354LJH', 'ENCSR115VDV']
['ENCSR005YWW', 'ENCSR367VRA']
['ENCSR446FIP', 'ENCSR162VXO']
['ENCSR841ADQ', 'ENCSR201GCJ']

In [56]:
['ENCSR522XTC', 'ENCSR858ATS', 'ENCSR838JUD']
['ENCSR300MDI', 'ENCSR694MZZ', 'ENCSR312CKN']
['ENCSR794QZO', 'ENCSR541PBW', 'ENCSR995UTR']
['ENCSR686BAF', 'ENCSR910LOX', 'ENCSR023MFG']
['ENCSR994YFY', 'ENCSR964LDA', 'ENCSR002EZN']
['ENCSR160JSN', 'ENCSR433VDH', 'ENCSR819HSS']
['ENCSR983SYA', 'ENCSR581BGH', 'ENCSR100HGO']
['ENCSR395BYT', 'ENCSR071TJB', 'ENCSR525XER']

In [ ]:
# group 4
['ENCSR974EGY', 'ENCSR208IMU']
['ENCSR185IVZ', 'ENCSR902BOX']
['ENCSR718NBZ', 'ENCSR113FMX']
['ENCSR995KPB', 'ENCSR621XWM']
['ENCSR265OGV', 'ENCSR277WTO']

In [31]:
# group 5
['ENCSR859IKY', 'ENCSR568WPC', 'ENCSR185URR']
['ENCSR264MNL', 'ENCSR262VXI']
['ENCSR947HPP', 'ENCSR691XRZ', 'ENCSR555QHZ']
['ENCSR993MJT', 'ENCSR912CWB', 'ENCSR196LEI']
['ENCSR538WKF', 'ENCSR140JQU', 'ENCSR004AKD']
['ENCSR849ILZ', 'ENCSR629XLG', 'ENCSR006GPM']
['ENCSR866UWZ', 'ENCSR818XLT', 'ENCSR439EHQ']

In [ ]:
# group 6
['ENCSR055UQS','ENCSR726WVB']

In [ ]:


In [142]:
# Merge cell: fold the experiments listed in `exps` into one base experiment.
# For each (base, merge) pair an ordered list of patches is built, printed for
# review, and - only when `update` is True - handed to make_patch.
#
# Using biosample-based replicate match. Replicates with different
# biosamples become biological replicates. // biosample_based_rep_patch(base_exp, merge_exp)
# Can change to use donor-based replicate match, in which case replicates
# that share the same donor are technical replicates. // donor_based_rep_patch(base_exp, merge_exp)

# Experiments to merge.
exps = ['ENCSR055UQS','ENCSR726WVB']

# Set server location.
base_url = 'https://www.encodeproject.org'
grab.base_url = base_url

# Change to True to really patch.
# NOTE: currently True, so running this cell calls make_patch against base_url.
update = True

# Must update:
# rep number in EXP_B replicates
# EXP_B.status to replaced
# EXP_A.alternate_accession.append(EXP_B)
# EXP_A.possible_controls.append(EXP_B)
# original_files.dataset in EXP_B to point to EXP_A
# dbxrefs in EXP_A to be list(set(EXP_A.dbxrefs, EXP_B.dbxrefs))
# aliases in EXP_A to be list(set(EXP_A.aliases, EXP_B.aliases))
# remove aliases from EXP_B.
# check for unique documents in EXP_B
# add submitter_comment to EXP_A explaining the merge

# The last accession in `exps` is removed and used as the initial base; every
# remaining accession is merged into the running base, one per iteration.
parent_base = exps.pop()
for z, exp in enumerate(exps):
    # {'accession': 'UUID', 'field': 'value'} to update.
    # Each patch_set entry is a (payload dict, label) tuple; the label is only
    # printed below for review.
    pair = (parent_base, exp)
    patch_set = []
    a, b = pair[0], pair[1]
    pair = get_pair_data([a, b], base_url)
    # parse_pair decides which experiment is the base and which is merged;
    # its selection rule is defined outside this cell.
    base_exp, merge_exp = parse_pair(pair)
    # Carry the chosen base forward so the next iteration merges into it.
    parent_base = base_exp['accession']
    assert base_exp['uuid'] != merge_exp['uuid'], 'Cannot merge same experiment'
    # NOTE(review): values_from_both is called below with only a field name, so
    # it presumably reads base_exp/merge_exp from the notebook's globals -
    # confirm before renaming either variable.
    # Clear rep number conflicts.
    patch_set.extend(clear_conflicting_rep_numbers(base_exp, merge_exp))
    # Calculate new replicate numbers and point to base experiment.
    patch_set.extend(biosample_based_rep_patch(base_exp, merge_exp))
    # Replace EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'status': 'replaced'},
                      'REPLACE'))
    # Remove alternate_accessions from EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'alternate_accessions': []},
                      'CLEAR ALTERNATE ACCESSIONS'))
    # Add EXP_B to EXP_A alternate_accessions.
    # Values pass through set() to drop duplicates, so list order is not
    # guaranteed.
    patch_set.append(({'accession': base_exp['accession'],
                       'alternate_accessions': list(set([a for a in
                                                         grab.chain(values_from_both('alternate_accessions'),
                                                                    [merge_exp['accession']])]))},
                      'ALTERNATE ACCESSIONS'))
    # Remove possible_controls from EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'possible_controls': []},
                      'CLEAR POSSIBLE CONTROLS'))
    # Add EXP_B possible_controls to EXP_A possible_controls.
    # collapse_replacements (defined earlier in the notebook) passes each value
    # through get_redirect before the set() de-duplication.
    patch_set.append(({'accession': base_exp['accession'],
                       'possible_controls': list(set([a for a in
                                                      collapse_replacements(
                                                          values_from_both('possible_controls'), base_url)]
                                                     ))},
                      'POSSIBLE CONTROLS'))
    # Update dataset in original files of EXP_B.
    patch_set.extend(original_files_patch(base_exp, merge_exp))
    # Update dbxrefs in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'dbxrefs': values_from_both('dbxrefs')},
                      'UPDATE DBXREFS'))
    # Remove aliases from EXP_B.
    patch_set.append(({'accession': merge_exp['uuid'],
                       'aliases': []},
                      'CLEAR ALIASES'))
    # Update aliases in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'aliases': values_from_both('aliases')},
                      'UPDATE ALIASES'))
    # Update documents in EXP_A.
    patch_set.append(({'accession': base_exp['accession'],
                       'documents': values_from_both('documents')},
                      'UPDATE DOCUMENTS'))
    # Build the new submitter_comment: existing comments from both experiments
    # (space-joined and stripped) followed by a note naming the merged
    # experiment's uuid; if neither had a comment, only the note is used.
    comment = 'Experiment {} merged into base.'.format(merge_exp['uuid'])
    sub_comment = ' '.join([get_merge_info(base_exp)['submitter_comment'],
                            get_merge_info(merge_exp)['submitter_comment']]).strip()
    patch_set.append(({'accession': base_exp['accession'],
                       'submitter_comment': comment if len(sub_comment) == 0 else '{} {}'.format(sub_comment,
                                                                                                 comment)},
                      'ADD COMMENT'))

    # Print a review of the pair: merge-relevant fields of both experiments,
    # then every queued patch (label followed by its sorted payload items).
    print('*PAIR {}*\n-----------'.format(z + 1))
    print('Base:', base_exp['accession'],
          '({})\n'.format(base_exp['uuid']),
          json.dumps(get_merge_info(base_exp), indent=4, sort_keys=True),
          '\n\nMerge:', merge_exp['accession'],
          '({})\n'.format(merge_exp['uuid']),
          json.dumps(get_merge_info(merge_exp), indent=4, sort_keys=True), '\n')
    for i, x in enumerate(patch_set):
        print(x[1])
        for k, v in sorted(x[0].items()):
            print('{}:'.format(k), v)
        print()
    # Apply the queued patches in order only when explicitly enabled above.
    if update:
        print('MAKE PATCH')
        make_patch(patch_set, base_url)
    print()


Getting redirect from https://www.encodeproject.org
*PAIR 1*
-----------
Base: ENCSR726WVB (f4f50dd5-9874-4224-95e0-c3aaf8ca833a)
 {
    "aliases": [
        "roadmap-epigenomics:ChIP-Seq analysis of H3K27ac in human substantia nigra cells_Nov-20-2012_30808"
    ],
    "alternate_accessions": [],
    "dbxrefs": [
        "GEO:GSM1112778"
    ],
    "documents": [],
    "original_files": [
        "/files/ENCFF549OOU/",
        "/files/ENCFF479MLV/",
        "/files/ENCFF647YZH/"
    ],
    "possible_controls": [
        "/experiments/ENCSR770OIC/"
    ],
    "reps": [
        [
            "675a4296-e4f2-4ab7-b460-ee41ba404840",
            "/experiments/ENCSR726WVB/",
            1,
            2,
            "ENCDO980BZD",
            "ENCBS552BYU"
        ]
    ],
    "submitter_comment": ""
} 

Merge: ENCSR055UQS (49d35ef3-621d-48fc-b9bf-4f24d97f3d03)
 {
    "aliases": [
        "roadmap-epigenomics:ChIP-Seq analysis of H3K27ac in human substantia nigra cells_Jun-28-2012_92129"
    ],
    "alternate_accessions": [],
    "dbxrefs": [
        "GEO:GSM997258"
    ],
    "documents": [],
    "original_files": [
        "/files/ENCFF565OEL/",
        "/files/ENCFF323ESA/",
        "/files/ENCFF205QOL/"
    ],
    "possible_controls": [
        "/experiments/ENCSR770OIC/"
    ],
    "reps": [
        [
            "f81bcfe6-53f9-48b8-8b90-101aed1e974a",
            "/experiments/ENCSR055UQS/",
            1,
            2,
            "ENCDO980BZD",
            "ENCBS246ADS"
        ]
    ],
    "submitter_comment": ""
} 

CLEAR REPLICATE
biological_replicate_number: 9999
uuid: 675a4296-e4f2-4ab7-b460-ee41ba404840

CLEAR REPLICATE
biological_replicate_number: 10000
uuid: f81bcfe6-53f9-48b8-8b90-101aed1e974a

REPLICATE
biological_replicate_number: 1
experiment: /experiments/ENCSR726WVB/
technical_replicate_number: 1
uuid: f81bcfe6-53f9-48b8-8b90-101aed1e974a

REPLICATE
biological_replicate_number: 2
experiment: /experiments/ENCSR726WVB/
technical_replicate_number: 1
uuid: 675a4296-e4f2-4ab7-b460-ee41ba404840

REPLACE
accession: 49d35ef3-621d-48fc-b9bf-4f24d97f3d03
status: replaced

CLEAR ALTERNATE ACCESSIONS
accession: 49d35ef3-621d-48fc-b9bf-4f24d97f3d03
alternate_accessions: []

ALTERNATE ACCESSIONS
accession: ENCSR726WVB
alternate_accessions: ['ENCSR055UQS']

CLEAR POSSIBLE CONTROLS
accession: 49d35ef3-621d-48fc-b9bf-4f24d97f3d03
possible_controls: []

POSSIBLE CONTROLS
accession: ENCSR726WVB
possible_controls: ['/experiments/ENCSR770OIC/']

ORIGINAL FILE
accession: /files/ENCFF565OEL/
dataset: /experiments/ENCSR726WVB/

ORIGINAL FILE
accession: /files/ENCFF323ESA/
dataset: /experiments/ENCSR726WVB/

ORIGINAL FILE
accession: /files/ENCFF205QOL/
dataset: /experiments/ENCSR726WVB/

UPDATE DBXREFS
accession: ENCSR726WVB
dbxrefs: ['GEO:GSM1112778', 'GEO:GSM997258']

CLEAR ALIASES
accession: 49d35ef3-621d-48fc-b9bf-4f24d97f3d03
aliases: []

UPDATE ALIASES
accession: ENCSR726WVB
aliases: ['roadmap-epigenomics:ChIP-Seq analysis of H3K27ac in human substantia nigra cells_Jun-28-2012_92129', 'roadmap-epigenomics:ChIP-Seq analysis of H3K27ac in human substantia nigra cells_Nov-20-2012_30808']

UPDATE DOCUMENTS
accession: ENCSR726WVB
documents: []

ADD COMMENT
accession: ENCSR726WVB
submitter_comment: Experiment 49d35ef3-621d-48fc-b9bf-4f24d97f3d03 merged into base.

MAKE PATCH
Patching on: https://www.encodeproject.org
PATCHING: 675a4296-e4f2-4ab7-b460-ee41ba404840 {'biological_replicate_number': 9999}
PATCHING: f81bcfe6-53f9-48b8-8b90-101aed1e974a {'biological_replicate_number': 10000}
PATCHING: f81bcfe6-53f9-48b8-8b90-101aed1e974a {'experiment': '/experiments/ENCSR726WVB/', 'technical_replicate_number': 1, 'biological_replicate_number': 1}
PATCHING: 675a4296-e4f2-4ab7-b460-ee41ba404840 {'experiment': '/experiments/ENCSR726WVB/', 'technical_replicate_number': 1, 'biological_replicate_number': 2}
PATCHING: 49d35ef3-621d-48fc-b9bf-4f24d97f3d03 {'status': 'replaced'}
PATCHING: 49d35ef3-621d-48fc-b9bf-4f24d97f3d03 {'alternate_accessions': []}
PATCHING: ENCSR726WVB {'alternate_accessions': ['ENCSR055UQS']}
PATCHING: 49d35ef3-621d-48fc-b9bf-4f24d97f3d03 {'possible_controls': []}
PATCHING: ENCSR726WVB {'possible_controls': ['/experiments/ENCSR770OIC/']}
PATCHING: /files/ENCFF565OEL/ {'dataset': '/experiments/ENCSR726WVB/'}
PATCHING: /files/ENCFF323ESA/ {'dataset': '/experiments/ENCSR726WVB/'}
PATCHING: /files/ENCFF205QOL/ {'dataset': '/experiments/ENCSR726WVB/'}
PATCHING: ENCSR726WVB {'dbxrefs': ['GEO:GSM1112778', 'GEO:GSM997258']}
PATCHING: 49d35ef3-621d-48fc-b9bf-4f24d97f3d03 {'aliases': []}
PATCHING: ENCSR726WVB {'aliases': ['roadmap-epigenomics:ChIP-Seq analysis of H3K27ac in human substantia nigra cells_Jun-28-2012_92129', 'roadmap-epigenomics:ChIP-Seq analysis of H3K27ac in human substantia nigra cells_Nov-20-2012_30808']}
PATCHING: ENCSR726WVB {'documents': []}
PATCHING: ENCSR726WVB {'submitter_comment': 'Experiment 49d35ef3-621d-48fc-b9bf-4f24d97f3d03 merged into base.'}


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: