In [ ]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import asyncio
import aiohttp
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
from ast import literal_eval
from collections import defaultdict
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [ ]:
# Copied from pyencoded-tools/encodedcc.py to avoid dependency.
class ENC_Key:
    def __init__(self, keyfile, keyname):
        if os.path.isfile(str(keyfile)):
            with open(keyfile, 'r') as keys_f:
                keys = json.load(keys_f)
        else:
            keys = keyfile
        key_dict = keys[keyname]
        self.authid = key_dict['key']
        self.authpw = key_dict['secret']
        self.server = key_dict['server']
        if not self.server.endswith("/"):
            self.server += "/"

            
class ENC_Connection(object):
    def __init__(self, key):
        self.headers = {'content-type': 'application/json', 'accept': 'application/json'}
        self.server = key.server
        self.auth = (key.authid, key.authpw)

In [ ]:
# Define key if private data desired.
key = ENC_Key(os.path.expanduser("~/keypairs.json"), 'prod')
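
ENC_Key assumes keypairs.json maps a key name to a dict with 'key', 'secret', and 'server' entries; the sketch below shows the assumed layout with placeholder values.


In [ ]:
# Hypothetical keypairs.json layout assumed by ENC_Key (placeholder values):
# {
#     "prod": {
#         "key": "XXXXXXXX",
#         "secret": "xxxxxxxxxxxxxxxx",
#         "server": "https://www.encodeproject.org/"
#     }
# }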

Get accessions for all replaced items


In [ ]:
# Pull accessions of all Items with replaced status.
url = 'https://www.encodeproject.org/search/'\
      '?type=File&type=Dataset&type=Donor&type=Library'\
      '&type=Pipeline&type=Biosample&type=AntibodyLot&status=replaced'\
      '&limit=all&format=json'
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']

In [ ]:
len(search_results)

In [ ]:
accessions = set()
for result in search_results:
    accessions.add(result['accession'])

In [ ]:
len(accessions)

Search for each accession and check the number of results


In [ ]:
# loop.close()
# loop = asyncio.new_event_loop()
# asyncio.set_event_loop(loop)

In [ ]:
# Asyncio request.

result_length = []
bad_accessions = []
request_auth = aiohttp.BasicAuth(key.authid, key.authpw)

async def get_json(url, sem):
    async with sem:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, auth=request_auth) as resp:
                return await resp.json()

async def get_request(accession, sem):
    url = 'https://www.encodeproject.org/'\
          'search/?type=Item&accession={}'\
          '&limit=all&format=json'.format(accession)
    result = await get_json(url, sem)
    search_results = result['@graph']
    num_results = len(search_results)
    result_length.append({'accession': accession,
                          'result_length': num_results})
    if num_results > 1:
        bad_accessions.append({'accession': accession,
                               'results': search_results})

sem = asyncio.Semaphore(20)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*[get_request(accession, sem) for accession in accessions]));
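
If this cell raises "This event loop is already running" under Jupyter, an alternative to the commented loop reset above is the third-party nest_asyncio package (a sketch, assuming it is installed).


In [ ]:
# Optional workaround (assumes the nest_asyncio package is installed):
# import nest_asyncio
# nest_asyncio.apply()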

In [ ]:
# # Search for each accession, count number of results.
# counter = 0 
# result_length = []
# bad_accessions = []
# for accession in accessions:
#     url = 'https://www.encodeproject.org/search/'\
#           '?type=Item&accession={}'\
#           '&limit=all&format=json'.format(accession)
#     r = requests.get(url, auth=(key.authid, key.authpw))
#     search_results = r.json()['@graph']
#     result_length.append({'accession': accession,
#                           'result_length': len(search_results)})
#     if len(search_results) > 1:
#         bad_accessions.append({'accession': accession,
#                                'results': search_results})
#     counter += 1
#     if counter % 100 == 0:
#         print(".", end="")
#     if counter % 1000 == 0:
#         print("\n")

In [ ]:
# Make sure search results were returned for each accession.
#assert len(accessions) == counter

In [ ]:
pd.DataFrame(result_length).result_length.value_counts()

In [ ]:
len(bad_accessions)

In [ ]:
bad_accessions[0]

In [ ]:
duplicate_accession_data = []
for bad in bad_accessions:
    for item in bad['results']:
        duplicate_accession_data.append({'accession': item['accession'],
                                         'file_format': item['file_format'],
                                         'status': item['status'],
                                         'dataset': item['dataset']})

In [ ]:
duplicate_accessions = pd.DataFrame(duplicate_accession_data)

In [ ]:
duplicate_accessions.dataset.value_counts()

Associate duplicate accessions with their Experiment's lab.


In [ ]:
experiment_list = duplicate_accessions.dataset.unique()

In [ ]:
search_ids = "&@id=".join(experiment_list)
url = 'https://www.encodeproject.org/search/'\
      '?type=Item&limit=all&frame=embedded&@id={}'.format(search_ids)
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
search_id_map = {}
for experiment in search_results:
    search_id_map[experiment['@id']] = experiment['lab']['name']
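
If experiment_list is long, the single joined URL can get unwieldy; a hedged sketch that queries the @ids in chunks (the chunk size is an assumption).


In [ ]:
# Hypothetical safeguard: query @ids in chunks to keep URLs short.
chunk_size = 100  # assumption; tune as needed
search_id_map = {}
for i in range(0, len(experiment_list), chunk_size):
    chunk = experiment_list[i:i + chunk_size]
    url = 'https://www.encodeproject.org/search/'\
          '?type=Item&limit=all&frame=embedded&@id={}'.format('&@id='.join(chunk))
    r = requests.get(url, auth=(key.authid, key.authpw))
    for experiment in r.json()['@graph']:
        search_id_map[experiment['@id']] = experiment['lab']['name']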

In [ ]:
duplicate_accessions['lab'] = duplicate_accessions.dataset.apply(lambda x: search_id_map[x])

In [ ]:
print(*sorted(duplicate_accessions.lab.unique()), sep='\n')

In [ ]:
list(duplicate_accessions.accession.unique())

In [ ]:
# Select no columns ([[]]) to display just the grouped index.
duplicate_accessions[duplicate_accessions.status == "replaced"].groupby(['lab',
                                                                         'accession',
                                                                         'status',
                                                                         'file_format']).count().sort_index(axis=0)[[]]

In [ ]:
duplicate_accessions.groupby(['lab',
                              'status',
                              'dataset',
                              'accession',
                              'file_format']).count().sort_index(axis=1, level=0)

In [ ]:
duplicate_accessions.groupby(['accession',
                              'status',
                              'file_format',
                              'lab',
                              'dataset']).count().sort_index(axis=1, level=0).unstack()

In [ ]:
duplicate_accessions

Data for all replaced Items


In [ ]:
# Grab data of all replaced Items.
replaced_data = []
url = 'https://www.encodeproject.org/search/'\
      '?type=File&type=Dataset&type=Donor&type=Library'\
      '&type=Pipeline&type=Biosample&type=AntibodyLot&status=replaced'\
      '&frame=embedded&limit=all&format=json'
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
na = 'not_available'
for result in search_results:
    sub_by = result.get('submitted_by', {})
    if isinstance(sub_by, str):
        submitted_by = sub_by
    else:
        submitted_by = sub_by.get('title', na)
    lab = result.get('lab', {})
    if isinstance(lab, str):
        lab_name = lab
    else:
        lab_name = lab.get('name', na)
    item_data = {'accession': result['accession'],
                 'submitted_by': submitted_by,
                 'derived_from': result.get('derived_from', na),
                 'superseded_by': result.get('superseded_by', na),
                 'supersedes': result.get('supersedes', na),
                 '@id': result['@id'],
                 'alternate_accessions': result.get('alternate_accessions', na),
                 'dataset': result.get('dataset', na),
                 'lab_name': lab_name,
                 'date_created': result.get('date_created', na),
                 '@type': result['@type'][0],
                 'output_type': result.get('output_type', na),
                 'file_format': result.get('file_format', na),
                 'assembly': result.get('assembly', na),
                 'paired_with': result.get('paired_with', na),
                 'paired_end': result.get('paired_end', na),
                 'file_format_type': result.get('file_format_type', na),
                 'technical_replicates': result.get('technical_replicates', na),
                 'replicate_uuid': result.get('replicate', {}).get('uuid', na),
                 'md5sum': result.get('md5sum', na),
                 'content_md5sum': result.get('content_md5sum', na),
                 'status': result['status'],
                 'product_id': result.get('product_id', na),
                 'culture_start_date': result.get('culture_start_date', na),
                 'biosample_type': result.get('biosample_type', na),
                 'description': result.get('description', na),
                 'treatments': result.get('treatments', na)
                }
    replaced_data.append(item_data)
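
The field extraction above recurs almost verbatim in several cells below; a hedged refactoring sketch that consolidates it into one helper (hypothetical; the original cells do not use it).


In [ ]:
# Hypothetical helper consolidating the repeated field extraction.
def extract_item_data(result, na='not_available'):
    sub_by = result.get('submitted_by', {})
    submitted_by = sub_by if isinstance(sub_by, str) else sub_by.get('title', na)
    lab = result.get('lab', {})
    lab_name = lab if isinstance(lab, str) else lab.get('name', na)
    fields = ['derived_from', 'superseded_by', 'supersedes', 'alternate_accessions',
              'dataset', 'date_created', 'output_type', 'file_format', 'assembly',
              'paired_with', 'paired_end', 'file_format_type', 'technical_replicates',
              'md5sum', 'content_md5sum', 'product_id', 'culture_start_date',
              'biosample_type', 'description', 'treatments']
    item_data = {field: result.get(field, na) for field in fields}
    item_data.update({'accession': result['accession'],
                      'submitted_by': submitted_by,
                      '@id': result['@id'],
                      'lab_name': lab_name,
                      '@type': result['@type'][0],
                      'replicate_uuid': result.get('replicate', {}).get('uuid', na),
                      'status': result['status']})
    return item_data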

In [ ]:
replaced_data[900]

In [ ]:
len(replaced_data)

In [ ]:
def parse_lab_name(lab):
    if isinstance(lab, str):
        parse_lab = lab.replace("/", "").replace("labs", "")
        return parse_lab
    else:
        return lab[0]

In [ ]:
rd = pd.DataFrame(replaced_data)
rd.lab_name = rd.lab_name.apply(lambda x: parse_lab_name(x))
rd.loc[rd.assembly.apply(lambda x: len(x) == 0), 'assembly'] = 'empty_list'
rd.loc[rd.superseded_by.apply(lambda x: len(x) == 0), 'superseded_by'] = 'empty_list'
rd.loc[rd.supersedes.apply(lambda x: len(x) == 0), 'supersedes'] = 'empty_list'
rd.loc[rd.derived_from.apply(lambda x: len(x) == 0), 'derived_from'] = 'empty_list'
rd.loc[rd.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
rd.loc[rd.alternate_accessions.apply(lambda x: len(x) == 0), 'alternate_accessions'] = 'empty_list'
rd.loc[rd.treatments.apply(lambda x: len(x) == 0), 'treatments'] = 'empty_list'
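
The empty-list substitution above also repeats throughout the notebook; a helper like this sketch could replace it (hypothetical; the original cells do not use it).


In [ ]:
# Hypothetical helper for the repeated empty-list substitution.
def fill_empty_lists(df, fields, placeholder='empty_list'):
    for field in fields:
        df.loc[df[field].apply(lambda x: len(x) == 0), field] = placeholder
    return df

# Example, equivalent to the cell above:
# rd = fill_empty_lists(rd, ['assembly', 'superseded_by', 'supersedes',
#                            'derived_from', 'technical_replicates',
#                            'alternate_accessions', 'treatments'])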

Check to see if replacement is similar to replaced (optional)


In [ ]:
def drop_unique_fields(data):
    # Drop fields guaranteed to differ between a replaced Item and its
    # replacement before comparing the remaining fields.
    drop_fields = ['@id',
                   'accession',
                   'md5sum',
                   'content_md5sum',
                   'date_created']
    return {k: v for k, v in data.items() if k not in drop_fields}

In [ ]:
replacement_data = []
broken_pair = defaultdict(list)
for accession in rd.accession.unique():
    replaced_values = rd[rd.accession == accession].to_dict(orient='records')[0]
    url = 'https://www.encodeproject.org/{}/?format=json'.format(accession)
    r = requests.get(url, auth=(key.authid, key.authpw))
    if (r.status_code == 200):
        result = r.json()
        sub_by = result.get('submitted_by', {})
        if isinstance(sub_by, str):
            submitted_by = sub_by
        else:
            submitted_by = sub_by.get('title', na)
        lab = result.get('lab', {})
        if isinstance(lab, str):
            lab_name = lab
        else:
            lab_name = lab.get('name', na)
        item_data = {'accession': result['accession'],
                     'submitted_by': submitted_by,
                     '@id': result['@id'],
                     'alternate_accessions': result.get('alternate_accessions', na),
                     'dataset': result.get('dataset', na),
                     'lab_name': lab_name,
                     'date_created': result.get('date_created', na),
                     '@type': result['@type'][0],
                     'output_type': result.get('output_type', na),
                     'file_format': result.get('file_format', na),
                     'assembly': result.get('assembly', na),
                     'paired_with': result.get('paired_with', na),
                     'paired_end': result.get('paired_end', na),
                     'file_format_type': result.get('file_format_type', na),
                     'technical_replicates': result.get('technical_replicates', na),
                     'replicate_uuid': result.get('replicate', {}).get('uuid', na),
                     'md5sum': result.get('md5sum', na),
                     'content_md5sum': result.get('content_md5sum', na),
                     'status': result['status'],
                     'product_id': result.get('product_id', na),
                     'culture_start_date': result.get('culture_start_date', na),
                     'biosample_type': result.get('biosample_type', na),
                     'description': result.get('description', na),
                     'treatments': result.get('treatments', na)
                    }
        item_temp = pd.DataFrame([item_data])
        item_temp.lab_name = item_temp.lab_name.apply(lambda x: parse_lab_name(x))
        item_temp.loc[item_temp.assembly.apply(lambda x: len(x) == 0), 'assembly'] = 'empty_list'
        item_temp.loc[item_temp.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
        item_temp.loc[item_temp.alternate_accessions.apply(lambda x: len(x) == 0), 'alternate_accessions'] = 'empty_list'
        item_temp.loc[item_temp.treatments.apply(lambda x: len(x) == 0), 'treatments'] = 'empty_list'
        item_temp = item_temp.to_dict(orient='records')[0]
        replaced_dict = drop_unique_fields(replaced_values)
        replacement_dict = drop_unique_fields(item_temp)
        # Compare only fields present in both records; the replaced record
        # carries fields (e.g. derived_from) the replacement record lacks.
        shared_keys = set(replaced_dict) & set(replacement_dict)
        if any(replaced_dict[k] != replacement_dict[k] for k in shared_keys):
            broken_pair[accession].append(item_data)
        replacement_data.append(item_data)

In [ ]:
len(replacement_data)

Data for portal redirect of replaced accessions


In [ ]:
# loop.close()
# loop = asyncio.new_event_loop()
# asyncio.set_event_loop(loop)

In [ ]:
# Asyncio request.
replaced_by_file = []
na = 'not_available'

async def get_request(session, accession):
    url = 'https://www.encodeproject.org/{}'.format(accession)
    async with session.get(url, auth=request_auth, timeout=None) as response:
        if response.status == 404:
            item_data = {'searched_accession': accession,
                         'redirected_to_accession': 'no_result'}
            replaced_by_file.append(item_data)
        else:
            result = await response.json()
            sub_by = result.get('submitted_by', {})
            if isinstance(sub_by, str):
                submitted_by = sub_by
            else:
                submitted_by = sub_by.get('title', na)
            lab = result.get('lab', {})
            if isinstance(lab, str):
                lab_name = lab
            else:
                lab_name = lab.get('name', na)
            item_data = {'accession': result['accession'],
                         'submitted_by': submitted_by,
                         'derived_from': result.get('derived_from', na),
                         'superseded_by': result.get('superseded_by', na),
                         'supersedes': result.get('supersedes', na),
                         '@id': result['@id'],
                         'alternate_accessions': result.get('alternate_accessions', na),
                         'dataset': result.get('dataset', na),
                         'lab_name': lab_name,
                         'date_created': result.get('date_created', na),
                         '@type': result['@type'][0],
                         'output_type': result.get('output_type', na),
                         'file_format': result.get('file_format', na),
                         'assembly': result.get('assembly', na),
                         'paired_with': result.get('paired_with', na),
                         'paired_end': result.get('paired_end', na),
                         'file_format_type': result.get('file_format_type', na),
                         'technical_replicates': result.get('technical_replicates', na),
                         'replicate_uuid': result.get('replicate', {}).get('uuid', na),
                         'md5sum': result.get('md5sum', na),
                         'content_md5sum': result.get('content_md5sum', na),
                         'status': result['status'],
                         'product_id': result.get('product_id', na),
                         'culture_start_date': result.get('culture_start_date', na),
                         'biosample_type': result.get('biosample_type', na),
                         'description': result.get('description', na),
                         'treatments': result.get('treatments', na)}
            replaced_by_file.append(item_data)
        if len(replaced_by_file) % 100 == 0:
            print(len(replaced_by_file))

async def create_session(accessions, loop):
    connector = aiohttp.TCPConnector(keepalive_timeout=10, limit=100)
    async with aiohttp.ClientSession(connector=connector, loop=loop) as session:
        results = await asyncio.gather(*[get_request(session, accession) for accession in accessions])

In [ ]:
loop = asyncio.get_event_loop()
loop.run_until_complete(create_session(accessions, loop))

In [ ]:
len(replaced_by_file)

In [ ]:
len(accessions)

In [ ]:
# # Asyncio request.

# request_auth = aiohttp.BasicAuth(key.authid, key.authpw)

# replaced_by_file = []
# na = 'not_available'

# async def get_request(url, sem):
#     async with sem:
#         async with aiohttp.ClientSession() as session:
#             async with session.get(url, auth=request_auth) as resp:
#                 return await resp.json()

# async def get_data(accession, sem):
#     url = 'https://www.encodeproject.org/{}'.format(accession)
#     result = await get_request(url, sem)
#     if result.get('code', False) == 404:
#         item_data = {'searched_accession': accession,
#                      'redirected_to_accession': 'no_result'}
#         replaced_by_file.append(item_data)
#     else:
#         sub_by = result.get('submitted_by', {})
#         if isinstance(sub_by, str):
#             submitted_by = sub_by
#         else:
#             submitted_by = sub_by.get('title', na)
#         lab = result.get('lab', {})
#         if isinstance(lab, str):
#             lab_name = lab
#         else:
#             lab_name = lab.get('name', na)
#         item_data = {'accession': result['accession'],
#                      'submitted_by': submitted_by,
#                      'derived_from': result.get('derived_from', na),
#                      'superseded_by': result.get('superseded_by', na),
#                      'supersedes': result.get('supersedes', na),
#                      '@id': result['@id'],
#                      'alternate_accessions': result.get('alternate_accessions', na),
#                      'dataset': result.get('dataset', na),
#                      'lab_name': lab_name,
#                      'date_created': result.get('date_created', na),
#                      '@type': result['@type'][0],
#                      'output_type': result.get('output_type', na),
#                      'file_format': result.get('file_format', na),
#                      'assembly': result.get('assembly', na),
#                      'paired_with': result.get('paired_with', na),
#                      'paired_end': result.get('paired_end', na),
#                      'file_format_type': result.get('file_format_type', na),
#                      'technical_replicates': result.get('technical_replicates', na),
#                      'replicate_uuid': result.get('replicate', {}).get('uuid', na),
#                      'md5sum': result.get('md5sum', na),
#                      'content_md5sum': result.get('content_md5sum', na),
#                      'status': result['status'],
#                      'product_id': result.get('product_id', na),
#                      'culture_start_date': result.get('culture_start_date', na),
#                      'biosample_type': result.get('biosample_type', na),
#                      'description': result.get('description', na),
#                      'treatments': result.get('treatments', na)
#                     }
#         replaced_by_file.append(item_data)
        
# sem = asyncio.Semaphore(100)
# loop = asyncio.get_event_loop()
# loop.run_until_complete(asyncio.gather(*[get_data(accession, sem) for accession in accessions]));

In [ ]:
# loop = asyncio.get_event_loop()
# loop.run_until_complete(create_session(accessions, loop))

In [ ]:
# # For every replaced accession:
# # Check if https://www.encodeproject.org/{accession} returns anything.
# # If so, does it match replaced file type?
# replaced_by_file = []
# na = 'not_available'
# for accession in accessions:
#     url = 'https://www.encodeproject.org/{}'.format(accession)
#     r = requests.get(url, auth=(key.authid, key.authpw))
#     if r.status_code == 404:
#         item_data = {'searched_accession': accession,
#                      'redirected_to_accession': 'no_result'}
#         replaced_by_file.append(item_data)
#     else:
#         result = r.json()
#         sub_by = result.get('submitted_by', {})
#         if isinstance(sub_by, str):
#             submitted_by = sub_by
#         else:
#             submitted_by = sub_by.get('title', na)
#         lab = result.get('lab', {})
#         if isinstance(lab, str):
#             lab_name = lab
#         else:
#             lab_name = lab.get('name', na)
#         item_data = {'accession': result['accession'],
#                      'submitted_by': submitted_by,
#                      'derived_from': result.get('derived_from', na),
#                      'superseded_by': result.get('superseded_by', na),
#                      'supersedes': result.get('supersedes', na),
#                      '@id': result['@id'],
#                      'alternate_accessions': result.get('alternate_accessions', na),
#                      'dataset': result.get('dataset', na),
#                      'lab_name': lab_name,
#                      'date_created': result.get('date_created', na),
#                      '@type': result['@type'][0],
#                      'output_type': result.get('output_type', na),
#                      'file_format': result.get('file_format', na),
#                      'assembly': result.get('assembly', na),
#                      'paired_with': result.get('paired_with', na),
#                      'paired_end': result.get('paired_end', na),
#                      'file_format_type': result.get('file_format_type', na),
#                      'technical_replicates': result.get('technical_replicates', na),
#                      'replicate_uuid': result.get('replicate', {}).get('uuid', na),
#                      'md5sum': result.get('md5sum', na),
#                      'content_md5sum': result.get('content_md5sum', na),
#                      'status': result['status'],
#                      'product_id': result.get('product_id', na),
#                      'culture_start_date': result.get('culture_start_date', na),
#                      'biosample_type': result.get('biosample_type', na),
#                      'description': result.get('description', na),
#                      'treatments': result.get('treatments', na)
#                     }
#         replaced_by_file.append(item_data)

In [ ]:
len(accessions)

In [ ]:
len(replaced_by_file)

In [ ]:
rbf = pd.DataFrame(replaced_by_file)
rbf = rbf.fillna('is_null')

In [ ]:
rbf.lab_name = rbf.lab_name.apply(lambda x: parse_lab_name(x))
rbf.loc[rbf.assembly.apply(lambda x: len(x) == 0), 'assembly'] = 'empty_list'
rbf.loc[rbf.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
rbf.loc[rbf.superseded_by.apply(lambda x: len(x) == 0), 'superseded_by'] = 'empty_list'
rbf.loc[rbf.supersedes.apply(lambda x: len(x) == 0), 'supersedes'] = 'empty_list'
rbf.loc[rbf.derived_from.apply(lambda x: len(x) == 0), 'derived_from'] = 'empty_list'

In [ ]:
df = pd.read_excel('replaced_items_no_redirect_06_12_2017.xlsx')

In [ ]:
df.shape

In [ ]:
df['@type'].value_counts()

In [ ]:
# Copy to avoid SettingWithCopyWarning when adding columns below.
dff = df[df['@type'] == 'File'].copy()
dff.dataset

In [ ]:
def get_assay_type(experiment):
    url = 'https://www.encodeproject.org{}?format=json'.format(experiment)
    r = requests.get(url, auth=(key.authid, key.authpw))
    result = r.json()
    return result.get('assay_term_name', 'na')


def get_lab_name(experiment):
    # experiment is an @id like '/experiments/ENCSR000XXX/', which already
    # includes slashes, so none are added here.
    url = 'https://www.encodeproject.org{}?format=json'.format(experiment)
    r = requests.get(url, auth=(key.authid, key.authpw))
    result = r.json()
    return result.get('lab', {}).get('name', 'na')
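
Since dataset values repeat across files, each unique dataset only needs one request; a hedged caching sketch using functools.lru_cache.


In [ ]:
# Hypothetical: cache lookups so repeated datasets trigger one request each.
from functools import lru_cache

get_assay_type_cached = lru_cache(maxsize=None)(get_assay_type)
get_lab_name_cached = lru_cache(maxsize=None)(get_lab_name)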

In [ ]:
dff.dataset

In [ ]:
dff['assay_type'] = dff.dataset.apply(lambda x: get_assay_type(x))

In [ ]:
dff.assay_type.value_counts()

In [ ]:
dff['experiment_lab'] = dff.dataset.apply(lambda x: get_lab_name(x))

In [ ]:
#rbf.to_csv("replaced_by_search.tsv", sep="\t")

Merge redirect data with replaced Item data


In [ ]:
no_redirect_accessions = rd[rd.accession.isin(rbf[rbf.redirected_to_accession == "no_result"].searched_accession.values)]
no_redirect_accessions = no_redirect_accessions.sort_values('@type').reset_index(drop=True)
no_redirect_accessions.loc[no_redirect_accessions.description.apply(lambda x: len(x) == 0), 'description'] = 'empty_string'

In [ ]:
no_redirect_accessions['status'].value_counts()

In [ ]:
no_redirect_accessions.content_md5sum.value_counts()

In [ ]:
no_redirect_accessions.description.value_counts()

In [ ]:
no_redirect_accessions.lab_name.value_counts()

In [ ]:
no_redirect_accessions['@type'].value_counts()

In [ ]:
no_redirect_accessions[no_redirect_accessions.md5sum != "not_available"].accession.unique()

In [ ]:
len(no_redirect_accessions[no_redirect_accessions.md5sum != "not_available"].accession.unique())

In [ ]:
len(no_redirect_accessions[no_redirect_accessions.md5sum == 'not_available'].accession.unique())

In [ ]:
#.to_excel('replaced_items_no_redirect_06_12_2017.xlsx')

Search for possible replacement files with the same MD5sum


In [ ]:
# possible_replacements = defaultdict(list)
# for md5 in no_redirect_accessions.md5sum.unique()[1:]:
#     url = 'https://www.encodeproject.org/search/'\
#           '?type=Item&md5sum={}&status%21=replaced'\
#           '&frame=embedded&limit=all&format=json'.format(md5)
#     r = requests.get(url, auth=(key.authid, key.authpw))
#     if (r.status_code == 404) or (len(r.json()['@graph']) == 0):
#         item_data = {'md5sum': md5,
#                      'accession': 'no_result'}
#         possible_replacements[md5].append(item_data)
#     else:
#         results = r.json()['@graph']
#         for result in results:
#             lab = result.get('lab', {})
#             if isinstance(lab, str):
#                 lab_name = lab
#             else:
#                 lab_name = lab.get('name', na)
#             possible_replacements[md5].append({'accession': result['accession'],
#                                                '@id': result['@id'],
#                                                'alternate_accessions': result.get('alternate_accessions', na),
#                                                'dataset': result.get('dataset', na),
#                                                'lab_name': lab_name,
#                                                'date_created': result.get('date_created', na),
#                                                '@type': result['@type'][0],
#                                                'output_type': result.get('output_type', na),
#                                                'file_format': result.get('file_format', na),
#                                                'assembly': result.get('assembly', na),
#                                                'paired_with': result.get('paired_with', na),
#                                                'paired_end': result.get('paired_end', na),
#                                                'file_format_type': result.get('file_format_type', na),
#                                                'technical_replicates': result.get('technical_replicates', na),
#                                                'replicate_uuid': result.get('replicate', {}).get('uuid', na),
#                                                'md5sum': result.get('md5sum', na),
#                                                'content_md5sum': result.get('content_md5sum', na),
#                                                'status': result['status']
#                                               })

In [ ]:
loop.close()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

In [ ]:
possible_replacements = defaultdict(list)

async def get_request(session, md5):
    url = 'https://www.encodeproject.org/search/'\
          '?type=Item&md5sum={}&status%21=replaced'\
          '&frame=embedded&limit=all&format=json'.format(md5)
    async with session.get(url, auth=request_auth) as response:
        r = await response.json()
        results = r['@graph']
        if len(results) == 0:
            item_data = {'md5sum': md5,
                         'accession': 'no_result'}
            possible_replacements[md5].append(item_data)
        else:
            for result in results:
                lab = result.get('lab', {})
                if isinstance(lab, str):
                    lab_name = lab
                else:
                    lab_name = lab.get('name', na)
                possible_replacements[md5].append({'accession': result['accession'],
                                                   '@id': result['@id'],
                                                   'alternate_accessions': result.get('alternate_accessions', na),
                                                   'dataset': result.get('dataset', na),
                                                   'lab_name': lab_name,
                                                   'date_created': result.get('date_created', na),
                                                   '@type': result['@type'][0],
                                                   'output_type': result.get('output_type', na),
                                                   'file_format': result.get('file_format', na),
                                                   'assembly': result.get('assembly', na),
                                                   'paired_with': result.get('paired_with', na),
                                                   'paired_end': result.get('paired_end', na),
                                                   'file_format_type': result.get('file_format_type', na),
                                                   'technical_replicates': result.get('technical_replicates', na),
                                                   'replicate_uuid': result.get('replicate', {}).get('uuid', na),
                                                   'md5sum': result.get('md5sum', na),
                                                   'content_md5sum': result.get('content_md5sum', na),
                                                   'status': result['status']
                                                  })

async def create_session(md5s, loop):
    conn = aiohttp.TCPConnector(keepalive_timeout=10, limit=100)
    async with aiohttp.ClientSession(connector=conn, loop=loop) as session:
        results = await asyncio.gather(*[get_request(session, md5) for md5 in md5s])

In [ ]:
loop = asyncio.get_event_loop()
loop.run_until_complete(create_session(no_redirect_accessions.md5sum.unique()[1:], loop))

In [ ]:
len(possible_replacements)

In [ ]:
possible_replacements

In [ ]:
possible_merge = [item for key, value in possible_replacements.items()
                       for item in value if item['accession'] != 'no_result']

In [ ]:
possible_merge = pd.DataFrame(possible_merge)

In [ ]:
possible_merge = possible_merge.rename(columns={'accession': 'possible_redirect_accession',
                                                'status': 'possible_redirect_status'})

In [ ]:
possible_merge.loc[possible_merge.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'

In [ ]:
possible_merge.shape

In [ ]:
no_matches = no_redirect_accessions[~(no_redirect_accessions.md5sum.isin(possible_merge.md5sum.values))].reset_index(drop=True)

In [ ]:
pm = possible_merge.merge(no_redirect_accessions,
                          how='left',
                          suffixes=('_new', '_old'),
                          on=['md5sum',
                              '@type',
                              'file_format',
                              'file_format_type'])[['md5sum',
                                                    'accession',
                                                    'status',
                                                    'possible_redirect_accession',
                                                    'possible_redirect_status',
                                                    '@type',
                                                    'file_format',
                                                    'file_format_type',
                                                    'assembly_old',
                                                    'assembly_new',
                                                    'dataset_old',
                                                    'dataset_new',
                                                    'date_created_old',
                                                    'date_created_new',
                                                    'lab_name_old',
                                                    'lab_name_new',
                                                    'technical_replicates_old',
                                                    'technical_replicates_new',
                                                    '@id_old',
                                                    '@id_new',
                                                    'output_type_old',
                                                    'output_type_new',
                                                    'paired_end_old',
                                                    'paired_end_new',
                                                    'paired_with_old',
                                                    'paired_with_new',
                                                    'replicate_uuid_old',
                                                    'replicate_uuid_new',
                                                    'alternate_accessions_old',
                                                    'alternate_accessions_new',
                                                    'content_md5sum_old',
                                                    'content_md5sum_new']]
pm#.to_excel('possible_redirect_accessions_for_replaced_files_06_12_2017.xlsx')

In [ ]:
no_redirect_accessions[no_redirect_accessions.accession == 'ENCFF133IYK']

In [ ]:
pm.shape

In [ ]:
len(pm.accession.unique())

In [ ]:
replacements_exact_match = pm[(pm.dataset_old == pm.dataset_new)].reset_index(drop=True)

In [ ]:
replacements_exact_match.shape

In [ ]:
replacements_exact_match[[col for col in replacements_exact_match]]

In [ ]:
replacements_different = pm[~(pm.dataset_old == pm.dataset_new)].reset_index(drop=True)

In [ ]:
replacements_different.shape

In [ ]:
replacements_different

In [ ]:
# Different datasets but same MD5. Have to update the replaced file to point at the replacement's dataset.
replacements_update_dataset = replacements_different[['@id_old', 'dataset_new']].rename(columns={'@id_old': '@id', 'dataset_new': 'dataset'})
#replacements_update_dataset.to_csv('../../update_dataset_of_replaced_filed_matching_md5_06_27_2017.tsv', index=False, sep='\t')

In [ ]:
# Now handle the exact matches.

In [ ]:
replacements_patch = replacements_exact_match[['possible_redirect_accession',
                                               'accession']].rename(columns={'accession': 'alternate_accessions:array',
                                                                             'possible_redirect_accession': 'accession'})

In [ ]:
replacements_patch = replacements_patch.sort_values("alternate_accessions:array")

In [ ]:
replacements_patch.shape

In [ ]:
flat_list_patch = []
for accession in replacements_patch.accession.unique():
    alternates = replacements_patch[replacements_patch.accession == accession]['alternate_accessions:array'].values
    data = {'accession': accession,
            'alternate_accessions:array': ", ".join(alternates)}
    flat_list_patch.append(data)
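
An equivalent pandas groupby formulation (a hypothetical alternative to the loop above; same content, though row order may differ).


In [ ]:
# Equivalent groupby formulation (hypothetical alternative to the loop above).
replacements_patch_flat = (replacements_patch
                           .groupby('accession')['alternate_accessions:array']
                           .apply(', '.join)
                           .reset_index())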

In [ ]:
replacements_patch_flat_list = pd.DataFrame(flat_list_patch)

In [ ]:
#replacements_patch_flat_list.to_csv('../../replaced_with_matching_replacements_patch_06_27_2017.tsv', sep="\t", index=False)

In [ ]:
#replacements_different.sort_values('possible_redirect_accession').to_excel('replaced_same_md5_mismatched_dataset_06_14_2017.xlsx', index=False)

Extract the MD5sums with no matching replacements


In [ ]:
no_matching_md5_replacements = [item['md5sum'] for key, value in possible_replacements.items()
                                               for item in value if item['accession'] == 'no_result']

In [ ]:
md5_no_match = pd.DataFrame(list(set(no_matching_md5_replacements))).rename(columns={0: 'md5sum'})
len(md5_no_match.merge(no_redirect_accessions, how='left', on='md5sum')['accession'].unique())

Search for similar types of Files for possible replacement


In [ ]:
no_redirect_file = no_redirect_accessions[no_redirect_accessions['@type'] == 'File'].reset_index(drop=True)

In [ ]:
no_redirect_file

In [ ]:
na = 'not_available'
possible_replacements = defaultdict(list)
async def get_request_two(session, url, r):
    async with session.get(url, auth=request_auth) as response_two:
        result_one = await response_two.json()
        search_results = result_one['@graph']
        if len(search_results) == 0:
            possible_replacements[r['accession']].append({'accession': r['accession'],
                                                          'possible_replacement_accession': 'no_result'})
        for result in search_results:
            lab = result.get('lab', {})
            sub_by = result.get('submitted_by', {})
            if isinstance(sub_by, str):
                submitted_by = sub_by
            else:
                submitted_by = sub_by.get('title', na)
            if isinstance(lab, str):
                lab_name = lab
            else:
                lab_name = lab.get('name', na)
            possible_replacements[r['accession']].append({'accession': r['accession'],
                                                          'possible_replacement_accession': result['accession'],
                                                          '@id': result['@id'],
                                                          'alternate_accessions': result.get('alternate_accessions', na),
                                                          'dataset': result.get('dataset', na),
                                                          'lab_name': lab_name,
                                                          'date_created': result.get('date_created', na),
                                                          '@type': result['@type'][0],
                                                          'output_type': result.get('output_type', na),
                                                          'file_format': result.get('file_format', na),
                                                          'assembly': result.get('assembly', na),
                                                          'paired_with': result.get('paired_with', na),
                                                          'paired_end': result.get('paired_end', na),
                                                          'file_format_type': result.get('file_format_type', na),
                                                          'technical_replicates': result.get('technical_replicates', na),
                                                          'replicate_uuid': result.get('replicate', {}).get('uuid', na),
                                                          'md5sum': result.get('md5sum', na),
                                                          'content_md5sum': result.get('content_md5sum', na),
                                                          'status': result['status'],
                                                          'submitted_by': submitted_by,
                                                          'derived_from': result.get('derived_from', na),
                                                          'superseded_by': result.get('superseded_by', na),
                                                          'supersedes': result.get('supersedes', na)})

async def get_request_one(session, file_id):
    url = 'https://www.encodeproject.org/{}/?format=json'.format(file_id)
    async with session.get(url, auth=request_auth) as response_one:
        result_one = await response_one.json()
        r = result_one
        file_format = r['file_format']
        output_type = r['output_type']
        dataset = r['dataset']
        assembly = r.get('assembly', '*')
        try:
            assay_term_name = r['quality_metrics'][0]['assay_term_name']
            url = 'https://www.encodeproject.org/search/?type=File&file_format={}'\
                  '&output_type={}&quality_metrics.assay_term_name={}'\
                  '&dataset={}&assembly={}&format=json&frame=embedded'\
                  '&status!=replaced'.format(file_format,
                                             output_type,
                                             assay_term_name,
                                             dataset,
                                             assembly)
        # Some Files lack quality_metrics (or assay_term_name); fall back.
        except (IndexError, KeyError):
            url = 'https://www.encodeproject.org/search/?type=File&file_format={}'\
                  '&output_type={}&dataset={}&assembly={}&format=json&frame=embedded'\
                  '&status!=replaced'.format(file_format,
                                             output_type,
                                             dataset,
                                             assembly)
        if assembly == '*':
            url = url.replace('&assembly=*', '&assembly!=*')
        await get_request_two(session, url, r)

async def create_session(file_ids, loop):
    conn = aiohttp.TCPConnector(keepalive_timeout=10, limit=100)
    async with aiohttp.ClientSession(connector=conn, loop=loop) as session:  
        results = await asyncio.gather(*[get_request_one(session, file_id) for file_id in file_ids])

In [ ]:
loop = asyncio.get_event_loop()
loop.run_until_complete(create_session(no_redirect_file['@id'].unique(), loop))

In [ ]:
len(possible_replacements)

In [ ]:
possible_replacements

Fill in empty_list placeholders for empty list fields


In [ ]:
replacement_search = pd.DataFrame([item for key, value in possible_replacements.items() for item in value])
replacement_search = replacement_search.fillna('isnull')
replacement_search.loc[replacement_search.alternate_accessions.apply(lambda x: len(x) == 0), 'alternate_accessions'] = 'empty_list'
replacement_search.loc[replacement_search.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
replacement_search.loc[replacement_search.superseded_by.apply(lambda x: len(x) == 0), 'superseded_by'] = 'empty_list'
replacement_search.loc[replacement_search.supersedes.apply(lambda x: len(x) == 0), 'supersedes'] = 'empty_list'
replacement_search.loc[replacement_search.derived_from.apply(lambda x: len(x) == 0), 'derived_from'] = 'empty_list'

no_redirect_file.loc[no_redirect_file.alternate_accessions.apply(lambda x: len(x) == 0), 'alternate_accessions'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.superseded_by.apply(lambda x: len(x) == 0), 'superseded_by'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.supersedes.apply(lambda x: len(x) == 0), 'supersedes'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.derived_from.apply(lambda x: len(x) == 0), 'derived_from'] = 'empty_list'

In [ ]:
rsm = replacement_search.merge(no_redirect_file,
                               how='left',
                               suffixes=('_new', '_old'),
                               on=['accession'])

In [ ]:
rsm.shape

Substitute replaced file_ids with replacement file_ids in derived_from fields


In [ ]:
rsm = rsm[~(rsm.status_new.isin(['revoked', 'deleted']))]

In [ ]:
# Extract a lookup table from the rows with exactly one result.
# If a derived_from File doesn't redirect, look up its possible replacement
# and use that as the fill-in value for the comparison.
dfl = (rsm[(rsm.possible_replacement_accession != 'no_result')
           & (rsm.technical_replicates_old == rsm.technical_replicates_new)]
       .drop_duplicates('accession', keep=False)
       .reset_index(drop=True))
dfl.shape

In [ ]:
# Built from a previous iteration: matching_rep is defined in a later cell,
# so run that cell first, then re-run this one.
derived_from_lookup = pd.concat([dfl, matching_rep.drop_duplicates('accession', keep=False)], axis=0).drop_duplicates('accession').reset_index(drop=True)

In [ ]:
len(derived_from_lookup.accession.unique())

In [ ]:
def get_json(item_id):
    url = 'https://www.encodeproject.org/{}/?format=json'.format(item_id)
    return requests.get(url, auth=(key.authid, key.authpw))

def parse_derived_from(x):
    # x is a placeholder string or a list of @ids like '/files/ENCFF000ABC/'.
    if len(x) == 0 or x == 'not_available':
        return x
    new_list = []
    for y in x:
        # '/files/<identifier>/'.split('/') -> ['', 'files', '<identifier>', '']
        y_id = y.split('/')[2]
        # Accessions pass through; UUIDs get resolved to accessions below.
        if y_id.startswith('ENC'):
            new_list.append(y)
            continue
        else:
            r = get_json(y)
            try:
                accession = r.json()['accession']
                r = get_json(accession)
                if r.status_code == 404:
                    # Pull from local lookup table.
                    try:
                        accession_replacement = derived_from_lookup[derived_from_lookup.accession == accession]\
                                                                      .possible_replacement_accession.values[0]
                        new_list.append('/files/{}/'.format(accession_replacement))
                    # If no results returned from one-result table.
                    except IndexError:
                        new_list.append(y)
                else:
                    accession_replacement = r.json()['accession']
                    new_list.append('/files/{}/'.format(accession_replacement))
            except KeyError:
                print(y)
                print(x)
                new_list.append(y)
    return new_list
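
parse_derived_from can make two sequential requests per non-ENC entry; a hedged variant of get_json that reuses one requests.Session for connection pooling (a sketch; the cells below still use get_json).


In [ ]:
# Hypothetical: reuse one HTTP session across the many lookups above.
http = requests.Session()
http.auth = (key.authid, key.authpw)

def get_json_pooled(item_id):
    url = 'https://www.encodeproject.org/{}/?format=json'.format(item_id)
    return http.get(url)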

In [ ]:
rsm_derived_from_old = rsm.derived_from_old.apply(lambda x: parse_derived_from(x))

In [ ]:
rsm.derived_from_old = rsm_derived_from_old

In [ ]:
rsm

In [ ]:
# Note: cols is defined in a later cell; run that cell first.
rsm[~(rsm['@id_old'].isin(['/files/d9e23f37-9b33-41b9-b9df-0700ca87bc75/',
                           '/files/3efeced1-a3c5-4131-a721-7c5f743350a9/',
                           '/files/9fe192e9-af81-46f5-a16f-4d6b5cda577c/'])) & (rsm.supersedes_new != 'not_available')][cols]

Parse lists for comparison


In [ ]:
# parse_list sorts and joins list elements; applied to a placeholder *string*
# it sorts the characters instead (e.g. 'empty_list' -> '_,e,i,l,m,p,s,t,t,y'),
# so lazy_dict maps those scrambled forms back to the original placeholders.
lazy_dict = {'_,e,i,l,m,p,s,t,t,y': 'empty_list',
             'i,l,l,n,s,u': 'isnull',
             '_,a,a,a,b,e,i,l,l,n,o,t,v': 'not_available'}
def parse_list(x):
    return ','.join([y.strip() for y in sorted(x)])
rsm.date_created_old = rsm.date_created_old.apply(lambda x: pd.to_datetime(x))
for field in ['technical_replicates_old',
              'technical_replicates_new',
              'superseded_by_old',
              'superseded_by_new',
              'supersedes_old',
              'supersedes_new',
              'derived_from_old',
              'derived_from_new']:
    rsm[field] = rsm[field].apply(lambda x: parse_list(x)).apply(lambda x: lazy_dict[x] if x in lazy_dict.keys() else x)
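
A quick sanity check of the placeholder round trip (hypothetical; not part of the original run).


In [ ]:
# Placeholder strings scramble through parse_list and map back via lazy_dict.
assert lazy_dict[parse_list('empty_list')] == 'empty_list'
assert lazy_dict[parse_list('isnull')] == 'isnull'
assert lazy_dict[parse_list('not_available')] == 'not_available'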

In [ ]:
rsm[rsm.technical_replicates_old != rsm.technical_replicates_new][['technical_replicates_old',
                                                                   'technical_replicates_new']]

In [ ]:
rsm[rsm.accession == 'ENCFF721IVN'][cols]

In [ ]:
rsm[rsm.derived_from_old != rsm.derived_from_new][['derived_from_old', 'derived_from_new']]

Matching content_md5sum, ready to patch


In [ ]:
rsm_patch = rsm[(rsm.content_md5sum_old == rsm.content_md5sum_new)
    & (rsm.content_md5sum_old != 'not_available')].reset_index(drop=True)

In [ ]:
first_cols = ['accession', 'possible_replacement_accession']
cols = first_cols + [col for col in sorted(rsm_patch.columns, reverse=True) if col not in first_cols]

In [ ]:
rsm_patch[cols]

In [ ]:
#rsm_patch[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession', 'accession': 'alternate_accessions:list'}).to_csv('../../matching_content_md5sum_patch_06_29_2017.tsv', sep='\t', index=False)

Remove files to be patched


In [ ]:
rsm = rsm[~(rsm.accession.isin(rsm_patch.accession.values))].reset_index(drop=True)

Total Files that need replacement


In [ ]:
len(rsm.accession.unique())

Possible replacement with zero results


In [ ]:
rsm_zero_result = rsm[rsm.possible_replacement_accession == 'no_result'].reset_index(drop=True)

In [ ]:
len(rsm_zero_result.accession.unique())

In [ ]:
rsm_zero_result.submitted_by_old.value_counts()

In [ ]:
rsm_zero_result[cols]

In [ ]:
# To be set to deleted because there is no conservative IDR anymore.
#rsm_zero_result.loc[rsm_zero_result.submitted_by_old == 'J. Seth Strattan', 'status_old'] = 'deleted'
#rsm_zero_result[rsm_zero_result.submitted_by_old == 'J. Seth Strattan'][['@id_old', 'status_old']].rename(columns={'status_old': 'status', '@id_old': '@id'}).to_csv('../../zero_match_replaced_to_deleted_patch_06_28_2017.tsv', sep='\t', index=False)

Check the superseded_by/supersedes fields


In [ ]:
rsm_zero_result.superseded_by_old.value_counts()

Possible replacement with one result


In [ ]:
rsm_one_result = rsm[rsm.possible_replacement_accession != 'no_result'].drop_duplicates('accession',
                                                                                        keep=False).reset_index(drop=True)

In [ ]:
len(rsm_one_result)

In [ ]:
rsm_one_result.submitted_by_old.value_counts()

In [ ]:
rsm_one_result = rsm_one_result[cols]

In [ ]:
rsm_one_result[rsm_one_result.submitted_by_old == "Diane Trout"]

Check the superseded_by/supersedes fields


In [ ]:
rsm_one_result.superseded_by_old.value_counts()

In [ ]:
rsm_one_result.supersedes_old.value_counts()

In [ ]:
rsm_one_result.superseded_by_new.value_counts()

In [ ]:
#rsm_one_result.supersedes_new.value_counts()

Files that should be revoked instead of replaced?


In [ ]:
rsm_one_result[(rsm_one_result.superseded_by_old != 'empty_list')][cols]

In [ ]:
rsm_one_result_patch = rsm_one_result[(rsm_one_result.superseded_by_old != 'empty_list')].reset_index(drop=True)

In [ ]:
rsm_one_result_patch[['accession', 'superseded_by_old']]

Remove files with superseded_by values


In [ ]:
rsm_one_result = rsm_one_result[~(rsm_one_result.accession.isin(rsm_one_result_patch.accession.values))].reset_index(drop=True)

In [ ]:
rsm_one_result.shape

In [ ]:
rsm_one_result[rsm_one_result.derived_from_old != rsm_one_result.derived_from_new][cols].submitted_by_old.value_counts() #[['derived_from_old', 'derived_from_new']].values

In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old != rsm_one_result.derived_from_new)
              & (rsm_one_result.submitted_by_old == 'Anna Vlasova')][cols][['accession', 'possible_replacement_accession', 'derived_from_old', 'derived_from_new']]

In [ ]:
rsm[(rsm['@type_old'] == 'File')]['@id_old'].unique()

Replacements with one result where derived_from matches and is not not_available


In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old == rsm_one_result.derived_from_new)
               & (rsm_one_result.derived_from_old != 'not_available')].shape

Replacements with one result where both derived_from fields equal not_available


In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old == rsm_one_result.derived_from_new)
               & (rsm_one_result.derived_from_old == 'not_available')]

In [ ]:
# Patch one of Diane's files that is missing derived_from but is otherwise equal.
# dp = rsm_one_result[(rsm_one_result.derived_from_old == rsm_one_result.derived_from_new)
#                & (rsm_one_result.derived_from_old == 'not_available')]
# dp[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
#                                                                     'accession': 'alternate_accessions:list'}).to_csv('../../one_match_missing_derived_from_patch_06_28_2017.tsv', sep='\t', index=False)

In [ ]:
# Patch 58 narrowPeaks with one match after dropping revoked/deleted from possible replacements
# rsm_one_result[['possible_replacement_accession',
#                 'accession']].rename(columns={'possible_replacement_accession': 'accession',
#                                               'accession': 'alternate_accessions:list'}).to_csv('../../one_match_after_dropping_deleted_revoked_patch_06_30_2017.tsv', sep='\t', index=False)

Replacements with one result where derived_from_old, but not derived_from_new, equals not_available


In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old != rsm_one_result.derived_from_new)
               & (rsm_one_result.derived_from_old == 'not_available')].shape

Replacements with one result where derived_from_new, but not derived_from_old, equals not_available


In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old != rsm_one_result.derived_from_new)
               & (rsm_one_result.derived_from_new == 'not_available')]

Replacements with one result where either derived_from is not_available


In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old == 'not_available')
               | (rsm_one_result.derived_from_new == 'not_available')]

Replacements with one result where derived_from does not match


In [ ]:
rsm_one_result[rsm_one_result.derived_from_old != rsm_one_result.derived_from_new].shape

Replacements with one result where derived_from matches


In [ ]:
rsm_one_result[rsm_one_result.derived_from_old == rsm_one_result.derived_from_new].shape

In [ ]:
rsm_one_result_full_match = rsm_one_result[(rsm_one_result.derived_from_old == rsm_one_result.derived_from_new)
                                           & (rsm_one_result.derived_from_old != 'not_available')][cols].reset_index(drop=True)

In [ ]:
rsm_one_result_full_match

In [ ]:
len(rsm_one_result_full_match.possible_replacement_accession.unique())

In [ ]:
rsm_one_result_full_match[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
                                                                                           'accession': 'alternate_accessions:list'})

Replacements with one result with no matching derived_from


In [ ]:
rsm_one_result_no_match = rsm_one_result[~(rsm_one_result.accession.isin(rsm_one_result_full_match.accession.values))][cols].reset_index(drop=True)

In [ ]:
rsm_one_result_no_match.shape

In [ ]:
rsm_one_result_no_match

In [ ]:
rsm_one_result_no_match.file_format_type_new.value_counts()

In [ ]:
rsm_one_result_no_match[rsm_one_result_no_match.file_format_type_new == "not_available"]

In [ ]:
rsm_one_result_no_match[['derived_from_new', 'derived_from_old']].values

In [ ]:
rsm_one_result_no_match[rsm_one_result_no_match.submitted_by_old == 'J. Seth Strattan']

In [ ]:
# Patch these narrowPeaks that match except for derived_from because upstream Files changed.
sp = rsm_one_result_no_match[rsm_one_result_no_match.submitted_by_old == 'J. Seth Strattan'][['possible_replacement_accession', 'accession']]
sp.rename(columns={'possible_replacement_accession': 'accession',
                   'accession': 'alternate_accessions:list'})#.to_csv('../../one_match_derived_from_mismatch_patch_06_28_2017.tsv', index=False, sep='\t')

Replacements with many results


In [ ]:
rsm_multi_result = rsm[rsm.duplicated('accession', keep=False)].reset_index(drop=True)

In [ ]:
len(rsm_multi_result.accession.unique())

In [ ]:
rsm_multi_result.drop_duplicates('accession', keep='first').reset_index().submitted_by_old.value_counts()

In [ ]:
rsm_multi_result[rsm_multi_result.accession == 'ENCFF719FSK']

Do the groups add back up to the total number of accessions?


In [ ]:
assert len(rsm_zero_result) + len(rsm_one_result) + len(rsm_one_result_patch) + len(rsm_multi_result.accession.unique()) == len(rsm.accession.unique())

Does matching on technical_replicates and derived_from reduce the number of possible replacements with many results?


In [ ]:
matching_rep = rsm_multi_result[(rsm_multi_result.technical_replicates_old == rsm_multi_result.technical_replicates_new)
                                & (rsm_multi_result.derived_from_old == rsm_multi_result.derived_from_new)].reset_index(drop=True)

In [ ]:
len(matching_rep.accession.unique())

Multiresults that now only have one result after matching on technical_replicate and derived_from


In [ ]:
len(matching_rep.drop_duplicates('accession', keep=False).accession.unique())

In [ ]:
rsm_multi_one_result = matching_rep.drop_duplicates('accession', keep=False)[cols].reset_index(drop=True)

In [ ]:
rsm_multi_one_result[cols]

In [ ]:
# rsm_multi_one_result[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
#                                                                                       'accession': 'alternate_accessions:list'}).to_csv('../../multi_one_match_patch_06_27_2017.tsv',
#                                                                                                                                        index=False, sep='\t')

In [ ]:
# Patch multiresults that have one match when matched on tech_rep (only narrowPeaks)
# multi_one_narrow_peaks = rsm_multi_result[(rsm_multi_result.technical_replicates_old == rsm_multi_result.technical_replicates_new)
#                                          & (rsm_multi_result.file_format_type_old == 'narrowPeak')].drop_duplicates('accession', keep=False).reset_index(drop=True)
# multi_one_narrow_peaks[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
#                                                                            'accession': 'alternate_accessions:list'}).to_csv('../../multi_narrow_peaks_tech_rep_match_patch_06_30_2017.tsv', sep='\t', index=False)

Multiresults that still have more than one result after matching on technical_replicate and derived_from


In [ ]:
len(matching_rep[matching_rep.duplicated('accession', keep=False)].accession.unique())

Group by accession and possible_replacement


In [ ]:
cols = ['accession','possible_replacement_accession']
cols = cols + [x for x in sorted(rsm.columns, reverse=True) if (x not in cols) and (x not in ['alternate_accessions_new',
                                                                                              'alternate_accessions_old'])]
dup = matching_rep[matching_rep.duplicated('accession', keep=False)]
mr = dup.groupby(cols).count().reset_index()
dup.groupby(cols).count()

In [ ]:
# # Patch pointing to in progress replacement instead of deleted replacement.
# in_prog_multi_patch = mr[(mr.status_new == 'in progress')
#                          & (mr.accession.isin(['ENCFF219IZI',
#                                                'ENCFF362CIL',
#                                                'ENCFF522EVZ',
#                                                'ENCFF526SQT',
#                                                'ENCFF554QRY',
#                                                'ENCFF799OIZ',
#                                                'ENCFF826MUG',
#                                                'ENCFF832XOD',
#                                                'ENCFF833LEK']))]
# # in_prog_multi_patch[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
# #                                                                                      'accession': 'alternate_accessions:list'})\
# #                                         .to_csv('../../multi_result_point_to_in_progress_patch_06_28_2017.tsv', index=False, sep='\t')
# in_prog_multi_patch

In [ ]:
# # Patch pointing to released replacement instead of revoked replacement.
# released_multi_patch = mr[(mr.status_new == 'released')
#                            & (mr.accession.isin(['ENCFF311CTD',
#                                                  'ENCFF442FSP',
#                                                  'ENCFF521DYG',
#                                                  'ENCFF660PBO',
#                                                  'ENCFF723DLE',
#                                                  'ENCFF758WLI',
#                                                  'ENCFF803YCX',
#                                                  'ENCFF809POG']))]
# # released_multi_patch[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
# #                                                                                       'accession': 'alternate_accessions:list'})\
# #                                     .to_csv('../../multi_result_point_to_released_patch_06_28_2017.tsv', index=False, sep='\t')
# released_multi_patch

In [ ]:
# # Patch these as deleted because merged fasta that was never released
# mr.loc[mr.submitted_by_old == 'Xintao Wei', 'status_old'] = 'deleted'
# mr[mr.submitted_by_old == 'Xintao Wei'].drop_duplicates('accession')[['@id_old', 'status_old']].rename(columns={'status_old': 'status', '@id_old': '@id'}).to_csv('../../two_match_to_deleted_patch_06_29_2017.tsv', sep='\t', index=False)

Multiresults that don't match on technical_replicates or derived_from


In [ ]:
no_matching_rep = rsm_multi_result[~(rsm_multi_result.accession.isin(matching_rep.accession.unique()))].reset_index(drop=True)

In [ ]:
len(no_matching_rep.accession.unique())

In [ ]:
# Note: multi_tech_match is defined several cells below; this cell was run after it.
no_matching_rep[~(no_matching_rep.accession.isin(multi_tech_match.accession)) & (no_matching_rep.submitted_by_old == "J. Seth Strattan")]['@id_old'].unique()

Multiresults that have matching technical_replicates but not derived_from


In [ ]:
len(no_matching_rep[(no_matching_rep.technical_replicates_old == no_matching_rep.technical_replicates_new)
                & (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].accession.unique())

In [ ]:
no_matching_rep[(no_matching_rep.technical_replicates_old == no_matching_rep.technical_replicates_new)
                & (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].drop_duplicates('accession', keep=False)

In [ ]:
multi_tech_match = no_matching_rep[(no_matching_rep.technical_replicates_old == no_matching_rep.technical_replicates_new)
                                   & (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)]
multi_tech_match.groupby(cols).count()

In [ ]:
multi_tech_match.superseded_by_old.value_counts()

In [ ]:
multi_tech_match[multi_tech_match.superseded_by_old == 'empty_list'][cols]

In [ ]:
multi_tech_match[multi_tech_match.supersedes_new != 'not_available'][cols]

In [ ]:
multi_tech_match.supersedes_old.value_counts()

One result after matching on technical_replicate


In [ ]:
multi_tech_one_match = multi_tech_match.drop_duplicates('accession', keep=False)
len(multi_tech_one_match.accession.unique())

In [ ]:
multi_tech_one_match.submitted_by_old.value_counts()

In [ ]:
pd.crosstab(multi_tech_one_match.output_type_old, multi_tech_one_match.submitted_by_old, margins=False)

In [ ]:
multi_tech_one_match

In [ ]:
# Delete because no matching derived_from
#multi_tech_one_match[['@id_old', 'status_old']].rename(columns={'@id_old': '@id', 'status_old': 'status'}).to_csv('../../no_matching_derived_from_delete_patch_07_03_2017.tsv', index=False, sep='\t')

In [ ]:
multi_tech_one_match.file_format_old.value_counts()

In [ ]:
multi_tech_one_match[(multi_tech_one_match.output_type_old != 'alignments')][cols]

In [ ]:
multi_tech_one_match[(multi_tech_one_match.submitted_by_old == 'Xintao Wei')
                    & (multi_tech_one_match.output_type_old != 'alignments')][cols]#[['@id_old', 'possible_replacement_accession']].values

In [ ]:
multi_tech_one_match.groupby(cols).count()

In [ ]:
multi_tech_one_match.file_format_type_old.value_counts()

In [ ]:
multi_tech_one_match[multi_tech_one_match.submitted_by_old == "Jean Davidson"][cols]

Multiresult after matching on technical_replicate


In [ ]:
len(multi_tech_match[multi_tech_match.duplicated('accession', keep=False)].accession.unique())

In [ ]:
mtm = multi_tech_match[multi_tech_match.duplicated('accession', keep=False)]
mtm.groupby(cols).count()

In [ ]:
mtm[mtm.submitted_by_old == 'Jean Davidson'].groupby(cols).count()

In [ ]:
mtm[mtm.submitted_by_old == 'J. Seth Strattan'].groupby(cols).count()

Multiresults that have matching derived_from but not technical_replicates


In [ ]:
no_matching_rep[(no_matching_rep.technical_replicates_old != no_matching_rep.technical_replicates_new)
                & (no_matching_rep.derived_from_old == no_matching_rep.derived_from_new)].shape

In [ ]:
no_matching_rep[(no_matching_rep.technical_replicates_old != no_matching_rep.technical_replicates_new)
                & (no_matching_rep.derived_from_old == no_matching_rep.derived_from_new)].groupby(cols).count()

Multiresults that have mismatching derived_from and technical_replicates


In [ ]:
len(no_matching_rep[(no_matching_rep.technical_replicates_old != no_matching_rep.technical_replicates_new)
                & (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].accession.unique())

In [ ]:
no_matching_rep[(no_matching_rep.technical_replicates_old != no_matching_rep.technical_replicates_new)
                & (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].groupby(cols).count()

In [ ]:
cols = ['accession','possible_replacement_accession']
cols = cols + [x for x in sorted(matching_rep.columns, reverse=True) if (x not in cols) and (x not in ['alternate_accessions_new',
                                                                                                       'alternate_accessions_old'])]
no_matching_rep.groupby(cols).count()

Accessions with multiple results that don't match on technical_replicates or derived_from


In [ ]:
# Same filter as no_matching_rep above, recomputed for the mismatch summary.
mis_matching_rep = rsm_multi_result[~(rsm_multi_result.accession.isin(matching_rep.accession))].reset_index(drop=True)

In [ ]:
len(mis_matching_rep.accession.unique())

In [ ]:
mis_matching_rep[['technical_replicates_old','technical_replicates_new', 'derived_from_old', 'derived_from_new']]

Pull all accessions ready for patching


In [ ]:
replacement_patch = pd.concat([rsm_patch,
                               rsm_one_result_full_match,
                               rsm_multi_one_result])
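
Since the three groups were constructed to be disjoint, each replaced accession should appear exactly once here; a quick check in the spirit of the assert above:


In [ ]:
# Sketch: each replaced accession should map to exactly one replacement.
assert not replacement_patch.accession.duplicated().any()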

In [ ]:
# Squash list for patching: one row per replacement, with all replaced
# accessions it stands in for joined into a single field.
patch_list = []
for replacement_accession in replacement_patch.possible_replacement_accession.unique():
    values = replacement_patch[replacement_patch.possible_replacement_accession == replacement_accession]['accession']
    patch_list.append({'accession': replacement_accession,
                       'alternate_accessions:array': ', '.join(values)})
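
The same squash can be written as a single groupby; a minimal sketch, assuming replacement_patch as built above, that should produce the same table:


In [ ]:
# Sketch: groupby equivalent of the squash loop above.
patch_data_alt = (replacement_patch
                  .groupby('possible_replacement_accession')['accession']
                  .apply(', '.join)
                  .reset_index()
                  .rename(columns={'possible_replacement_accession': 'accession',
                                   'accession': 'alternate_accessions:array'}))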

In [ ]:
patch_data = pd.DataFrame(patch_list)

In [ ]:
#patch_data.to_csv("replaced_with_matching_replacements_patch_06_21_2017.tsv", sep="\t", index=False)

In [ ]:
# Creation dates of replaced items that have no candidate replacement.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[14, 8])
    sns.set_style('whitegrid')
    sns.stripplot(x='date_created_old',
                  data=rsm[rsm.possible_replacement_accession == 'no_result'],
                  size=10,
                  color='black',
                  alpha=0.8)

Biosamples


In [ ]:
biosamples = no_redirect_accessions[no_redirect_accessions['@type'] == 'Biosample']

In [ ]:
biosamples.submitted_by.value_counts()

In [ ]:
na = 'not_available'
possible_replacements = defaultdict(list)

async def get_request_two(session, url, r):
    async with session.get(url, auth=request_auth) as response_two:
        result_two = await response_two.json()
        search_results = result_two['@graph']
        if len(search_results) == 0:
            possible_replacements[r['accession']].append({'accession': r['accession'],
                                                          'possible_replacement_accession': 'no_result'})
        for result in search_results:
            lab = result.get('lab', {})
            sub_by = result.get('submitted_by', {})
            if isinstance(sub_by, str):
                submitted_by = sub_by
            else:
                submitted_by = sub_by.get('title', na)
            if isinstance(lab, str):
                lab_name = lab
            else:
                lab_name = lab.get('name', na)
            possible_replacements[r['accession']].append({'accession': r['accession'],
                                                          'possible_replacement_accession': result['accession'],
                                                          '@id': result['@id'],
                                                          'alternate_accessions': result.get('alternate_accessions', na),
                                                          'dataset': result.get('dataset', na),
                                                          'lab_name': lab_name,
                                                          'date_created': result.get('date_created', na),
                                                          '@type': result['@type'][0],
                                                          'output_type': result.get('output_type', na),
                                                          'file_format': result.get('file_format', na),
                                                          'assembly': result.get('assembly', na),
                                                          'paired_with': result.get('paired_with', na),
                                                          'paired_end': result.get('paired_end', na),
                                                          'file_format_type': result.get('file_format_type', na),
                                                          'technical_replicates': result.get('technical_replicates', na),
                                                          'replicate_uuid': result.get('replicate', {}).get('uuid', na),
                                                          'md5sum': result.get('md5sum', na),
                                                          'content_md5sum': result.get('content_md5sum', na),
                                                          'status': result['status'],
                                                          'submitted_by': submitted_by,
                                                          'derived_from': result.get('derived_from', na),
                                                          'superseded_by': result.get('superseded_by', na),
                                                          'supersedes': result.get('supersedes', na)})

async def get_request_one(session, file_id):
    url = 'https://www.encodeproject.org/{}/?format=json'.format(file_id)
    async with session.get(url, auth=request_auth) as response_one:
        r = await response_one.json()
        file_format = r['file_format']
        output_type = r['output_type']
        dataset = r['dataset']
        assembly = r.get('assembly', '*')
        try:
            assay_term_name = r['quality_metrics'][0]['assay_term_name']
            url = 'https://www.encodeproject.org/search/?type=File&file_format={}'\
                  '&output_type={}&quality_metrics.assay_term_name={}'\
                  '&dataset={}&assembly={}&format=json&frame=embedded'\
                  '&status!=replaced'.format(file_format,
                                             output_type,
                                             assay_term_name,
                                             dataset,
                                             assembly)
        except IndexError:
            url = 'https://www.encodeproject.org/search/?type=File&file_format={}'\
                  '&output_type={}&dataset={}&assembly={}&format=json&frame=embedded'\
                  '&status!=replaced'.format(file_format,
                                             output_type,
                                             dataset,
                                             assembly)
        if assembly == '*':
            # Old file has no assembly: match candidates that also lack one.
            url = url.replace('&assembly=*', '&assembly!=*')
        await get_request_two(session, url, r)

async def create_session(file_ids, loop):
    conn = aiohttp.TCPConnector(keepalive_timeout=10, limit=100)
    async with aiohttp.ClientSession(connector=conn, loop=loop) as session:
        await asyncio.gather(*[get_request_one(session, file_id) for file_id in file_ids])
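
The cell that drives these coroutines is not shown in this section; a minimal invocation might look like the sketch below (file_ids is an assumption; substitute the actual iterable of File @ids):


In [ ]:
# Sketch: drive the File replacement search defined above.
# file_ids is an assumed name; the real notebook may build it differently.
file_ids = no_redirect_accessions[no_redirect_accessions['@type'] == 'File']['@id'].unique()
loop = asyncio.get_event_loop()
loop.run_until_complete(create_session(file_ids, loop))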

In [ ]:
na = 'not_available'
possible_biosample_replacements = defaultdict(list)
for biosample_id in biosamples['@id'].unique():
    r = requests.get('https://www.encodeproject.org/{}/?format=json'.format(biosample_id),
                     auth=(key.authid, key.authpw))
    r = r.json()
    lab_old = r.get('lab', {})
    if isinstance(lab_old, str):
        lab_name_old = lab_old
    else:
        lab_name_old = lab_old.get('name', na)
    donor_old = r.get('donor', {})
    if isinstance(donor_old, str):
        donor_name_old = donor_old
    else:
        donor_name_old = donor_old.get('@id', na)
    sub_by_old = r.get('submitted_by', {})
    if isinstance(sub_by_old, str):
        submitted_by_old = sub_by_old
    else:
        submitted_by_old = sub_by_old.get('title', na)
    try:
        product_id = r['product_id']
        health_status = r['health_status']
        culture_start_date = r['culture_start_date']
        url = 'https://www.encodeproject.org/search/'\
              '?type=Biosample&product_id={}'\
              '&health_status={}&culture_start_date={}'\
              '&status%21=replaced&format=json&frame=embedded'.format(product_id,
                                                                      health_status,
                                                                      culture_start_date)
    except KeyError:
        description = r['description']
        url = 'https://www.encodeproject.org/search/'\
              '?type=Biosample&description={}'\
              '&status%21=replaced&format=json&frame=embedded'.format(description)
    search_results = requests.get(url, auth=(key.authid, key.authpw))
    search_results = search_results.json()['@graph']
    if len(search_results) == 0:
        possible_biosample_replacements[r['accession']].append({'accession': r['accession'],
                                                                'possible_replacement_accession': 'no_result'})
    for result in search_results:
        lab_new = result.get('lab', {})
        if isinstance(lab_new, str):
            lab_name_new = lab_new
        else:
            lab_name_new = lab_new.get('name', na)
        donor_new = result.get('donor', {})
        if isinstance(donor_new, str):
            donor_name_new = donor_new
        else:
            donor_name_new = donor_new.get('@id', na)
        sub_by_new = result.get('submitted_by', {})
        if isinstance(sub_by_new, str):
            submitted_by_new = sub_by_new
        else:
            submitted_by_new = sub_by_new.get('title', na)
        possible_biosample_replacements[r['accession']].append({'accession': r['accession'],
                                                                'possible_replacement_accession': result['accession'],
                                                                '@id_old': r['@id'],
                                                                '@id_new': result['@id'],
                                                                'alternate_accessions_old': r.get('alternate_accessions', na),
                                                                'alternate_accessions_new': result.get('alternate_accessions', na),
                                                                'donor_old': donor_name_old,
                                                                'donor_new': donor_name_new,
                                                                'lab_name_old': lab_name_old,
                                                                'lab_name_new': lab_name_new,
                                                                'date_created_old': r.get('date_created', na),
                                                                'date_created_new': result.get('date_created', na),
                                                                '@type_old': r['@type'][0],
                                                                '@type_new': result['@type'][0],
                                                                'status_old': r['status'],
                                                                'status_new': result['status'],
                                                                'product_id_old': r.get('product_id', na),
                                                                'product_id_new': result.get('product_id', na),
                                                                'health_status_old': r.get('health_status', na),
                                                                'health_status_new': result.get('health_status', na),
                                                                'culture_start_date_old': r.get('culture_start_date', na),
                                                                'culture_start_date_new': result.get('culture_start_date', na),
                                                                'biosample_type_old': r['biosample_type'],
                                                                'biosample_type_new': result['biosample_type'],
                                                                'treatment_old': r['treatments'],
                                                                'treatment_new': result['treatments'],
                                                                'biosample_term_name_old': r['biosample_term_name'],
                                                                'biosample_term_name_new': result['biosample_term_name'],
                                                                'summary_old': r['summary'],
                                                                'summary_new': result['summary'],
                                                                'description_old': r['description'],
                                                                'description_new': result['description'],
                                                                'pooled_from_old': r.get('pooled_from', na),
                                                                'pooled_from_new': result.get('pooled_from', na),
                                                                'part_of_old': r.get('part_of', na),
                                                                'part_of_new': result.get('part_of', na),
                                                                'culture_harvest_date_old': r.get('culture_harvest_date', na),
                                                                'culture_harvest_date_new': result.get('culture_harvest_date', na),
                                                                'passage_number_old': r.get('passage_number', na),
                                                                'passage_number_new': result.get('passage_number', na),
                                                                'lot_id_old': r.get('lot_id', na),
                                                                'lot_id_new': result.get('lot_id', na),
                                                                'submitted_by_old': submitted_by_old,
                                                                'submitted_by_new': submitted_by_new})

In [ ]:
len(possible_biosample_replacements)

In [ ]:
possible_biosample_replacements

In [ ]:
replacement_search = pd.DataFrame([item for rows in possible_biosample_replacements.values() for item in rows])
replacement_search = replacement_search.fillna('isnull')
replacement_search.loc[replacement_search.alternate_accessions_old.apply(lambda x: len(x) == 0), 'alternate_accessions_old'] = 'empty_list'
replacement_search.loc[replacement_search.alternate_accessions_new.apply(lambda x: len(x) == 0), 'alternate_accessions_new'] = 'empty_list'
#replacement_search.loc[replacement_search.pooled_from_old.apply(lambda x: len(x) == 0), 'pooled_from_old'] = 'empty_list'
#replacement_search.loc[replacement_search.pooled_from_new.apply(lambda x: len(x) == 0), 'pooled_from_new'] = 'empty_list'

In [ ]:
replacement_search.shape

In [ ]:
# parse_list sorts the characters of a string the same way it sorts a list of
# values, so the sentinel strings ('empty_list', 'isnull', 'not_available')
# come out scrambled; lazy_dict maps those scrambled forms back to the sentinels.
lazy_dict = {'_,e,i,l,m,p,s,t,t,y': 'empty_list',
             'i,l,l,n,s,u': 'isnull',
             '_,a,a,a,b,e,i,l,l,n,o,t,v': 'not_available'}

def parse_list(x):
    # Canonicalize a list (or string) by sorting and comma-joining its elements.
    return ','.join([y.strip() for y in sorted(x)])

replacement_search.date_created_old = replacement_search.date_created_old.apply(lambda x: pd.to_datetime(x))
replacement_search.date_created_new = replacement_search.date_created_new.apply(lambda x: pd.to_datetime(x))
for field in ['treatment_new',
              'treatment_old',
              'alternate_accessions_old',
              'alternate_accessions_new',
              'pooled_from_new',
              'pooled_from_old',
              'part_of_new',
              'part_of_old']:
    replacement_search[field] = (replacement_search[field]
                                 .apply(parse_list)
                                 .apply(lambda x: lazy_dict.get(x, x)))
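
To see why lazy_dict is needed: parse_list iterates over whatever it is given, so the sentinel strings are sorted character by character, and lazy_dict maps them back. A quick sanity check:


In [ ]:
# Sketch: sentinel strings round-trip through parse_list and lazy_dict.
for sentinel in ['empty_list', 'isnull', 'not_available']:
    scrambled = parse_list(sentinel)
    assert lazy_dict[scrambled] == sentinel
    print(sentinel, '->', scrambled)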

In [ ]:
# bcols is defined two cells below; this cell assumes that cell has already been run.
bcols

In [ ]:
biosamples_one_match = replacement_search.drop_duplicates('accession', keep=False)

In [ ]:
first_cols = ['accession', 'possible_replacement_accession']
bcols = first_cols + [col for col in sorted(biosamples_one_match.columns, reverse=True) if col not in first_cols]

In [ ]:
biosamples[biosamples['@id'].isin(replacement_search['@id_old'])].shape

In [ ]:
biosamples_one_match[bcols].lab_name_old.value_counts()

In [ ]:
biosamples_one_match[bcols]

In [ ]:
flat_patch = []
for replacement in bs_patch.possible_replacement_accession.unique():
    data = {'accession': replacement,
            'alternate_accessions:array': ", ".join(bs_patch[bs_patch.possible_replacement_accession == replacement].accession.values)}
    flat_patch.append(data)

In [ ]:
fp = pd.DataFrame(flat_patch)

In [ ]:
# fp.to_csv('../../biosample_one_match_patch_07_03_2017.tsv', sep='\t', index=False)

In [ ]:
# Depends on bs_multi_match (and bs), defined in the cells below; these cells were run out of order.
biosamples_multi_match[biosamples_multi_match.accession.isin(bs_multi_match.accession)]

In [ ]:
# bs is defined in the next cell. Keep candidates whose identifying fields all
# match the replaced biosample.
bs_multi_match = bs[(bs.donor_old == bs.donor_new)
                    & (bs.passage_number_old == bs.passage_number_new)
                    & (bs.lot_id_old == bs.lot_id_new)
                    & (bs.product_id_old == bs.product_id_new)
                    & (bs.culture_harvest_date_old == bs.culture_harvest_date_new)
                    & (bs.culture_start_date_old == bs.culture_start_date_new)]

In [ ]:
bs = biosamples_multi_match

In [ ]:
bs_multi_match.submitted_by_old.value_counts()

In [ ]:
bs_multi_match.groupby(bcols).count()

In [ ]:
ANTIBODIES
product_id: "A301-145A"
@type: "AntibodyLot"
targets.gene_name: "NCOR1"
antigen_description: "Nuclear Receptor corepressor 1; N-CoR, TRAC1, KIAA1047, hN-CoR"
source.title: "Bethyl Labs"

https://www.encodeproject.org/search/?type=AntibodyLot&targets.gene_name=NCOR1&source.title=Bethyl+Labs&product_id=A301-145A&status%21=replaced

BIOSAMPLE
biosample_type: "immortalized cell line"
treatments: []
lab.name: "gene-yeo"
culture_start_date: "2015-06-12"
health_status: "hepatocellular carcinoma"
product_id: "HB-8065"
biosample_term_name: "HepG2"
@type: "Biosample"
donor.@id: "/human-donors/ENCDO000AAC/"
summary: "Homo sapiens HepG2 immortalized cell line"
life_stage: "child"
source.title: "ATCC"

https://www.encodeproject.org/search/
?type=Biosample&product_id=HB-8065
&health_status=hepatocellular+carcinoma
&culture_start_date=2015-06-12&status%21=replaced


FILE
quality_metrics.assay_term_name: "ChIP-seq"
file_type: "bam"
assembly: "hg19"
lab.name: "encode-processing-pipeline"
output_category: "alignment"
analysis_step_version.analysis_step.name: "bwa-raw-alignment-step-v-1"
biological_replicates: 1
technical_replicates: ["1_1"]
https://www.encodeproject.org/search/?type=File&file_format=bam
&output_type=alignments&quality_metrics.assay_term_name=ChIP-seq
&dataset=%2Fexperiments%2FENCSR021JFW%2F&assembly=hg19
        
LIBRARY
nucleic_acid_term_name: "DNA"
library_size_selection_method: "SPRI beads"
strand_specificity: false
fragmentation_method: "shearing (Covaris S2)"
aliases: "tim-reddy:hic_dex.t0_brep1_lib"
lab: "/labs/tim-reddy/"
crosslinking_method: "formaldehyde"
biosample.summary: "Homo sapiens A549 immortalized cell line"
biosample.biosample_term_name: "A549"
https://www.encodeproject.org/search/?type=Library
&lab=%2Flabs%2Fthomas-gingeras%2F
&nucleic_acid_term_name=polyadenylated+mRNA
&strand_specificity=true&depleted_in_term_name=rRNA
&biosample.biosample_term_name=NCI-H460
&biosample.%40id=%2Fbiosamples%2FENCBS814QPR%2F&status%21=replaced
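
These field sets suggest one generic way to build the deduplication searches; a minimal sketch (build_replacement_search is a hypothetical helper, shown here with the antibody example above):


In [ ]:
from urllib.parse import urlencode

def build_replacement_search(item_type, fields):
    # Hypothetical helper: search on identifying fields, excluding replaced items.
    # The 'status!' key encodes to status%21, i.e. status != replaced.
    params = [('type', item_type)] + sorted(fields.items()) + [('status!', 'replaced'),
                                                               ('format', 'json')]
    return 'https://www.encodeproject.org/search/?' + urlencode(params)

url = build_replacement_search('AntibodyLot', {'targets.gene_name': 'NCOR1',
                                               'source.title': 'Bethyl Labs',
                                               'product_id': 'A301-145A'})
print(url)
# r = requests.get(url, auth=(key.authid, key.authpw)).json()['@graph']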