In [ ]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import asyncio
import aiohttp
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
from ast import literal_eval
from collections import defaultdict
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
In [ ]:
# Copied from pyencoded-tools/encodedcc.py to avoid dependency.
class ENC_Key:
def __init__(self, keyfile, keyname):
if os.path.isfile(str(keyfile)):
keys_f = open(keyfile, 'r')
keys_json_string = keys_f.read()
keys_f.close()
keys = json.loads(keys_json_string)
else:
keys = keyfile
key_dict = keys[keyname]
self.authid = key_dict['key']
self.authpw = key_dict['secret']
self.server = key_dict['server']
if not self.server.endswith("/"):
self.server += "/"
class ENC_Connection(object):
def __init__(self, key):
self.headers = {'content-type': 'application/json', 'accept': 'application/json'}
self.server = key.server
self.auth = (key.authid, key.authpw)
In [ ]:
# Define key if private data desired.
key = ENC_Key(os.path.expanduser("~/keypairs.json"), 'prod')
In [ ]:
# Pull accessions of all Items with replaced status.
url = 'https://www.encodeproject.org/search/'\
'?type=File&type=Dataset&type=Donor&type=Library'\
'&type=Pipeline&type=Biosample&type=AntibodyLot&status=replaced'\
'&limit=all&format=json'
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
In [ ]:
len(search_results)
In [ ]:
accessions = set()
for result in search_results:
accessions.add(result['accession'])
In [ ]:
len(accessions)
In [ ]:
# loop.close()
# loop = asyncio.new_event_loop()
# asyncio.set_event_loop(loop)
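In [ ]:
# Note (not part of the original run): on newer ipykernel versions the notebook
# already runs an asyncio event loop, so loop.run_until_complete() in the cells
# below can raise "This event loop is already running". One hedged workaround is
# the third-party nest_asyncio package, which patches the loop to allow nesting.
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass  # fall back to the loop-reset cells used in the original run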
In [ ]:
# Asyncio request.
result_length = []
bad_accessions = []
request_auth = aiohttp.BasicAuth(key.authid, key.authpw)
async def get_json(url, sem):
async with sem:
async with aiohttp.ClientSession() as session:
async with session.get(url, auth=request_auth) as resp:
return await resp.json()
async def get_request(accession, sem):
url = 'https://www.encodeproject.org/'\
'search/?type=Item&accession={}'\
'&limit=all&format=json'.format(accession)
result = await get_json(url, sem)
search_results = result['@graph']
num_results = len(search_results)
result_length.append({'accession': accession,
'result_length': num_results})
if num_results > 1:
bad_accessions.append({'accession': accession,
'results': search_results})
sem = asyncio.Semaphore(20)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*[get_request(accession, sem) for accession in accessions]));
In [ ]:
# # Search for each accession, count number of results.
# counter = 0
# result_length = []
# bad_accessions = []
# for accession in accessions:
# url = 'https://www.encodeproject.org/search/'\
# '?type=Item&accession={}'\
# '&limit=all&format=json'.format(accession)
# r = requests.get(url, auth=(key.authid, key.authpw))
# search_results = r.json()['@graph']
# result_length.append({'accession': accession,
# 'result_length': len(search_results)})
# if len(search_results) > 1:
# bad_accessions.append({'accession': accession,
# 'results': search_results})
# counter += 1
# if counter % 100 == 0:
# print(".", end="")
# if counter % 1000 == 0:
# print("\n")
In [ ]:
# Make sure search results were returned for each accession.
#assert len(accessions) == counter
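In [ ]:
# Equivalent check for the asyncio path above (sketch): get_request appends one
# record per accession, so the counts should match once the gather completes.
assert len(result_length) == len(accessions)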
In [ ]:
pd.DataFrame(result_length).result_length.value_counts()
In [ ]:
len(bad_accessions)
In [ ]:
bad_accessions[0]
In [ ]:
duplicate_accession_data = []
for bad in bad_accessions:
for item in bad['results']:
duplicate_accession_data.append({'accession': item['accession'],
'file_format': item['file_format'],
'status': item['status'],
'dataset': item['dataset']})
In [ ]:
duplicate_accessions = pd.DataFrame(duplicate_accession_data)
In [ ]:
duplicate_accessions.dataset.value_counts()
In [ ]:
experiment_list = duplicate_accessions.dataset.unique()
In [ ]:
search_ids = "&@id=".join(experiment_list)
url = 'https://www.encodeproject.org/search/'\
'?type=Item&limit=all&frame=embedded&@id={}'.format(search_ids)
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
search_id_map = {}
for experiment in search_results:
search_id_map[experiment['@id']] = experiment['lab']['name']
In [ ]:
duplicate_accessions['lab'] = duplicate_accessions.dataset.apply(lambda x: search_id_map[x])
In [ ]:
print(*sorted(duplicate_accessions.lab.unique()), sep='\n')
In [ ]:
list(duplicate_accessions.accession.unique())
In [ ]:
duplicate_accessions[duplicate_accessions.status == "replaced"].groupby(['lab',
'accession',
'status',
'file_format']).count().sort_index(0)[[]]
In [ ]:
duplicate_accessions.groupby(['lab',
'status',
'dataset',
'accession',
'file_format']).count().sort_index(1, 0)
In [ ]:
duplicate_accessions.groupby(['accession',
                              'status',
                              'file_format',
                              'lab',
                              'dataset']).count().sort_index(axis=1, level=0).unstack()
In [ ]:
duplicate_accessions
In [ ]:
# Grab data of all replaced Items.
replaced_data = []
url = 'https://www.encodeproject.org/search/'\
'?type=File&type=Dataset&type=Donor&type=Library'\
'&type=Pipeline&type=Biosample&type=AntibodyLot&status=replaced'\
'&frame=embedded&limit=all&format=json'
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
na = 'not_available'
for result in search_results:
sub_by = result.get('submitted_by', {})
if isinstance(sub_by, str):
submitted_by = sub_by
else:
submitted_by = sub_by.get('title', na)
lab = result.get('lab', {})
if isinstance(lab, str):
lab_name = lab
else:
lab_name = lab.get('name', na)
item_data = {'accession': result['accession'],
'submitted_by': submitted_by,
'derived_from': result.get('derived_from', na),
'superseded_by': result.get('superseded_by', na),
'supersedes': result.get('supersedes', na),
'@id': result['@id'],
'alternate_accessions': result.get('alternate_accessions', na),
'dataset': result.get('dataset', na),
'lab_name': lab_name,
'date_created': result.get('date_created', na),
'@type': result['@type'][0],
'output_type': result.get('output_type', na),
'file_format': result.get('file_format', na),
'assembly': result.get('assembly', na),
'paired_with': result.get('paired_with', na),
'paired_end': result.get('paired_end', na),
'file_format_type': result.get('file_format_type', na),
'technical_replicates': result.get('technical_replicates', na),
'replicate_uuid': result.get('replicate', {}).get('uuid', na),
'md5sum': result.get('md5sum', na),
'content_md5sum': result.get('content_md5sum', na),
'status': result['status'],
'product_id': result.get('product_id', na),
'culture_start_date': result.get('culture_start_date', na),
'biosample_type': result.get('biosample_type', na),
'description': result.get('description', na),
'treatments': result.get('treatments', na)
}
replaced_data.append(item_data)
In [ ]:
replaced_data[900]
In [ ]:
len(replaced_data)
In [ ]:
def parse_lab_name(lab):
if isinstance(lab, str):
parse_lab = lab.replace("/", "").replace("labs", "")
return parse_lab
else:
return lab[0]
In [ ]:
rd = pd.DataFrame(replaced_data)
rd.lab_name = rd.lab_name.apply(lambda x: parse_lab_name(x))
rd.loc[rd.assembly.apply(lambda x: len(x) == 0), 'assembly'] = 'empty_list'
rd.loc[rd.superseded_by.apply(lambda x: len(x) == 0), 'superseded_by'] = 'empty_list'
rd.loc[rd.supersedes.apply(lambda x: len(x) == 0), 'supersedes'] = 'empty_list'
rd.loc[rd.derived_from.apply(lambda x: len(x) == 0), 'derived_from'] = 'empty_list'
rd.loc[rd.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
rd.loc[rd.alternate_accessions.apply(lambda x: len(x) == 0), 'alternate_accessions'] = 'empty_list'
rd.loc[rd.treatments.apply(lambda x: len(x) == 0), 'treatments'] = 'empty_list'
In [ ]:
def drop_unique_fields(data):
    # Drop fields that are unique to every item so otherwise-identical
    # replaced/replacement records compare equal.
    drop_fields = ['@id',
                   'accession',
                   'md5sum',
                   'content_md5sum',
                   'date_created']
    return {k: v for k, v in data.items() if k not in drop_fields}
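In [ ]:
# Quick sanity check on drop_unique_fields (sketch; the two records are hypothetical,
# not real ENCODE items): records differing only in the dropped per-item fields
# should compare equal.
a = {'@id': '/files/AAA/', 'md5sum': '1', 'date_created': '2017-01-01', 'status': 'replaced'}
b = {'@id': '/files/BBB/', 'md5sum': '2', 'date_created': '2017-02-01', 'status': 'replaced'}
assert drop_unique_fields(a) == drop_unique_fields(b)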
In [ ]:
replacement_data = []
broken_pair = defaultdict(list)
for accession in rd.accession.unique():
replaced_values = rd[rd.accession == accession].to_dict(orient='records')[0]
url = 'https://www.encodeproject.org/{}/?format=json'.format(accession)
r = requests.get(url, auth=(key.authid, key.authpw))
if (r.status_code == 200):
result = r.json()
sub_by = result.get('submitted_by', {})
if isinstance(sub_by, str):
submitted_by = sub_by
else:
submitted_by = sub_by.get('title', na)
lab = result.get('lab', {})
if isinstance(lab, str):
lab_name = lab
else:
lab_name = lab.get('name', na)
item_data = {'accession': result['accession'],
'submitted_by': submitted_by,
'@id': result['@id'],
'alternate_accessions': result.get('alternate_accessions', na),
'dataset': result.get('dataset', na),
'lab_name': lab_name,
'date_created': result.get('date_created', na),
'@type': result['@type'][0],
'output_type': result.get('output_type', na),
'file_format': result.get('file_format', na),
'assembly': result.get('assembly', na),
'paired_with': result.get('paired_with', na),
'paired_end': result.get('paired_end', na),
'file_format_type': result.get('file_format_type', na),
'technical_replicates': result.get('technical_replicates', na),
'replicate_uuid': result.get('replicate', {}).get('uuid', na),
'md5sum': result.get('md5sum', na),
'content_md5sum': result.get('content_md5sum', na),
'status': result['status'],
'product_id': result.get('product_id', na),
'culture_start_date': result.get('culture_start_date', na),
'biosample_type': result.get('biosample_type', na),
'description': result.get('description', na),
'treatments': result.get('treatments', na)
}
item_temp = pd.DataFrame([item_data])
item_temp.lab_name = item_temp.lab_name.apply(lambda x: parse_lab_name(x))
item_temp.loc[item_temp.assembly.apply(lambda x: len(x) == 0), 'assembly'] = 'empty_list'
item_temp.loc[item_temp.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
item_temp.loc[item_temp.alternate_accessions.apply(lambda x: len(x) == 0), 'alternate_accessions'] = 'empty_list'
item_temp.loc[item_temp.treatments.apply(lambda x: len(x) == 0), 'treatments'] = 'empty_list'
item_temp = item_temp.to_dict(orient='records')[0]
        # Compare the replaced record against what its accession now resolves to,
        # ignoring the per-item fields dropped above.
        replaced_dict = drop_unique_fields(replaced_values)
        replacement_dict = drop_unique_fields(item_temp)
        if replaced_dict != replacement_dict:
            broken_pair[accession].append(item_data)
        replacement_data.append(item_data)
In [ ]:
len(replacement_data)
In [ ]:
# loop.close()
# loop = asyncio.new_event_loop()
# asyncio.set_event_loop(loop)
In [ ]:
# Asyncio request.
replaced_by_file = []
na = 'not_available'
async def get_request(session, accession):
url = 'https://www.encodeproject.org/{}'.format(accession)
async with session.get(url, auth=request_auth, timeout=None) as response:
if response.status == 404:
item_data = {'searched_accession': accession,
'redirected_to_accession': 'no_result'}
replaced_by_file.append(item_data)
else:
result = await response.json()
sub_by = result.get('submitted_by', {})
if isinstance(sub_by, str):
submitted_by = sub_by
else:
submitted_by = sub_by.get('title', na)
lab = result.get('lab', {})
if isinstance(lab, str):
lab_name = lab
else:
lab_name = lab.get('name', na)
item_data = {'accession': result['accession'],
'submitted_by': submitted_by,
'derived_from': result.get('derived_from', na),
'superseded_by': result.get('superseded_by', na),
'supersedes': result.get('supersedes', na),
'@id': result['@id'],
'alternate_accessions': result.get('alternate_accessions', na),
'dataset': result.get('dataset', na),
'lab_name': lab_name,
'date_created': result.get('date_created', na),
'@type': result['@type'][0],
'output_type': result.get('output_type', na),
'file_format': result.get('file_format', na),
'assembly': result.get('assembly', na),
'paired_with': result.get('paired_with', na),
'paired_end': result.get('paired_end', na),
'file_format_type': result.get('file_format_type', na),
'technical_replicates': result.get('technical_replicates', na),
'replicate_uuid': result.get('replicate', {}).get('uuid', na),
'md5sum': result.get('md5sum', na),
'content_md5sum': result.get('content_md5sum', na),
'status': result['status'],
'product_id': result.get('product_id', na),
'culture_start_date': result.get('culture_start_date', na),
'biosample_type': result.get('biosample_type', na),
'description': result.get('description', na),
'treatments': result.get('treatments', na)}
replaced_by_file.append(item_data)
if len(replaced_by_file) % 100 == 0:
print(len(replaced_by_file))
async def create_session(accessions, loop):
connector = aiohttp.TCPConnector(keepalive_timeout=10, limit=100)
async with aiohttp.ClientSession(connector=connector, loop=loop) as session:
results = await asyncio.gather(*[get_request(session, accession) for accession in accessions])
In [ ]:
loop = asyncio.get_event_loop()
loop.run_until_complete(create_session(accessions, loop))
In [ ]:
len(replaced_by_file)
In [ ]:
len(accessions)
In [ ]:
# # Asyncio request.
# request_auth = aiohttp.BasicAuth(key.authid, key.authpw)
# replaced_by_file = []
# na = 'not_available'
# async def get_request(url, sem):
# async with sem:
# async with aiohttp.ClientSession() as session:
# async with session.get(url, auth=request_auth) as resp:
# return await resp.json()
# async def get_data(accession, sem):
# url = 'https://www.encodeproject.org/{}'.format(accession)
# result = await get_request(url, sem)
# if result.get('code', False) == 404:
# item_data = {'searched_accession': accession,
# 'redirected_to_accession': 'no_result'}
# replaced_by_file.append(item_data)
# else:
# sub_by = result.get('submitted_by', {})
# if isinstance(sub_by, str):
# submitted_by = sub_by
# else:
# submitted_by = sub_by.get('title', na)
# lab = result.get('lab', {})
# if isinstance(lab, str):
# lab_name = lab
# else:
# lab_name = lab.get('name', na)
# item_data = {'accession': result['accession'],
# 'submitted_by': submitted_by,
# 'derived_from': result.get('derived_from', na),
# 'superseded_by': result.get('superseded_by', na),
# 'supersedes': result.get('supersedes', na),
# '@id': result['@id'],
# 'alternate_accessions': result.get('alternate_accessions', na),
# 'dataset': result.get('dataset', na),
# 'lab_name': lab_name,
# 'date_created': result.get('date_created', na),
# '@type': result['@type'][0],
# 'output_type': result.get('output_type', na),
# 'file_format': result.get('file_format', na),
# 'assembly': result.get('assembly', na),
# 'paired_with': result.get('paired_with', na),
# 'paired_end': result.get('paired_end', na),
# 'file_format_type': result.get('file_format_type', na),
# 'technical_replicates': result.get('technical_replicates', na),
# 'replicate_uuid': result.get('replicate', {}).get('uuid', na),
# 'md5sum': result.get('md5sum', na),
# 'content_md5sum': result.get('content_md5sum', na),
# 'status': result['status'],
# 'product_id': result.get('product_id', na),
# 'culture_start_date': result.get('culture_start_date', na),
# 'biosample_type': result.get('biosample_type', na),
# 'description': result.get('description', na),
# 'treatments': result.get('treatments', na)
# }
# replaced_by_file.append(item_data)
# sem = asyncio.Semaphore(100)
# loop = asyncio.get_event_loop()
# loop.run_until_complete(asyncio.gather(*[get_data(accession, sem) for accession in accessions]));
In [ ]:
# loop = asyncio.get_event_loop()
# loop.run_until_complete(create_session(accessions, loop))
In [ ]:
In [ ]:
# # For every replaced accession:
# # Check if https://www.encodeproject.org/{accession} returns anything.
# # If so, does it match replaced file type?
# replaced_by_file = []
# na = 'not_available'
# for accession in accessions:
# url = 'https://www.encodeproject.org/{}'.format(accession)
# r = requests.get(url, auth=(key.authid, key.authpw))
# if r.status_code == 404:
# item_data = {'searched_accession': accession,
# 'redirected_to_accession': 'no_result'}
# replaced_by_file.append(item_data)
# else:
# result = r.json()
# sub_by = result.get('submitted_by', {})
# if isinstance(sub_by, str):
# submitted_by = sub_by
# else:
# submitted_by = sub_by.get('title', na)
# lab = result.get('lab', {})
# if isinstance(lab, str):
# lab_name = lab
# else:
# lab_name = lab.get('name', na)
# item_data = {'accession': result['accession'],
# 'submitted_by': submitted_by,
# 'derived_from': result.get('derived_from', na),
# 'superseded_by': result.get('superseded_by', na),
# 'supersedes': result.get('supersedes', na),
# '@id': result['@id'],
# 'alternate_accessions': result.get('alternate_accessions', na),
# 'dataset': result.get('dataset', na),
# 'lab_name': lab_name,
# 'date_created': result.get('date_created', na),
# '@type': result['@type'][0],
# 'output_type': result.get('output_type', na),
# 'file_format': result.get('file_format', na),
# 'assembly': result.get('assembly', na),
# 'paired_with': result.get('paired_with', na),
# 'paired_end': result.get('paired_end', na),
# 'file_format_type': result.get('file_format_type', na),
# 'technical_replicates': result.get('technical_replicates', na),
# 'replicate_uuid': result.get('replicate', {}).get('uuid', na),
# 'md5sum': result.get('md5sum', na),
# 'content_md5sum': result.get('content_md5sum', na),
# 'status': result['status'],
# 'product_id': result.get('product_id', na),
# 'culture_start_date': result.get('culture_start_date', na),
# 'biosample_type': result.get('biosample_type', na),
# 'description': result.get('description', na),
# 'treatments': result.get('treatments', na)
# }
# replaced_by_file.append(item_data)
In [ ]:
len(accessions)
In [ ]:
len(replaced_by_file)
In [ ]:
rbf = pd.DataFrame(replaced_by_file)
rbf = rbf.fillna('is_null')
In [ ]:
rbf.lab_name = rbf.lab_name.apply(lambda x: parse_lab_name(x))
rbf.loc[rbf.assembly.apply(lambda x: len(x) == 0), 'assembly'] = 'empty_list'
rbf.loc[rbf.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
rbf.loc[rbf.superseded_by.apply(lambda x: len(x) == 0), 'superseded_by'] = 'empty_list'
rbf.loc[rbf.supersedes.apply(lambda x: len(x) == 0), 'supersedes'] = 'empty_list'
rbf.loc[rbf.derived_from.apply(lambda x: len(x) == 0), 'derived_from'] = 'empty_list'
In [ ]:
df = pd.read_excel('replaced_items_no_redirect_06_12_2017.xlsx')
In [ ]:
df.shape
In [ ]:
df['@type'].value_counts()
In [ ]:
dff = df[df['@type'] == 'File'].copy()
dff.dataset
In [ ]:
def get_assay_type(experiment):
url = 'https://www.encodeproject.org{}?format=json'.format(experiment)
r = requests.get(url, auth=(key.authid, key.authpw))
result = r.json()
return result.get('assay_term_name', 'na')
def get_lab_name(experiment):
    url = 'https://www.encodeproject.org{}?format=json'.format(experiment)
r = requests.get(url, auth=(key.authid, key.authpw))
result = r.json()
return result.get('lab', {}).get('name', 'na')
In [ ]:
dff.dataset
In [ ]:
dff['assay_type'] = dff.dataset.apply(lambda x: get_assay_type(x))
In [ ]:
dff.assay_type.value_counts()
In [ ]:
dff['experiment_lab'] = dff.dataset.apply(lambda x: get_lab_name(x))
In [ ]:
#rbf.to_csv("replaced_by_search.tsv", sep="\t")
In [ ]:
no_redirect_accessions = rd[rd.accession.isin(rbf[rbf.redirected_to_accession == "no_result"].searched_accession.values)]
no_redirect_accessions = no_redirect_accessions.sort_values('@type').reset_index(drop=True)
no_redirect_accessions.loc[no_redirect_accessions.description.apply(lambda x: len(x) == 0), 'description'] = 'empty_string'
In [ ]:
no_redirect_accessions['status'].value_counts()
In [ ]:
no_redirect_accessions.content_md5sum.value_counts()
In [ ]:
no_redirect_accessions.description.value_counts()
In [ ]:
no_redirect_accessions.lab_name.value_counts()
In [ ]:
no_redirect_accessions['@type'].value_counts()
In [ ]:
no_redirect_accessions[no_redirect_accessions.md5sum != "not_available"].accession.unique()
In [ ]:
len(no_redirect_accessions[no_redirect_accessions.md5sum != "not_available"].accession.unique())
In [ ]:
len(no_redirect_accessions[no_redirect_accessions.md5sum == 'not_available'].accession.unique())
In [ ]:
#.to_excel('replaced_items_no_redirect_06_12_2017.xlsx')
In [ ]:
# possible_replacements = defaultdict(list)
# for md5 in no_redirect_accessions.md5sum.unique()[1:]:
# url = 'https://www.encodeproject.org/search/'\
# '?type=Item&md5sum={}&status%21=replaced'\
# '&frame=embedded&limit=all&format=json'.format(md5)
# r = requests.get(url, auth=(key.authid, key.authpw))
# if (r.status_code == 404) or (len(r.json()['@graph']) == 0):
# item_data = {'md5sum': md5,
# 'accession': 'no_result'}
# possible_replacements[md5].append(item_data)
# else:
# results = r.json()['@graph']
# for result in results:
# lab = result.get('lab', {})
# if isinstance(lab, str):
# lab_name = lab
# else:
# lab_name = lab.get('name', na)
# possible_replacements[md5].append({'accession': result['accession'],
# '@id': result['@id'],
# 'alternate_accessions': result.get('alternate_accessions', na),
# 'dataset': result.get('dataset', na),
# 'lab_name': lab_name,
# 'date_created': result.get('date_created', na),
# '@type': result['@type'][0],
# 'output_type': result.get('output_type', na),
# 'file_format': result.get('file_format', na),
# 'assembly': result.get('assembly', na),
# 'paired_with': result.get('paired_with', na),
# 'paired_end': result.get('paired_end', na),
# 'file_format_type': result.get('file_format_type', na),
# 'technical_replicates': result.get('technical_replicates', na),
# 'replicate_uuid': result.get('replicate', {}).get('uuid', na),
# 'md5sum': result.get('md5sum', na),
# 'content_md5sum': result.get('content_md5sum', na),
# 'status': result['status']
# })
In [ ]:
loop.close()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
In [ ]:
possible_replacements = defaultdict(list)
async def get_request(session, md5):
url = 'https://www.encodeproject.org/search/'\
'?type=Item&md5sum={}&status%21=replaced'\
'&frame=embedded&limit=all&format=json'.format(md5)
async with session.get(url, auth=request_auth) as response:
r = await response.json()
results = r['@graph']
if len(results) == 0:
item_data = {'md5sum': md5,
'accession': 'no_result'}
possible_replacements[md5].append(item_data)
else:
for result in results:
lab = result.get('lab', {})
if isinstance(lab, str):
lab_name = lab
else:
lab_name = lab.get('name', na)
possible_replacements[md5].append({'accession': result['accession'],
'@id': result['@id'],
'alternate_accessions': result.get('alternate_accessions', na),
'dataset': result.get('dataset', na),
'lab_name': lab_name,
'date_created': result.get('date_created', na),
'@type': result['@type'][0],
'output_type': result.get('output_type', na),
'file_format': result.get('file_format', na),
'assembly': result.get('assembly', na),
'paired_with': result.get('paired_with', na),
'paired_end': result.get('paired_end', na),
'file_format_type': result.get('file_format_type', na),
'technical_replicates': result.get('technical_replicates', na),
'replicate_uuid': result.get('replicate', {}).get('uuid', na),
'md5sum': result.get('md5sum', na),
'content_md5sum': result.get('content_md5sum', na),
'status': result['status']
})
async def create_session(md5s, loop):
conn = aiohttp.TCPConnector(keepalive_timeout=10, limit=100)
async with aiohttp.ClientSession(connector=conn, loop=loop) as session:
results = await asyncio.gather(*[get_request(session, md5) for md5 in md5s])
In [ ]:
loop = asyncio.get_event_loop()
loop.run_until_complete(create_session(no_redirect_accessions.md5sum.unique()[1:], loop))
In [ ]:
len(possible_replacements)
In [ ]:
possible_replacements
In [ ]:
possible_merge = [item for key, value in possible_replacements.items()
for item in value if item['accession'] != 'no_result']
In [ ]:
possible_merge = pd.DataFrame(possible_merge)
In [ ]:
possible_merge = possible_merge.rename(columns={'accession': 'possible_redirect_accession',
'status': 'possible_redirect_status'})
In [ ]:
possible_merge.loc[possible_merge.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
In [ ]:
possible_merge.shape
In [ ]:
no_matches = no_redirect_accessions[~(no_redirect_accessions.md5sum.isin(possible_merge.md5sum.values))].reset_index(drop=True)
In [ ]:
pm = possible_merge.merge(no_redirect_accessions,
how='left',
suffixes=('_new', '_old'),
on=['md5sum',
'@type',
'file_format',
'file_format_type'])[['md5sum',
'accession',
'status',
'possible_redirect_accession',
'possible_redirect_status',
'@type',
'file_format',
'file_format_type',
'assembly_old',
'assembly_new',
'dataset_old',
'dataset_new',
'date_created_old',
'date_created_new',
'lab_name_old',
'lab_name_new',
'technical_replicates_old',
'technical_replicates_new',
'@id_old',
'@id_new',
'output_type_old',
'output_type_new',
'paired_end_old',
'paired_end_new',
'paired_with_old',
'paired_with_new',
'replicate_uuid_old',
'replicate_uuid_new',
'alternate_accessions_old',
'alternate_accessions_new',
'content_md5sum_old',
'content_md5sum_new']]
pm#.to_excel('possible_redirect_accessions_for_replaced_files_06_12_2017.xlsx')
In [ ]:
no_redirect_accessions[no_redirect_accessions.accession == 'ENCFF133IYK']
In [ ]:
pm.shape
In [ ]:
len(pm.accession.unique())
In [ ]:
replacements_exact_match = pm[(pm.dataset_old == pm.dataset_new)].reset_index(drop=True)
In [ ]:
replacements_exact_match.shape
In [ ]:
replacements_exact_match[[col for col in replacements_exact_match]]
In [ ]:
replacements_different = pm[~(pm.dataset_old == pm.dataset_new)].reset_index(drop=True)
In [ ]:
replacements_different.shape
In [ ]:
replacements_different
In [ ]:
# Different datasets but same MD5. Have to update replaced file to have replacement dataset.
replacements_update_dataset = replacements_different[['@id_old', 'dataset_new']].rename(columns={'@id_old': '@id', 'dataset_new': 'dataset'})
#replacements_update_dataset.to_csv('../../update_dataset_of_replaced_filed_matching_md5_06_27_2017.tsv', index=False, sep='\t')
In [ ]:
# Now build the patch for the exact-match replacements.
In [ ]:
replacements_patch = replacements_exact_match[['possible_redirect_accession',
'accession']].rename(columns={'accession': 'alternate_accessions:array',
'possible_redirect_accession': 'accession'})
In [ ]:
replacements_patch = replacements_patch.sort_values("alternate_accessions:array")
In [ ]:
replacements_patch.shape
In [ ]:
flat_list_patch = []
for accession in replacements_patch.accession.unique():
data = {'accession': accession,
'alternate_accessions:array': ", ".join(replacements_patch[replacements_patch.accession == accession]\
['alternate_accessions:array'].values)}
flat_list_patch.append(data)
In [ ]:
replacements_patch_flat_list = pd.DataFrame(flat_list_patch)
In [ ]:
#replacements_patch_flat_list.to_csv('../../replaced_with_matching_replacements_patch_06_27_2017.tsv', sep="\t", index=False)
In [ ]:
#replacements_different.sort_values('possible_redirect_accession').to_excel('replaced_same_md5_mismatched_dataset_06_14_2017.xlsx', index=False)
In [ ]:
no_matching_md5_replacements = [item['md5sum'] for key, value in possible_replacements.items()
for item in value if item['accession'] == 'no_result']
In [ ]:
len(pd.DataFrame(list(set(no_matching_md5_replacements))).rename(columns={0: 'md5sum'}).merge(no_redirect_accessions,
how='left',
on='md5sum')['accession'].unique())
In [ ]:
no_redirect_file = no_redirect_accessions[no_redirect_accessions['@type'] == 'File'].reset_index(drop=True)
In [ ]:
no_redirect_file
In [ ]:
na = 'not_available'
possible_replacements = defaultdict(list)
async def get_request_two(session, url, r):
async with session.get(url, auth=request_auth) as response_two:
result_one = await response_two.json()
search_results = result_one['@graph']
if len(search_results) == 0:
possible_replacements[r['accession']].append({'accession': r['accession'],
'possible_replacement_accession': 'no_result'})
for result in search_results:
lab = result.get('lab', {})
sub_by = result.get('submitted_by', {})
if isinstance(sub_by, str):
submitted_by = sub_by
else:
submitted_by = sub_by.get('title', na)
if isinstance(lab, str):
lab_name = lab
else:
lab_name = lab.get('name', na)
possible_replacements[r['accession']].append({'accession': r['accession'],
'possible_replacement_accession': result['accession'],
'@id': result['@id'],
'alternate_accessions': result.get('alternate_accessions', na),
'dataset': result.get('dataset', na),
'lab_name': lab_name,
'date_created': result.get('date_created', na),
'@type': result['@type'][0],
'output_type': result.get('output_type', na),
'file_format': result.get('file_format', na),
'assembly': result.get('assembly', na),
'paired_with': result.get('paired_with', na),
'paired_end': result.get('paired_end', na),
'file_format_type': result.get('file_format_type', na),
'technical_replicates': result.get('technical_replicates', na),
'replicate_uuid': result.get('replicate', {}).get('uuid', na),
'md5sum': result.get('md5sum', na),
'content_md5sum': result.get('content_md5sum', na),
'status': result['status'],
'submitted_by': submitted_by,
'derived_from': result.get('derived_from', na),
'superseded_by': result.get('superseded_by', na),
'supersedes': result.get('supersedes', na)
})
async def get_request_one(session, file_id):
url = 'https://www.encodeproject.org/{}/?format=json'.format(file_id)
async with session.get(url, auth=request_auth) as response_one:
result_one = await response_one.json()
r = result_one
file_format = r['file_format']
output_type = r['output_type']
dataset = r['dataset']
assembly = r.get('assembly', '*')
try:
assay_term_name = r['quality_metrics'][0]['assay_term_name']
url = 'https://www.encodeproject.org/search/?type=File&file_format={}'\
'&output_type={}&quality_metrics.assay_term_name={}'\
'&dataset={}&assembly={}&format=json&frame=embedded'\
'&status!=replaced'.format(file_format,
output_type,
assay_term_name,
dataset,
assembly)
except IndexError:
url = 'https://www.encodeproject.org/search/?type=File&file_format={}'\
'&output_type={}&dataset={}&assembly={}&format=json&frame=embedded'\
'&status!=replaced'.format(file_format,
output_type,
dataset,
assembly)
if assembly == '*':
url = url.replace('&assembly=*', '&assembly!=*')
result_two = await get_request_two(session, url, r)
async def create_session(file_ids, loop):
conn = aiohttp.TCPConnector(keepalive_timeout=10, limit=100)
async with aiohttp.ClientSession(connector=conn, loop=loop) as session:
results = await asyncio.gather(*[get_request_one(session, file_id) for file_id in file_ids])
In [ ]:
loop = asyncio.get_event_loop()
loop.run_until_complete(create_session(no_redirect_file['@id'].unique(), loop))
In [ ]:
len(possible_replacements)
In [ ]:
possible_replacements
In [ ]:
replacement_search = pd.DataFrame([item for key, value in possible_replacements.items() for item in value])
replacement_search = replacement_search.fillna('isnull')
replacement_search.loc[replacement_search.alternate_accessions.apply(lambda x: len(x) == 0), 'alternate_accessions'] = 'empty_list'
replacement_search.loc[replacement_search.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
replacement_search.loc[replacement_search.superseded_by.apply(lambda x: len(x) == 0), 'superseded_by'] = 'empty_list'
replacement_search.loc[replacement_search.supersedes.apply(lambda x: len(x) == 0), 'supersedes'] = 'empty_list'
replacement_search.loc[replacement_search.derived_from.apply(lambda x: len(x) == 0), 'derived_from'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.alternate_accessions.apply(lambda x: len(x) == 0), 'alternate_accessions'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.technical_replicates.apply(lambda x: len(x) == 0), 'technical_replicates'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.superseded_by.apply(lambda x: len(x) == 0), 'superseded_by'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.supersedes.apply(lambda x: len(x) == 0), 'supersedes'] = 'empty_list'
no_redirect_file.loc[no_redirect_file.derived_from.apply(lambda x: len(x) == 0), 'derived_from'] = 'empty_list'
In [ ]:
rsm = replacement_search.merge(no_redirect_file,
how='left',
suffixes=('_new', '_old'),
on=['accession'])
In [ ]:
rsm.shape
In [ ]:
rsm = rsm[~(rsm.status_new.isin(['revoked', 'deleted']))]
In [ ]:
# Extract a lookup table from the rows that have exactly one candidate result.
# If a derived_from File doesn't redirect, look up its possible replacement here
# and use that as the fill-in value for the comparison.
dfl = rsm[(rsm.possible_replacement_accession != 'no_result')
& (rsm.technical_replicates_old == rsm.technical_replicates_new)].drop_duplicates('accession',
keep=False).reset_index(drop=True)
dfl.shape
In [ ]:
rsm[(rsm.possible_replacement_accession != 'no_result')
& (rsm.technical_replicates_old == rsm.technical_replicates_new)].drop_duplicates('accession',
keep=False).reset_index(drop=True).shape
In [ ]:
# Built from previous iterations: matching_rep is defined in cells further below.
derived_from_lookup = pd.concat([dfl, matching_rep.drop_duplicates('accession', keep=False)], axis=0).drop_duplicates('accession').reset_index(drop=True)
In [ ]:
len(derived_from_lookup.accession.unique())
In [ ]:
def get_json(id):
url = 'https://www.encodeproject.org/{}/?format=json'.format(id)
return requests.get(url, auth=(key.authid, key.authpw))
def parse_derived_from(x):
if len(x) == 0 or x == 'not_available':
return x
new_list = []
for y in x:
y_id = y.split('/')[2]
if y_id.startswith('ENC'):
new_list.append(y)
continue
else:
r = get_json(y)
try:
accession = r.json()['accession']
r = get_json(accession)
if r.status_code == 404:
# Pull from local lookup table.
try:
accession_replacement = derived_from_lookup[derived_from_lookup.accession == accession]\
.possible_replacement_accession.values[0]
new_list.append('/files/{}/'.format(accession_replacement))
# If no results returned from one-result table.
except IndexError:
new_list.append(y)
else:
accession_replacement = r.json()['accession']
new_list.append('/files/{}/'.format(accession_replacement))
except KeyError:
print(y)
print(x)
new_list.append(y)
return new_list
In [ ]:
rsm_derived_from_old = rsm.derived_from_old.apply(lambda x: parse_derived_from(x))
In [ ]:
rsm.derived_from_old = rsm_derived_from_old
In [ ]:
rsm
In [ ]:
rsm[~(rsm['@id_old'].isin(['/files/d9e23f37-9b33-41b9-b9df-0700ca87bc75/',
'/files/3efeced1-a3c5-4131-a721-7c5f743350a9/',
'/files/9fe192e9-af81-46f5-a16f-4d6b5cda577c/'])) & (rsm.supersedes_new != 'not_available')][cols]
In [ ]:
In [ ]:
# parse_list sorts and comma-joins either a list of strings or, for the sentinel
# strings, their characters; lazy_dict maps the scrambled sentinels back to their
# originals so they survive the same transformation.
lazy_dict = {'_,e,i,l,m,p,s,t,t,y': 'empty_list',
             'i,l,l,n,s,u': 'isnull',
             '_,a,a,a,b,e,i,l,l,n,o,t,v': 'not_available'}
def parse_list(x):
    return ','.join([y.strip() for y in sorted(x)])
rsm.date_created_old = rsm.date_created_old.apply(lambda x: pd.to_datetime(x))
for field in ['technical_replicates_old',
'technical_replicates_new',
'superseded_by_old',
'superseded_by_new',
'supersedes_old',
'supersedes_new',
'derived_from_old',
'derived_from_new']:
rsm[field] = rsm[field].apply(lambda x: parse_list(x)).apply(lambda x: lazy_dict[x] if x in lazy_dict.keys() else x)
In [ ]:
rsm[rsm.technical_replicates_old != rsm.technical_replicates_new][['technical_replicates_old',
'technical_replicates_new']]
In [ ]:
rsm[rsm.accession == 'ENCFF721IVN'][cols]
In [ ]:
rsm[rsm.derived_from_old != rsm.derived_from_new][['derived_from_old', 'derived_from_new']]
In [ ]:
rsm_patch = rsm[(rsm.content_md5sum_old == rsm.content_md5sum_new)
& (rsm.content_md5sum_old != 'not_available')].reset_index(drop=True)
In [ ]:
first_cols = ['accession', 'possible_replacement_accession']
cols = first_cols + [col for col in sorted(rsm_patch.columns, reverse=True) if col not in first_cols]
In [ ]:
rsm_patch[cols]
In [ ]:
#rsm_patch[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession', 'accession': 'alternate_accessions:list'}).to_csv('../../matching_content_md5sum_patch_06_29_2017.tsv', sep='\t', index=False)
In [ ]:
rsm = rsm[~(rsm.accession.isin(rsm_patch.accession.values))].reset_index(drop=True)
In [ ]:
len(rsm.accession.unique())
In [ ]:
rsm_zero_result = rsm[rsm.possible_replacement_accession == 'no_result'].reset_index(drop=True)
In [ ]:
len(rsm_zero_result.accession.unique())
In [ ]:
rsm_zero_result.submitted_by_old.value_counts()
In [ ]:
rsm_zero_result[cols]
In [ ]:
# Set these to deleted because the conservative IDR output no longer exists.
#rsm_zero_result.loc[rsm_zero_result.submitted_by_old == 'J. Seth Strattan', 'status_old'] = 'deleted'
#rsm_zero_result[rsm_zero_result.submitted_by_old == 'J. Seth Strattan'][['@id_old', 'status_old']].rename(columns={'status_old': 'status', '@id_old': '@id'}).to_csv('../../zero_match_replaced_to_deleted_patch_06_28_2017.tsv', sep='\t', index=False)
In [ ]:
rsm_zero_result.superseded_by_old.value_counts()
In [ ]:
rsm_one_result = rsm[rsm.possible_replacement_accession != 'no_result'].drop_duplicates('accession',
keep=False).reset_index(drop=True)
In [ ]:
len(rsm_one_result)
In [ ]:
rsm_one_result.submitted_by_old.value_counts()
In [ ]:
rsm_one_result = rsm_one_result[cols]
In [ ]:
rsm_one_result[rsm_one_result.submitted_by_old == "Diane Trout"]
In [ ]:
rsm_one_result.superseded_by_old.value_counts()
In [ ]:
rsm_one_result.supersedes_old.value_counts()
In [ ]:
rsm_one_result.superseded_by_new.value_counts()
In [ ]:
#rsm_one_result.supersedes_new.value_counts()
In [ ]:
rsm_one_result[(rsm_one_result.superseded_by_old != 'empty_list')][cols]
In [ ]:
rsm_one_result_patch = rsm_one_result[(rsm_one_result.superseded_by_old != 'empty_list')].reset_index(drop=True)
In [ ]:
rsm_one_result_patch[['accession', 'superseded_by_old']]
In [ ]:
rsm_one_result = rsm_one_result[~(rsm_one_result.accession.isin(rsm_one_result_patch.accession.values))].reset_index(drop=True)
In [ ]:
rsm_one_result.shape
In [ ]:
rsm_one_result[rsm_one_result.derived_from_old != rsm_one_result.derived_from_new][cols].submitted_by_old.value_counts() #[['derived_from_old', 'derived_from_new']].values
In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old != rsm_one_result.derived_from_new)
& (rsm_one_result.submitted_by_old == 'Anna Vlasova')][cols][['accession', 'possible_replacement_accession', 'derived_from_old', 'derived_from_new']]
In [ ]:
rsm[(rsm['@type_old'] == 'File')]['@id_old'].unique()
In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old == rsm_one_result.derived_from_new)
& (rsm_one_result.derived_from_old != 'not_available')].shape
In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old == rsm_one_result.derived_from_new)
& (rsm_one_result.derived_from_old == 'not_available')]
In [ ]:
# Patch one of Diane's that has missing derived_from but otherwise equal
# dp = rsm_one_result[(rsm_one_result.derived_from_old == rsm_one_result.derived_from_new)
# & (rsm_one_result.derived_from_old == 'not_available')]
# dp[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
# 'accession': 'alternate_accessions:list'}).to_csv('../../one_match_missing_derived_from_patch_06_28_2017.tsv', sep='\t', index=False)
In [ ]:
# Patch 58 narrowPeaks with one match after dropping revoked/deleted from possible replacements
# rsm_one_result[['possible_replacement_accession',
# 'accession']].rename(columns={'possible_replacement_accession': 'accession',
# 'accession': 'alternate_accessions:list'}).to_csv('../../one_match_after_dropping_deleted_revoked_patch_06_30_2017.tsv', sep='\t', index=False)
In [ ]:
In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old != rsm_one_result.derived_from_new)
& (rsm_one_result.derived_from_old == 'not_available')].shape
In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old != rsm_one_result.derived_from_new)
& (rsm_one_result.derived_from_new == 'not_available')]
In [ ]:
rsm_one_result[(rsm_one_result.derived_from_old == 'not_available')
| (rsm_one_result.derived_from_new == 'not_available')]
In [ ]:
rsm_one_result[rsm_one_result.derived_from_old != rsm_one_result.derived_from_new].shape
In [ ]:
rsm_one_result[rsm_one_result.derived_from_old == rsm_one_result.derived_from_new].shape
In [ ]:
rsm_one_result_full_match = rsm_one_result[(rsm_one_result.derived_from_old == rsm_one_result.derived_from_new)
& (rsm_one_result.derived_from_old != 'not_available')][cols].reset_index(drop=True)
In [ ]:
rsm_one_result_full_match
In [ ]:
len(rsm_one_result_full_match.possible_replacement_accession.unique())
In [ ]:
rsm_one_result_full_match[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
'accession': 'alternate_accessions:list'})
In [ ]:
rsm_one_result_no_match = rsm_one_result[~(rsm_one_result.accession.isin(rsm_one_result_full_match.accession.values))][cols].reset_index(drop=True)
In [ ]:
rsm_one_result_no_match.shape
In [ ]:
rsm_one_result_no_match
In [ ]:
rsm_one_result_no_match.file_format_type_new.value_counts()
In [ ]:
rsm_one_result_no_match[rsm_one_result_no_match.file_format_type_new == "not_available"]
In [ ]:
rsm_one_result_no_match[['derived_from_new', 'derived_from_old']].values
In [ ]:
rsm_one_result_no_match[rsm_one_result_no_match.submitted_by_old == 'J. Seth Strattan']
In [ ]:
# Patch these narrowPeaks that match except for derived_from because upstream Files changed.
sp = rsm_one_result_no_match[rsm_one_result_no_match.submitted_by_old == 'J. Seth Strattan'][['possible_replacement_accession', 'accession']]
sp.rename(columns={'possible_replacement_accession': 'accession',
'accession': 'alternate_accessions:list'})#.to_csv('../../one_match_derived_from_mismatch_patch_06_28_2017.tsv', index=False, sep='\t')
In [ ]:
rsm_multi_result = rsm[rsm.duplicated('accession', keep=False)].reset_index(drop=True)
In [ ]:
len(rsm_multi_result.accession.unique())
In [ ]:
rsm_multi_result.drop_duplicates('accession', keep='first').reset_index().submitted_by_old.value_counts()
In [ ]:
rsm_multi_result[rsm_multi_result.accession == 'ENCFF719FSK']
In [ ]:
assert len(rsm_zero_result) + len(rsm_one_result) + len(rsm_one_result_patch) + len(rsm_multi_result.accession.unique()) == len(rsm.accession.unique())
In [ ]:
matching_rep = rsm_multi_result[(rsm_multi_result.technical_replicates_old == rsm_multi_result.technical_replicates_new)
& (rsm_multi_result.derived_from_old == rsm_multi_result.derived_from_new)].reset_index(drop=True)
In [ ]:
len(matching_rep.accession.unique())
In [ ]:
len(matching_rep.drop_duplicates('accession', keep=False).accession.unique())
In [ ]:
rsm_multi_one_result = matching_rep.drop_duplicates('accession', keep=False)[cols].reset_index(drop=True)
In [ ]:
rsm_multi_one_result[cols]
In [ ]:
# rsm_multi_one_result[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
# 'accession': 'alternate_accessions:list'}).to_csv('../../multi_one_match_patch_06_27_2017.tsv',
# index=False, sep='\t')
In [ ]:
# Patch multiresults that have one match when matched on tech_rep (only narrowPeaks)
# multi_one_narrow_peaks = rsm_multi_result[(rsm_multi_result.technical_replicates_old == rsm_multi_result.technical_replicates_new)
# & (rsm_multi_result.file_format_type_old == 'narrowPeak')].drop_duplicates('accession', keep=False).reset_index(drop=True)
# multi_one_narrow_peaks[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
# 'accession': 'alternate_accessions:list'}).to_csv('../../multi_narrow_peaks_tech_rep_match_patch_06_30_2017.tsv', sep='\t', index=False)
In [ ]:
len(matching_rep[matching_rep.duplicated('accession', keep=False)].accession.unique())
In [ ]:
cols = ['accession','possible_replacement_accession']
cols = cols + [x for x in sorted(rsm.columns, reverse=True) if (x not in cols) and (x not in ['alternate_accessions_new',
'alternate_accessions_old'])]
mr = matching_rep[matching_rep.duplicated('accession', keep=False)].groupby(cols).count().reset_index()
matching_rep[matching_rep.duplicated('accession', keep=False)].groupby(cols).count()
In [ ]:
# # Patch pointing to in progress replacement instead of deleted replacement.
# in_prog_multi_patch = mr[(mr.status_new == 'in progress')
# & (mr.accession.isin(['ENCFF219IZI',
# 'ENCFF362CIL',
# 'ENCFF522EVZ',
# 'ENCFF526SQT',
# 'ENCFF554QRY',
# 'ENCFF799OIZ',
# 'ENCFF826MUG',
# 'ENCFF832XOD',
# 'ENCFF833LEK']))]
# # in_prog_multi_patch[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
# # 'accession': 'alternate_accessions:list'})\
# # .to_csv('../../multi_result_point_to_in_progress_patch_06_28_2017.tsv', index=False, sep='\t')
# in_prog_multi_patch
In [ ]:
# # Patch pointing to released replacement instead of revoked replacement.
# released_multi_patch = mr[(mr.status_new == 'released')
# & (mr.accession.isin(['ENCFF311CTD',
# 'ENCFF442FSP',
# 'ENCFF521DYG',
# 'ENCFF660PBO',
# 'ENCFF723DLE',
# 'ENCFF758WLI',
# 'ENCFF803YCX',
# 'ENCFF809POG']))]
# # released_multi_patch[['possible_replacement_accession', 'accession']].rename(columns={'possible_replacement_accession': 'accession',
# # 'accession': 'alternate_accessions:list'})\
# # .to_csv('../../multi_result_point_to_released_patch_06_28_2017.tsv', index=False, sep='\t')
# released_multi_patch
In [ ]:
# # Patch these as deleted because merged fasta that was never released
# mr.loc[mr.submitted_by_old == 'Xintao Wei', 'status_old'] = 'deleted'
# mr[mr.submitted_by_old == 'Xintao Wei'].drop_duplicates('accession')[['@id_old', 'status_old']].rename(columns={'status_old': 'status', '@id_old': '@id'}).to_csv('../../two_match_to_deleted_patch_06_29_2017.tsv', sep='\t', index=False)
In [ ]:
no_matching_rep = rsm_multi_result[~(rsm_multi_result.accession.isin(matching_rep.accession.unique()))].reset_index(drop=True)
In [ ]:
len(no_matching_rep.accession.unique())
In [ ]:
# Note: multi_tech_match is defined a few cells below; like derived_from_lookup above,
# this cell relies on a previous pass through the notebook.
no_matching_rep[~(no_matching_rep.accession.isin(multi_tech_match.accession)) & (no_matching_rep.submitted_by_old == "J. Seth Strattan")]['@id_old'].unique()
In [ ]:
len(no_matching_rep[(no_matching_rep.technical_replicates_old == no_matching_rep.technical_replicates_new)
& (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].accession.unique())
In [ ]:
no_matching_rep[(no_matching_rep.technical_replicates_old == no_matching_rep.technical_replicates_new)
& (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].drop_duplicates('accession', keep=False)
In [ ]:
multi_tech_match = no_matching_rep[(no_matching_rep.technical_replicates_old == no_matching_rep.technical_replicates_new)
& (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)]
no_matching_rep[(no_matching_rep.technical_replicates_old == no_matching_rep.technical_replicates_new)
& (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].groupby(cols).count()
In [ ]:
multi_tech_match.superseded_by_old.value_counts()
In [ ]:
multi_tech_match[multi_tech_match.superseded_by_old == 'empty_list'][cols]
In [ ]:
multi_tech_match[multi_tech_match.supersedes_new != 'not_available'][cols]
In [ ]:
multi_tech_match.supersedes_old.value_counts()
In [ ]:
multi_tech_one_match = multi_tech_match.drop_duplicates('accession', keep=False)
len(multi_tech_match.drop_duplicates('accession', keep=False).accession.unique())
In [ ]:
multi_tech_one_match.submitted_by_old.value_counts()
In [ ]:
pd.crosstab(multi_tech_one_match.output_type_old, multi_tech_one_match.submitted_by_old, margins=False)
In [ ]:
multi_tech_one_match
In [ ]:
# Delete because no matching derived_from
#multi_tech_one_match[['@id_old', 'status_old']].rename(columns={'@id_old': '@id', 'status_old': 'status'}).to_csv('../../no_matching_derived_from_delete_patch_07_03_2017.tsv', index=False, sep='\t')
In [ ]:
multi_tech_one_match.file_format_old.value_counts()
In [ ]:
multi_tech_one_match[(multi_tech_one_match.output_type_old != 'alignments')][cols]
In [ ]:
multi_tech_one_match[(multi_tech_one_match.submitted_by_old == 'Xintao Wei')
& (multi_tech_one_match.output_type_old != 'alignments')][cols]#[['@id_old', 'possible_replacement_accession']].values
In [ ]:
multi_tech_one_match.groupby(cols).count()
In [ ]:
multi_tech_one_match.file_format_type_old.value_counts()
In [ ]:
multi_tech_one_match[multi_tech_one_match.submitted_by_old == "Jean Davidson"][cols]
In [ ]:
len(multi_tech_match[multi_tech_match.duplicated('accession', keep=False)].accession.unique())
In [ ]:
mtm = multi_tech_match[multi_tech_match.duplicated('accession', keep=False)]
mtm.groupby(cols).count()
In [ ]:
mtm[mtm.submitted_by_old == 'Jean Davidson'].groupby(cols).count()
In [ ]:
mtm[mtm.submitted_by_old == 'J. Seth Strattan'].groupby(cols).count()
In [ ]:
no_matching_rep[(no_matching_rep.technical_replicates_old != no_matching_rep.technical_replicates_new)
& (no_matching_rep.derived_from_old == no_matching_rep.derived_from_new)].shape
In [ ]:
no_matching_rep[(no_matching_rep.technical_replicates_old != no_matching_rep.technical_replicates_new)
& (no_matching_rep.derived_from_old == no_matching_rep.derived_from_new)].groupby(cols).count()
In [ ]:
len(no_matching_rep[(no_matching_rep.technical_replicates_old != no_matching_rep.technical_replicates_new)
& (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].accession.unique())
In [ ]:
no_matching_rep[(no_matching_rep.technical_replicates_old != no_matching_rep.technical_replicates_new)
& (no_matching_rep.derived_from_old != no_matching_rep.derived_from_new)].groupby(cols).count()
In [ ]:
In [ ]:
cols = ['accession','possible_replacement_accession']
cols = cols + [x for x in sorted(matching_rep.columns, reverse=True) if (x not in cols) and (x not in ['alternate_accessions_new',
'alternate_accessions_old'])]
no_matching_rep.groupby(cols).count()
In [ ]:
mis_matching_rep = rsm_multi_result[~(rsm_multi_result.accession.isin(matching_rep.accession))].reset_index(drop=True)
In [ ]:
len(mis_matching_rep.accession.unique())
In [ ]:
mis_matching_rep[['technical_replicates_old','technical_replicates_new', 'derived_from_old', 'derived_from_new']]
In [ ]:
replacement_patch = pd.concat([rsm_patch,
rsm_one_result_full_match,
rsm_multi_one_result])
In [ ]:
# Squash list for patching.
patch_list = []
for replacement_accession in replacement_patch.possible_replacement_accession.unique():
values = replacement_patch[replacement_patch.possible_replacement_accession == replacement_accession]['accession']
accession_list = []
for val in values:
accession_list.append(val)
patch_list.append({'accession': replacement_accession,
'alternate_accessions:array': ', '.join(accession_list)})
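In [ ]:
# The same squash expressed with groupby/agg (sketch; assumes replacement_patch as built above).
patch_list_alt = (replacement_patch
                  .groupby('possible_replacement_accession')['accession']
                  .agg(', '.join)
                  .reset_index()
                  .rename(columns={'possible_replacement_accession': 'accession',
                                   'accession': 'alternate_accessions:array'}))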
In [ ]:
patch_data = pd.DataFrame(patch_list)
In [ ]:
#patch_data.to_csv("replaced_with_matching_replacements_patch_06_21_2017.tsv", sep="\t", index=False)
In [ ]:
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[14, 8])
sns.set_style('whitegrid')
sns.stripplot(x='date_created_old',
data=rsm[rsm.possible_replacement_accession == 'no_result'],
size=10,
color='black',
alpha=0.8)
In [ ]:
biosamples = no_redirect_accessions[no_redirect_accessions['@type'] == 'Biosample']
In [ ]:
biosamples.submitted_by.value_counts()
In [ ]:
na = 'not_available'
possible_biosample_replacements = defaultdict(list)
for biosample_id in biosamples['@id'].unique():
r = requests.get('https://www.encodeproject.org/{}/?format=json'.format(biosample_id),
auth=(key.authid, key.authpw))
r = r.json()
lab_old = r.get('lab', {})
if isinstance(lab_old, str):
lab_name_old = lab_old
else:
lab_name_old = lab_old.get('name', na)
donor_old = r.get('donor', {})
if isinstance(donor_old, str):
donor_name_old = donor_old
else:
donor_name_old = donor_old.get('@id', na)
sub_by_old = r.get('submitted_by', {})
if isinstance(sub_by_old, str):
submitted_by_old = sub_by_old
else:
submitted_by_old = sub_by_old.get('title', na)
try:
product_id = r['product_id']
health_status = r['health_status']
culture_start_date = r['culture_start_date']
url = 'https://www.encodeproject.org/search/'\
'?type=Biosample&product_id={}'\
'&health_status={}&culture_start_date={}'\
'&status%21=replaced&format=json&frame=embedded'.format(product_id,
health_status,
culture_start_date)
except KeyError:
description = r['description']
url = 'https://www.encodeproject.org/search/'\
'?type=Biosample&description={}'\
'&status%21=replaced&format=json&frame=embedded'.format(description)
search_results = requests.get(url, auth=(key.authid, key.authpw))
search_results = search_results.json()['@graph']
if len(search_results) == 0:
possible_biosample_replacements[r['accession']].append({'accession': r['accession'],
'possible_replacement_accession': 'no_result'})
for result in search_results:
lab_new = result.get('lab', {})
if isinstance(lab_new, str):
lab_name_new = lab_new
else:
lab_name_new = lab_new.get('name', na)
donor_new = result.get('donor', {})
if isinstance(donor_new, str):
donor_name_new = donor_new
else:
donor_name_new = donor_new.get('@id', na)
sub_by_new = result.get('submitted_by', {})
if isinstance(sub_by_new, str):
submitted_by_new = sub_by_new
else:
submitted_by_new = sub_by_new.get('title', na)
possible_biosample_replacements[r['accession']].append({'accession': r['accession'],
'possible_replacement_accession': result['accession'],
'@id_old': r['@id'],
'@id_new': result['@id'],
                                                                'alternate_accessions_old': r.get('alternate_accessions', na),
                                                                'alternate_accessions_new': result.get('alternate_accessions', na),
'donor_old': donor_name_old,
'donor_new': donor_name_new,
'lab_name_old': lab_name_old,
'lab_name_new': lab_name_new,
'date_created_old': r.get('date_created', na),
'date_created_new': result.get('date_created', na),
'@type_old': r['@type'][0],
'@type_new': result['@type'][0],
'status_old': r['status'],
'status_new': result['status'],
'product_id_old': r.get('product_id', na),
'product_id_new': result.get('product_id', na),
'health_status_old': r.get('health_status', na),
'health_status_new': result.get('health_status', na),
'culture_start_date_old': r.get('culture_start_date', na),
'culture_start_date_new': result.get('culture_start_date', na),
'biosample_type_old': r['biosample_type'],
'biosample_type_new': result['biosample_type'],
'treatment_old': r['treatments'],
'treatment_new': result['treatments'],
'biosample_term_name_old': r['biosample_term_name'],
'biosample_term_name_new': result['biosample_term_name'],
'summary_old': r['summary'],
'summary_new': result['summary'],
'description_old': r['description'],
'description_new': result['description'],
'pooled_from_old': r.get('pooled_from', na),
'pooled_from_new': result.get('pooled_from', na),
'part_of_old': r.get('part_of', na),
'part_of_new': result.get('part_of', na),
'culture_harvest_date_old': r.get('culture_harvest_date', na),
'culture_harvest_date_new': result.get('culture_harvest_date', na),
'passage_number_old': r.get('passage_number', na),
'passage_number_new': result.get('passage_number', na),
'lot_id_old': r.get('lot_id', na),
'lot_id_new': result.get('lot_id', na),
'submitted_by_old': submitted_by_old,
'submitted_by_new': submitted_by_new
})
In [ ]:
len(possible_biosample_replacements)
In [ ]:
possible_biosample_replacements
In [ ]:
replacement_search = pd.DataFrame([item for key, value in possible_biosample_replacements.items() for item in value])
replacement_search = replacement_search.fillna('isnull')
replacement_search.loc[replacement_search.alternate_accessions_old.apply(lambda x: len(x) == 0), 'alternate_accessions_old'] = 'empty_list'
replacement_search.loc[replacement_search.alternate_accessions_new.apply(lambda x: len(x) == 0), 'alternate_accessions_new'] = 'empty_list'
#replacement_search.loc[replacement_search.pooled_from_old.apply(lambda x: len(x) == 0), 'pooled_from_old'] = 'empty_list'
#replacement_search.loc[replacement_search.pooled_from_new.apply(lambda x: len(x) == 0), 'pooled_from_new'] = 'empty_list'
In [ ]:
replacement_search.shape
In [ ]:
lazy_dict = {'_,e,i,l,m,p,s,t,t,y': 'empty_list',
'i,l,l,n,s,u': 'isnull',
'_,a,a,a,b,e,i,l,l,n,o,t,v': 'not_available'}
def parse_list(x):
return ','.join([y.strip() for y in sorted(x)])
replacement_search.date_created_old = replacement_search.date_created_old.apply(lambda x: pd.to_datetime(x))
replacement_search.date_created_new = replacement_search.date_created_new.apply(lambda x: pd.to_datetime(x))
for field in ['treatment_new',
'treatment_old',
'alternate_accessions_old',
'alternate_accessions_new',
'pooled_from_new',
'pooled_from_old',
'part_of_new',
'part_of_old']:
replacement_search[field] = replacement_search[field].apply(lambda x: parse_list(x)).apply(lambda x: lazy_dict[x] if x in lazy_dict.keys() else x)
In [ ]:
bcols
In [ ]:
biosamples_one_match = replacement_search.drop_duplicates('accession', keep=False)
In [ ]:
first_cols = ['accession', 'possible_replacement_accession']
bcols = first_cols + [col for col in sorted(biosamples_one_match.columns, reverse=True) if col not in first_cols]
In [ ]:
biosamples[biosamples['@id'].isin(replacement_search['@id_old'])].shape
In [ ]:
biosamples_one_match[bcols].lab_name_old.value_counts()
In [ ]:
biosamples_one_match[bcols]
In [ ]:
flat_patch = []
for replacement in bs_patch.possible_replacement_accession.unique():
data = {'accession': replacement,
'alternate_accessions:array': ", ".join(bs_patch[bs_patch.possible_replacement_accession == replacement].accession.values)}
flat_patch.append(data)
In [ ]:
fp = pd.DataFrame(flat_patch)
In [ ]:
# fp.to_csv('../../biosample_one_match_patch_07_03_2017.tsv', sep='\t', index=False)
In [ ]:
biosamples_multi_match[biosamples_multi_match.accession.isin(bs_multi_match.accession)]
In [ ]:
bs_multi_match = bs[(bs.donor_old == bs.donor_new)
& (bs.passage_number_old == bs.passage_number_new)
& (bs.lot_id_old == bs.lot_id_new)
& (bs.product_id_old == bs.product_id_new)
& (bs.culture_harvest_date_old == bs.culture_harvest_date_new)
& (bs.culture_start_date_old == bs.culture_start_date_new)]
In [ ]:
bs = biosamples_multi_match#.drop_duplicates('accession').shape
In [ ]:
bs_multi_match.submitted_by_old.value_counts()
In [ ]:
bs_multi_match.groupby(bcols).count()
In [ ]:
ANTIBODIES:
product_id=A301-145A
@type=AntibodyLot
targets.gene_name: "NCOR1",
antigen_description: "Nuclear Receptor corepressor 1; N-CoR, TRAC1, KIAA1047, hN-CoR",
source.title: "Bethyl Labs",
https://www.encodeproject.org/search/?type=AntibodyLot&targets.gene_name=NCOR1&source.title=Bethyl+Labs&product_id=A301-145A&status%21=replaced
BIOSAMPLE
biosample_type: "immortalized cell line",
treatment: [ ]
lab.name: "gene-yeo"
culture_start_date: "2015-06-12",
health_status: "hepatocellular carcinoma",
product_id: "HB-8065",
biosample_term_name: "HepG2",
@type: "Biosample"
donor.@id: "/human-donors/ENCDO000AAC/",
summary: "Homo sapiens HepG2 immortalized cell line",
life_stage: "child",
source.title: "ATCC",
https://www.encodeproject.org/search/
?type=Biosample&product_id=HB-8065
&health_status=hepatocellular+carcinoma
&culture_start_date=2015-06-12&status%21=replaced
FILE
quality_metrics.assay_term_name: "ChIP-seq",
file_type: "bam",
assembly: "hg19",
lab.name: "encode-processing-pipeline",
output_category: "alignment",
analysis_step_version.analysis_step.name: "bwa-raw-alignment-step-v-1",
biological_replicates: 1
technical_replicates: ["1_1"]
https://www.encodeproject.org/search/?type=File&file_format=bam
&output_type=alignments&quality_metrics.assay_term_name=ChIP-seq
&dataset=%2Fexperiments%2FENCSR021JFW%2F&assembly=hg19
LIBRARY
nucleic_acid_term_name: "DNA",
library_size_selection_method: "SPRI beads",
strand_specificity: false,
fragmentation_method: "shearing (Covaris S2)",
aliases: "tim-reddy:hic_dex.t0_brep1_lib"
lab: "/labs/tim-reddy/",
crosslinking_method: "formaldehyde",
biosample.summary: "Homo sapiens A549 immortalized cell line"
biosample.biosample_term_name: "A549"
https://www.encodeproject.org/search/?type=Library
&lab=%2Flabs%2Fthomas-gingeras%2F
&nucleic_acid_term_name=polyadenylated+mRNA
&strand_specificity=true&depleted_in_term_name=rRNA
&biosample.biosample_term_name=NCI-H460
&biosample.%40id=%2Fbiosamples%2FENCBS814QPR%2F&status%21=replaced
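In [ ]:
# Sketch of how the field notes above translate into a search request, using the
# AntibodyLot example; the query string is the one listed under ANTIBODIES, with
# format=json appended as in the other searches in this notebook.
url = 'https://www.encodeproject.org/search/'\
      '?type=AntibodyLot&targets.gene_name=NCOR1'\
      '&source.title=Bethyl+Labs&product_id=A301-145A'\
      '&status%21=replaced&format=json'
r = requests.get(url, auth=(key.authid, key.authpw))
candidates = r.json()['@graph']
[(c['accession'], c['status']) for c in candidates]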