In [ ]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
from ast import literal_eval
from collections import defaultdict
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))
In [ ]:
os.getcwd()
In [ ]:
# Copied from pyencoded-tools/encodedcc.py to avoid dependency.
class ENC_Key:
def __init__(self, keyfile, keyname):
if os.path.isfile(str(keyfile)):
keys_f = open(keyfile, 'r')
keys_json_string = keys_f.read()
keys_f.close()
keys = json.loads(keys_json_string)
else:
keys = keyfile
key_dict = keys[keyname]
self.authid = key_dict['key']
self.authpw = key_dict['secret']
self.server = key_dict['server']
if not self.server.endswith("/"):
self.server += "/"
class ENC_Connection(object):
def __init__(self, key):
self.headers = {'content-type': 'application/json', 'accept': 'application/json'}
self.server = key.server
self.auth = (key.authid, key.authpw)
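In [ ]:
# Illustrative sketch (not part of the original notebook): ENC_Key also accepts an
# in-memory dict shaped like keypairs.json, which makes it easy to show the layout
# this class expects without touching a real credentials file. The values below are
# placeholders, not real credentials.
example_keypairs = {
    'prod': {
        'key': 'EXAMPLE_KEY_ID',
        'secret': 'EXAMPLE_SECRET',
        'server': 'https://www.encodeproject.org'
    }
}
example_key = ENC_Key(example_keypairs, 'prod')
example_connection = ENC_Connection(example_key)
# A trailing slash is added to the server and (authid, authpw) is ready for requests.
example_connection.server, example_connection.auth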
In [ ]:
class ExtractedQCData(object):
"""Class to store extracted QC data
from ENCODE search results."""
def __init__(self, file):
self.accession = file['accession']
self.file_status = file['status']
self.file_format = file['file_format']
self.assembly = file['assembly']
self.dataset = file['dataset']
self.date_file_created = file['date_created']
self.output_type = file['output_type']
try:
self.analysis_name = file['analysis_step_version']['analysis_step']['name']
except KeyError:
self.analysis_name = np.nan
self.quality_metrics = []
for metric in file['quality_metrics']:
try:
assay_term_name = metric['assay_term_name']
except KeyError:
try:
assay_term_name = file['analysis_step_version']['analysis_step']\
['pipelines'][0]['assay_term_name']
except (KeyError, IndexError):
assay_term_name = np.nan
try:
# Only present on SamtoolsFlagstatsQualityMetric objects.
processing_stage = metric['processing_stage']
except KeyError:
processing_stage = 'unknown'
metric_data = {
'uuid': metric['uuid'],
'@type': metric['@type'][0],
'qc_status': metric['status'],
'quality_metric_of': [x for x in metric['quality_metric_of']],
'date_qc_created': metric['date_created'],
'assay_term_name': assay_term_name,
'processing_stage': processing_stage
}
self.quality_metrics.append(metric_data)
self.metric_count = len(self.quality_metrics)
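In [ ]:
# Illustrative sketch (hand-made record, not real ENCODE data): a minimal file dict
# with the fields ExtractedQCData reads, to show what ends up in each attribute.
# The accession, uuid, and dates below are made up.
example_file = {
    'accession': 'ENCFF000AAA',
    'status': 'released',
    'file_format': 'bam',
    'assembly': 'GRCh38',
    'dataset': '/experiments/ENCSR000AAA/',
    'date_created': '2017-01-01T00:00:00.000000+00:00',
    'output_type': 'alignments',
    'analysis_step_version': {
        'analysis_step': {
            'name': 'bwa-alignment-step-v-1',
            'pipelines': [{'assay_term_name': 'ChIP-seq'}]
        }
    },
    'quality_metrics': [{
        'uuid': '00000000-0000-0000-0000-000000000000',
        '@type': ['SamtoolsFlagstatsQualityMetric', 'QualityMetric', 'Item'],
        'status': 'released',
        'quality_metric_of': ['/files/ENCFF000AAA/'],
        'date_created': '2017-01-02T00:00:00.000000+00:00',
        'processing_stage': 'filtered'
    }]
}
# assay_term_name falls back to the pipeline value because the metric itself
# carries no assay_term_name field.
vars(ExtractedQCData(example_file))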
In [ ]:
def get_data(url, output_json='qc_metrics.json', auth=False, force_update=False):
"""Gets data from ENCODE and stores
it in output_file. If output_file already
exists then it will use cached version.
Set force_update=True to update cached file.
WARNING: Does not update data automatically
if search URL is changed. Use force_update=True.
Can pass auth=(key.authid, key.authpw)
to return non-public results. Returns pandas
DataFrame object."""
if not output_json.endswith('.json'):
raise ValueError("Output file must be a .json")
if os.path.isfile(output_json) and not force_update:
data = pd.read_json(output_json)
else:
r = requests.get(url, auth=auth)
search_results = r.json()['@graph']
results = []
for file in search_results:
extracted_data = ExtractedQCData(file)
results.append(extracted_data)
data = pd.DataFrame([vars(r) for r in results])
data.to_json(output_json)
return data
In [ ]:
def tidy_data(data):
"""Extracts data from quality_metric
column, returning a tidy dataset where
each quality_metric has its own row. Since
several quality_metrics can refer to same file,
this will duplicate accession and other high-level
metadata."""
# Convert DataFrame to list of objects.
results = data.to_dict(orient='records')
# Make tidy data from list of QC metrics.
data = pd.io.json.json_normalize([r for r in results],
'quality_metrics',
[x for x in results[0].keys() if x != 'quality_metrics'])
# Convert date information to datetime type.
data[['date_qc_created',
'date_file_created']] = data[['date_qc_created',
'date_file_created']].apply(pd.to_datetime)
return data
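In [ ]:
# Illustrative sketch (made-up records): how tidy_data() flattens the
# quality_metrics column. One file with two metrics becomes two rows, with the
# file-level metadata (accession, date_file_created) repeated on each row.
example_records = pd.DataFrame([{
    'accession': 'ENCFF000AAA',
    'date_file_created': '2017-01-01T00:00:00.000000+00:00',
    'quality_metrics': [
        {'uuid': 'uuid-1', '@type': 'SamtoolsFlagstatsQualityMetric',
         'date_qc_created': '2017-01-02T00:00:00.000000+00:00'},
        {'uuid': 'uuid-2', '@type': 'SamtoolsFlagstatsQualityMetric',
         'date_qc_created': '2017-01-03T00:00:00.000000+00:00'}
    ]
}])
tidy_data(example_records)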
In [ ]:
# Define a key if private (non-public) data is desired.
key = ENC_Key(os.path.expanduser("~/keypairs.json"), 'prod')
# Define query to return objects.
url = 'https://www.encodeproject.org/'\
'search/?type=File&quality_metrics.uuid=%2A'\
'&format=json&limit=all'
#frame=object
In [ ]:
data = get_data(url, auth=(key.authid, key.authpw), force_update=True)
In [ ]:
data.shape
In [ ]:
td = tidy_data(data)
#td['date_qc_created'] = td['date_qc_created'].apply(lambda x: x.strftime('%m-%d-%Y'))
#td['date_qc_created'] = td['date_qc_created'].apply(pd.to_datetime)
In [ ]:
td.shape
In [ ]:
td = td.fillna('unknown')
In [ ]:
td.assay_term_name.value_counts()
In [ ]:
td.analysis_name.value_counts()
In [ ]:
td['@type'].value_counts()
In [ ]:
td.file_format.value_counts()
In [ ]:
data.file_format.value_counts()
In [ ]:
td.processing_stage.value_counts()
In [ ]:
processing_stage_data = td[['accession', 'uuid', 'processing_stage']]
In [ ]:
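# NOTE: expected_file_format and expected_output_type (and expected_analysis_name,
# used in the next cell) are defined in a later cell; run that cell first.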
file_format_url = '&file_format='.join(expected_file_format)
output_type_url = '&output_type='.join([x.replace(' ', '+') for x in expected_output_type])
url = 'https://www.encodeproject.org/search/?type=File'\
'&file_format={}&output_type={}&quality_metrics.uuid!=*'\
'&format=json&limit=all'.format(file_format_url,
output_type_url)
url
In [ ]:
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
result_list = []
for file in search_results:
try:
analysis_name = file['analysis_step_version']['analysis_step']['name']
except KeyError:
analysis_name = 'unknown'
try:
assay_term_name = file['analysis_step_version']['analysis_step']['pipelines'][0]['assay_term_name']
except (KeyError, IndexError):
assay_term_name = 'unknown'
try:
assembly = file['assembly']
except KeyError:
assembly = 'unknown'
extracted_results = {
'accession': file['accession'],
'assay_term_name': assay_term_name,
'lab': file['lab']['title'],
'file_format': file['file_format'],
'status': file['status'],
'assembly': assembly,
'dataset': file['dataset'],
'date_file_created': file['date_created'],
'output_type': file['output_type'],
'analysis_name': analysis_name
}
if extracted_results['analysis_name'] in expected_analysis_name:
result_list.append(extracted_results)
In [ ]:
len(result_list)
In [ ]:
no_qc = pd.DataFrame(result_list)
In [ ]:
no_qc[no_qc.lab == 'ENCODE Processing Pipeline'].drop(['accession',
'dataset',
'date_file_created'],
axis=1).apply(pd.Series.value_counts)\
.sort_values(['analysis_name',
'assembly',
'lab',
'output_type',
'status'], ascending=False)\
.T.stack().astype(int)
In [ ]:
no_qc
In [ ]:
# Files with expected file_format of:
expected_file_format = ['bam',
'bed',
'bigBed',
'tsv',
'bigWig']
# Files with expected output_type of:
expected_output_type = ['unfiltered alignments',
'conservative idr thresholded peaks',
'hotspots',
'optimal idr thresholded peaks',
'alignments',
'transcriptome alignments',
'gene quantifications',
'transcription start sites',
'signal of unique reads',
'peaks',
'pseudoreplicated idr thresholded peaks',
'methylation state at CHG',
'methylation state at CHH',
'signal',
'methylation state at CpG',
'filtered peaks']
# Files with expected analysis_name of:
expected_analysis_name = ['bwa-alignment-step-v-1',
'tf-idr-step',
'dnase-call-hotspots-v-1',
'tf-idr-peaks-to-bigbed-step',
'kundaje-lab-atac-seq-trim-align-filter-single-rep-v1',
'kundaje-lab-atac-seq-idr-v1', 'dnase-align-bwa-se-v-1',
'dnase-filter-se-v-1', 'lrna-pe-star-alignment-step-v-1',
'lrna-pe-rsem-quantification-v-1', 'rampage-idr-step-v-1',
'dnase-call-hotspots-alt-v-1', 'lrna-pe-tophat-alignment-step-v-1',
'ggr_tr1_rna_seq_trimming_mapping_step',
'rampage-grit-peak-calling-step-v-1',
'kundaje-lab-atac-seq-trim-align-filter-v1',
'modern-chip-seq-peaks-to-bigbed-step-v-1',
'modern-bwa-alignment-step-v-1',
'modern-chip-seq-optimal-idr-step-v-1',
'lrna-se-star-alignment-step-v-1',
'dme-align-pe-v-1',
'rampage-pe-alignment-step-v-1',
'bwa-raw-alignment-step-v-1',
'lrna-se-rsem-quantification-step-v-1',
'dnase-align-bwa-pe-v-1',
'dnase-filter-pe-v-1',
'small-rna-se-star-alignment-step-v-2',
'tf-unreplicated-idr-step',
'ggr_tr1_chip_seq_mapping_filtering_step',
'dme-extract-se-v-2',
'mott-trim-align-bismark-v-1-0',
'kundaje-lab-atac-seq-unreplicated-idr-single-rep-v1',
'dme-extract-pe-v-1',
'tf-unreplicated-idr-peaks-to-bigbed-step',
'unknown',
'modern-chip-seq-filter-for-optimal-idr-peaks-step-v-1',
'lrna-se-tophat-alignment-step-v-1',
'modern-chip-seq-optimal-idr-thresholded-peaks-to-bigbed-step-v-1',
'kundaje-lab-atac-seq-peak-call-single-rep-v1']
In [ ]:
no_qc_grouped = no_qc.groupby(['assay_term_name',
'file_format',
'analysis_name',
'output_type',
'accession']).count()
In [ ]:
no_qc_unstacked = no_qc_grouped[['status']].unstack()
In [ ]:
no_qc_unstacked_transformed = no_qc_unstacked.T
In [ ]:
no_qc_unstacked_summary = no_qc_unstacked_transformed.describe()
In [ ]:
no_qc_unstacked_summary_transformed = no_qc_unstacked_summary.T
In [ ]:
no_qc_dict = no_qc_unstacked_summary_transformed[['mean']].to_dict()
In [ ]:
for i in no_qc_dict['mean'].keys():
print(i)
In [ ]:
for i in summary_expected_dict.keys():
print(i)
In [ ]:
qc_metric_requirements = pd.read_csv("qc_metric_requirements.csv")
In [ ]:
grouped_data = td.groupby(['assay_term_name',
'file_format',
'analysis_name',
'output_type',
'@type',
'processing_stage',
'accession']).count()
In [ ]:
uuid_grouped_data = grouped_data[['uuid']]
In [ ]:
unstacked_uuid_grouped_data = uuid_grouped_data.unstack()
In [ ]:
unstacked_uuid_grouped_data_transformed = unstacked_uuid_grouped_data.T
In [ ]:
summary_unstacked_uuid_grouped_data_transformed = unstacked_uuid_grouped_data_transformed.describe()
In [ ]:
summary_transformed = summary_unstacked_uuid_grouped_data_transformed.T
In [ ]:
summary_transformed['mean'] = summary_transformed['mean'].apply(lambda x: round(x))
In [ ]:
summary_transformed_selected = summary_transformed[['count', 'min', 'mean', 'max']]
In [ ]:
summary_expected_dict = summary_transformed_selected[['mean']].to_dict()['mean']
In [ ]:
required_number = []
for i, y in summary_expected_dict.items():
qc_metric_details = {'assay_term_name': i[0],
'file_format': i[1],
'analysis_name': i[2],
'output_type': i[3],
'@type': i[4],
'processing_stage': i[5],
'count': y}
required_number.append(qc_metric_details)
required_number
In [ ]:
qc_metric_requirements = pd.DataFrame(required_number)[['assay_term_name',
'file_format',
'analysis_name',
'output_type',
'@type',
'processing_stage',
'count']]
In [ ]:
#qc_metric_requirements = pd.read_csv("qc_metric_requirements.csv")
#qc_metric_requirements.to_csv("qc_metric_requirements.csv")
In [ ]:
qc_metric_requirements.groupby(['assay_term_name',
'file_format',
'analysis_name',
'output_type',
'@type',
'processing_stage',
'count']).count()
In [ ]:
for i in summary_expected_dict.keys():
if 'rampage-pe-alignment-step-v-1' in i:
print(i)
print()
In [ ]:
for i, y in summary_expected_dict.items():
if y > 1:
print(i)
In [ ]:
assert summary_expected_dict[('CAGE',
'bam',
'rampage-pe-alignment-step-v-1',
'alignments',
'SamtoolsFlagstatsQualityMetric',
'unknown')] == 1
assert summary_expected_dict[('ChIP-seq',
'bam',
'bwa-alignment-step-v-1',
'alignments',
'SamtoolsFlagstatsQualityMetric',
'filtered')] == 1
In [ ]:
#td['date_qc_created'] = td['date_qc_created'].apply(lambda x: x.strftime('%m-%d-%Y'))
In [ ]:
# Only files that haven't been revoked, deleted, replaced, or flagged with a content error.
active_files = td[~(td.file_status.isin(['revoked',
'deleted',
'replaced',
'content error']))].groupby(['accession',
'assay_term_name',
'file_format',
'analysis_name',
'output_type',
'@type',
'assembly']).count()[['uuid']].reset_index()
active_files = active_files.reset_index()
In [ ]:
# All file statuses
active_files = td.groupby(['accession',
'assay_term_name',
'file_format',
'analysis_name',
'output_type',
'@type',
'assembly',
'processing_stage']).count()[['uuid']]#.reset_index()
active_files = active_files.reset_index()
In [ ]:
active_files['mapping_key'] = list(zip(active_files.assay_term_name,
active_files.file_format,
active_files.analysis_name,
active_files.output_type,
active_files['@type'],
active_files.processing_stage))
In [ ]:
active_files['expected_number'] = active_files['mapping_key'].apply(lambda x: summary_expected_dict[x])
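In [ ]:
# Hedged sketch (not used in the rest of the analysis): the lookup above raises a
# KeyError for any mapping_key missing from summary_expected_dict. A more forgiving
# variant maps missing combinations to NaN so they stay visible; the count below
# should be zero here because the dictionary was built from the same grouping of td.
expected_with_default = active_files['mapping_key'].apply(
    lambda x: summary_expected_dict.get(x, np.nan))
expected_with_default.isnull().sum()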
In [ ]:
active_files[active_files.uuid != active_files.expected_number]
In [ ]:
active_files_mismatch = active_files[active_files.uuid != active_files.expected_number]\
.rename(columns={'uuid': 'actual_number'})\
[['accession',
'assay_term_name',
'analysis_name',
'assembly',
'output_type',
'@type',
'processing_stage',
'expected_number',
'actual_number']].sort_values(['assembly',
'output_type',
'@type',
'actual_number'], ascending=False)
In [ ]:
afm = active_files_mismatch.merge(td, how='left', on=['accession',
'@type',
'assay_term_name',
'analysis_name',
'assembly',
'output_type',
'processing_stage'])
afm
In [ ]:
afm.groupby(['assay_term_name',
'accession',
'output_type',
'@type',
'processing_stage',
'expected_number',
'actual_number',
'qc_status',
'uuid',
'date_qc_created',
'assembly',
'analysis_name',
'file_format',
'date_file_created',
'dataset',
'file_status']).count()
In [ ]:
In [ ]:
#afm.to_excel("duplicate_quality_metrics_06_13_2017.xlsx")
In [ ]:
experiments = td[td.accession.isin(active_files_mismatch['accession'].values)]\
[['accession', 'dataset']].drop_duplicates('accession')
In [ ]:
active_files_mismatch['dataset'] = active_files_mismatch['accession']\
.apply(lambda x: experiments[experiments.accession == x]['dataset'].values[0])
In [ ]:
active_files_mismatch['@type'].value_counts()
In [ ]:
search_ids = "&@id=".join(active_files_mismatch.dataset.unique())
url = 'https://www.encodeproject.org/search/?type=Experiment&limit=all&@id={}'.format(search_ids)
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
search_id_map = {}
for experiment in search_results:
search_id_map[experiment['@id']] = experiment['status']
In [ ]:
active_files_mismatch['experiment_status'] = active_files_mismatch['dataset'].apply(lambda x: search_id_map[x])
active_files_mismatch = active_files_mismatch.reset_index(drop=True)
active_files_mismatch['dataset'] = active_files_mismatch['dataset'].apply(lambda x: x.replace("/experiments/", "").replace("/", ""))
In [ ]:
active_files_mismatch.shape
In [ ]:
active_files_mismatch[active_files_mismatch.expected_number < active_files_mismatch.actual_number].shape
In [ ]:
released_data = active_files_mismatch[~(active_files_mismatch.experiment_status.isin(['revoked',
'deleted']))]\
.groupby(['assay_term_name',
'dataset',
'accession',
'assembly',
'output_type',
'@type',
'expected_number',
'actual_number',
'experiment_status'])\
.count().unstack()
In [ ]:
released_data
In [ ]:
# Pull files flagged by Idan's redundant quality metric audit (demo instance).
url = 'https://4800-redundant-qc-6f1aa1cb5-idan.demo.encodedcc.org/search/'\
'?type=File&audit.INTERNAL_ACTION.category=redundant+quality+metric&format=json&limit=all'
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
audit_flag = []
for file in search_results:
audit_flag.append(file['accession'])
In [ ]:
len(audit_flag)
In [ ]:
af = pd.DataFrame(audit_flag)
af = af.rename(columns={0:'accession'})
In [ ]:
m = '0 ENCFF860EEZ 1 ENCFF806GBQ 2 ENCFF865OKF 3 ENCFF899TCQ 4 ENCFF011AMS 5 ENCFF414WJD 6 ENCFF663HVJ 7 ENCFF998PWW 8 ENCFF243AGG 9 ENCFF258AQQ 10 ENCFF709UWZ 11 ENCFF023KBS 12 ENCFF385ZPU 13 ENCFF619KWP 14 ENCFF767SOM 15 ENCFF948BIJ 16 ENCFF358RZK 17 ENCFF355CBE 18 ENCFF266EKT 19 ENCFF236AJI 20 ENCFF307AIG 21 ENCFF112PJJ 22 ENCFF694GJR 23 ENCFF591KID 24 ENCFF472SIB 25 ENCFF918TKF 26 ENCFF409NLS 27 ENCFF930IKU 28 ENCFF543CGB 29 ENCFF713UFE 30 ENCFF497JBZ 31 ENCFF496OJU 32 ENCFF278OYY 33 ENCFF526CXQ 34 ENCFF311KIP 35 ENCFF779CWH 36 ENCFF373UYA 37 ENCFF315SPJ 38 ENCFF548IHT 39 ENCFF484BSF 40 ENCFF899PSI 41 ENCFF713BDU 42 ENCFF193TFR 43 ENCFF211ZKG 44 ENCFF041ZEP 45 ENCFF220MHX 46 ENCFF077WNR 47 ENCFF283MNG 48 ENCFF896RNG 49 ENCFF970WRP 50 ENCFF659WGE 51 ENCFF281NFW 52 ENCFF491DDU 53 ENCFF137UAY 54 ENCFF896WQY 55 ENCFF950MBW 56 ENCFF900IQG 57 ENCFF384LXN 58 ENCFF865RQN 59 ENCFF071NEW 60 ENCFF476OYZ 61 ENCFF333VCK 62 ENCFF352PYL 63 ENCFF105VHD 64 ENCFF008RRH 65 ENCFF878BEQ 66 ENCFF078QRE 67 ENCFF150LTG 68 ENCFF576YJD 69 ENCFF501KQJ 70 ENCFF764BNL 71 ENCFF278QIG 72 ENCFF988FAL 73 ENCFF901ONA 74 ENCFF951FBF 75 ENCFF344UBE 76 ENCFF218LOB 77 ENCFF792ZKL 78 ENCFF435EIY 79 ENCFF960TDU 80 ENCFF549UEZ 81 ENCFF507RMK 82 ENCFF998KDQ 83 ENCFF312IYF 84 ENCFF811YFQ 85 ENCFF225TLP 86 ENCFF927TJR 87 ENCFF948MDE 88 ENCFF286YMB 89 ENCFF304XON 90 ENCFF375BIG 91 ENCFF365AWU 92 ENCFF093ZFB 93 ENCFF838BUR 94 ENCFF138SMJ 95 ENCFF032YPC 96 ENCFF689YCR 97 ENCFF540SLS 98 ENCFF546ZGF 99 ENCFF263EQX 100 ENCFF276EKT 101 ENCFF425KXW 102 ENCFF455EXR 103 ENCFF161QXX 104 ENCFF532MZI 105 ENCFF766SWF 106 ENCFF904CGH 107 ENCFF977MOV 108 ENCFF214OJW 109 ENCFF424FKL 110 ENCFF156XRJ 111 ENCFF437YIV 112 ENCFF435IEF 113 ENCFF349GID 114 ENCFF835RGX 115 ENCFF086IJA 116 ENCFF203ZIS 117 ENCFF534ULU 118 ENCFF909LKK 119 ENCFF784MKW 120 ENCFF248AGS 121 ENCFF488HDC 122 ENCFF217ISJ 123 ENCFF715NLX 124 ENCFF177LBB 125 ENCFF360MGZ 126 ENCFF801KEW 127 ENCFF884DZN 128 ENCFF566WGL 129 ENCFF374VWZ 130 ENCFF319FBU 131 ENCFF279MNV 132 ENCFF448YLM 133 ENCFF199LDJ 134 ENCFF323PIB 135 ENCFF148VQH 136 ENCFF485KSO 137 ENCFF362MSS 138 ENCFF562MPS 139 ENCFF505MGI 140 ENCFF731ZYR 141 ENCFF113DVJ 142 ENCFF198BFX 143 ENCFF388HRC 144 ENCFF731PDS 145 ENCFF378LGJ 146 ENCFF395BPD 147 ENCFF958MPB 148 ENCFF017LYR 149 ENCFF222DAU 150 ENCFF138WSJ 151 ENCFF245SEV 152 ENCFF051XIW 153 ENCFF407MVV 154 ENCFF152JZK 155 ENCFF121WOM 156 ENCFF304VVZ 157 ENCFF207ZRI 158 ENCFF743FCW 159 ENCFF015XIL 160 ENCFF531PVU 161 ENCFF257QND 162 ENCFF623CPY 163 ENCFF384SBC 164 ENCFF740RLD 165 ENCFF783LYO 166 ENCFF112YYK 167 ENCFF084FGO 168 ENCFF426JHB 169 ENCFF822NZG 170 ENCFF670GFY 171 ENCFF588PIS 172 ENCFF860JUD 173 ENCFF618GUI 174 ENCFF926JOK 175 ENCFF363HXH 176 ENCFF456PDQ 177 ENCFF056ESE 178 ENCFF728UAB 179 ENCFF808NFI 180 ENCFF256VYP 181 ENCFF843MEU 182 ENCFF549PGC 183 ENCFF585IAJ 184 ENCFF437YPG'
In [ ]:
import re
z = [x.strip() for x in re.split(r'\d* ', m) if x != '']
z = pd.DataFrame(z).rename(columns={0:'accession'})
af = z
In [ ]:
active_files_mismatch = active_files_mismatch.reset_index(drop=True)
In [ ]:
active_files_mismatch.assay_term_name.value_counts()
In [ ]:
afm = active_files_mismatch[active_files_mismatch.expected_number < active_files_mismatch.actual_number]
In [ ]:
afm.shape
In [ ]:
#af[~(af.accession.isin(afm.accession.values))].shape
In [ ]:
#afm[afm.accession.isin(af.accession.values)]
In [ ]:
afm[afm.accession.isin(af.accession.values)]\
.sort_values(['assay_term_name', 'assembly'])\
.reset_index(drop=True) #.to_excel('redundant_qc_metrics_06_01_17.xlsx')
In [ ]:
af[(af.accession.isin(afm.accession.values))]
In [ ]:
afm[~(afm.accession.isin(af.accession.values))]\
.sort_values(['assay_term_name', 'assembly'])\
.reset_index(drop=True) #.to_excel("duplicate_but_different_05_31_2017.xlsx")
ddbd = afm[~(afm.accession.isin(af.accession.values))]\
.sort_values(['assay_term_name', 'assembly'])\
.reset_index(drop=True)
In [ ]:
afm.assay_term_name.value_counts()
In [ ]:
# Only if af has been defined above
both = afm[afm.accession.isin(af.accession.values)]\
.sort_values(['assay_term_name', 'assembly'])\
.reset_index(drop=True)
In [ ]:
In [ ]:
both = afm.sort_values(['assay_term_name', 'assembly'])\
.reset_index(drop=True)
both
In [ ]:
len(both.accession.unique())
In [ ]:
uuid = both.groupby(['assay_term_name',
'accession',
'output_type',
'@type',
'processing_stage',
'expected_number',
'actual_number',
'qc_status',
'uuid',
'date_qc_created',
'assembly',
'analysis_name',
'file_format',
'date_file_created',
'dataset',
'file_status']).count()\
.drop(['quality_metric_of',
'metric_count'], axis=1)\
.sort_index(level=[0, 1, 2, 3, 4, 9])
In [ ]:
# uuid = both.merge(td,
# how='left',
# on=['accession',
# '@type',
# 'assay_term_name',
# 'analysis_name',
# 'output_type',
# 'assembly',
# 'processing_stage'])\
# .groupby(['assay_term_name',
# 'accession',
# 'output_type',
# '@type',
# 'processing_stage',
# 'expected_number',
# 'actual_number',
# 'qc_status',
# 'uuid',
# 'date_qc_created',
# 'assembly',
# 'analysis_name',
# 'file_format',
# 'date_file_created',
# 'dataset',
# 'file_status']).count()\
# .drop(['quality_metric_of',
# 'metric_count'], axis=1)\
# .sort_index(level=[0, 1, 2, 3, 4, 9])#.to_excel("uuids_of_duplicate_metrics_06_01_2017.xlsx")
# uuid
In [ ]:
uuid_dict = uuid.reset_index().to_dict(orient='records')
In [ ]:
uuid_dict
In [ ]:
# From Idan's redundant QC audit code.
def audit_file_redundant_qc_metrics(quality_metrics):
metrics_set = set()
if quality_metrics:
redundant_types = set()
for metric in quality_metrics:
metric.pop('uuid', None)
metric.pop('@id', None)
metric.pop('date_created', None)
metric.pop('audit', None)
metric.pop('aliases', None)
metric.pop('attachment', None)
metric.pop('IDR_plot_true', None)
metric.pop('IDR_plot_rep1_pr', None)
metric.pop('IDR_plot_rep2_pr', None)
metric.pop('IDR_plot_pool_pr', None)
metric.pop('IDR_parameters_true', None)
metric.pop('IDR_parameters_rep1_pr', None)
metric.pop('IDR_parameters_rep2_pr', None)
metric.pop('IDR_parameters_pool_pr', None)
metric.pop('cross_correlation_plot', None)
metric.pop('quality_metric_of', None)
metric.pop('schema_version', None)
metric.pop('status', None)
metric.pop('step_run', None)
metric.pop('submitted_by', None)
metric_string = str(ordered_representation(metric))
if metric_string in metrics_set:
redundant_types.add(metric['@type'][0])
else:
metrics_set.add(metric_string)
return metrics_set
def ordered_representation(obj):
if isinstance(obj, dict):
return sorted((k, ordered_representation(v)) for k, v in obj.items())
if isinstance(obj, list):
return sorted(ordered_representation(x) for x in obj)
else:
return obj
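In [ ]:
# Illustrative sketch (made-up metric values): ordered_representation() canonicalizes
# nested dicts and lists so that two metrics with the same content but different key
# or list ordering compare equal, which is what the redundant-QC check above relies on.
metric_a = {'total': 100, 'mapped': 90, 'flags': ['paired', 'proper']}
metric_b = {'flags': ['proper', 'paired'], 'mapped': 90, 'total': 100}
ordered_representation(metric_a) == ordered_representation(metric_b)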
In [ ]:
# Ugly code to create nested dictionary
accession_grouped = defaultdict(list)
for record in uuid_dict:
accession_grouped[record['accession']].append(record)
In [ ]:
accession_type_grouped = defaultdict(list)
for k, v in accession_grouped.items():
type_grouped = defaultdict(list)
for record in v:
type_grouped[record['@type']].append(record)
accession_type_grouped[k].append(type_grouped)
In [ ]:
accession_type_processing_grouped = defaultdict(list)
for k, v in accession_type_grouped.items():
for x in v:
type_processing_grouped = defaultdict(list)
for i, y in x.items():
processing_stage = defaultdict(list)
for t in y:
processing_stage[t['processing_stage']].append(t)
type_processing_grouped[i].append(processing_stage)
accession_type_processing_grouped[k].append(type_processing_grouped)
accession_type_processing_grouped
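In [ ]:
# Hedged alternative (sketch only; the cells below keep using the nested
# defaultdicts above): the same accession -> @type -> processing_stage grouping can
# be expressed with a single tuple-keyed defaultdict, which is easier to inspect.
flat_grouped = defaultdict(list)
for record in uuid_dict:
    flat_grouped[(record['accession'],
                  record['@type'],
                  record['processing_stage'])].append(record)
len(flat_grouped)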
In [ ]:
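# For every (accession, @type, processing_stage) group, fetch the grouped metrics
# by uuid, strip the administrative fields, and compare what remains; groups whose
# number of truly unique metrics differs from the expected count are recorded as
# duplicate-but-different and printed.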
duplicate_but_different_accessions = []
for k, v in accession_type_processing_grouped.items():
for x in v:
for i, y in x.items():
for t in y:
for p, d in t.items():
expected_number = d[0]['expected_number']
uuid_list = []
for z in d:
uuid_list.append(z['uuid'])
uuids = '&uuid='.join(uuid_list)
url = 'https://www.encodeproject.org/search/'\
'?type=QualityMetric&uuid={}&format=json'\
'&frame=embedded&limit=all'.format(uuids)
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
unique_metrics = audit_file_redundant_qc_metrics(search_results)
if len(unique_metrics) != expected_number:
duplicate_but_different_accessions.append((k, i, p))
print(k)
print(i)
print(p)
print()
print(*unique_metrics, sep="\n\n")
print()
print("Number of unique metrics: {}".format(len(unique_metrics)))
print("Expected number of metrics: {}".format(expected_number))
print("\n\n")
In [ ]:
len(duplicate_but_different_accessions)
In [ ]:
duplicate_but_different_accessions
In [ ]:
# Define duplicate_but_different here since later cells reference it.
duplicate_but_different = afm[(afm.accession.isin([x[0] for x in duplicate_but_different_accessions]))
& (afm['@type'].isin([x[1] for x in duplicate_but_different_accessions]))]
In [ ]:
ddbd2 = afm[(afm.accession.isin([x[0] for x in duplicate_but_different_accessions]))
& (afm['@type'].isin([x[1] for x in duplicate_but_different_accessions]))]
In [ ]:
ddbd2
In [ ]:
pd.concat([ddbd, ddbd2])\
.sort_values('assay_term_name')\
.reset_index(drop=True)\
#.to_excel("duplicate_but_different_06_01_2017.xlsx")
In [ ]:
duplicate_but_different\
.sort_values('assay_term_name')\
.reset_index(drop=True)\
.merge(td,
how='left',
on=['accession',
'@type',
'assay_term_name',
'analysis_name',
'output_type',
'assembly',
'processing_stage'])\
.drop(['quality_metric_of',
'metric_count'], axis=1)\
.sort_values(['accession',
'@type',
'date_qc_created'])\
.reset_index(drop=True)\
.groupby(['assay_term_name',
'accession',
'output_type',
'@type',
'processing_stage',
'expected_number',
'actual_number',
'qc_status',
'uuid',
'date_qc_created',
'assembly',
'analysis_name',
'file_format',
'date_file_created',
'dataset',
'file_status']).count()\
.sort_index(level=[0, 1, 2, 3, 4, 9])
In [ ]:
pd.concat([ddbd, ddbd2])\
.sort_values('assay_term_name')\
.reset_index(drop=True)\
.merge(td,
how='left',
on=['accession',
'@type',
'assay_term_name',
'analysis_name',
'output_type',
'assembly',
'processing_stage'])\
.drop(['quality_metric_of',
'metric_count'], axis=1)\
.sort_values(['accession',
'@type',
'date_qc_created'])\
.reset_index(drop=True)\
.groupby(['assay_term_name',
'accession',
'output_type',
'@type',
'processing_stage',
'expected_number',
'actual_number',
'qc_status',
'uuid',
'date_qc_created',
'assembly',
'analysis_name',
'file_format',
'date_file_created',
'dataset',
'file_status']).count()\
.sort_index(level=[0, 1, 2, 3, 4, 9])#.to_excel("duplicate_but_different_grouped_06_02_2017.xlsx")
In [ ]:
len(both.accession.unique())
In [ ]:
ddbd2
In [ ]:
consensus = both[~((both.accession.isin(ddbd2.accession.values))\
& (both['@type'].isin(ddbd2['@type'].values)))].reset_index(drop=True)
In [ ]:
consensus_uuids = both
In [ ]:
search_ids = "&@id=".join(consensus_uuids.dataset.unique())
url = 'https://www.encodeproject.org/search/'\
'?type=Experiment&limit=all&frame=embedded&@id={}'.format(search_ids)
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
search_id_map = {}
for experiment in search_results:
search_id_map[experiment['@id']] = experiment['lab']['name']
In [ ]:
search_id_map
In [ ]:
consensus_uuids['experiment_lab'] = consensus_uuids.dataset.apply(lambda x: search_id_map[x])
In [ ]:
consensus_uuids.drop_duplicates('accession', keep='first').experiment_lab.value_counts()
In [ ]:
lab_wrangler_lookup = {
"bradley-bernstein": "Cricket",
"michael-snyder": "Jason",
"richard-myers": "Carrie",
"john-stamatoyannopoulos": "Jason",
"peggy-farnham": "Esther",
"brenton-graveley": "Kath",
"bing-ren": "Kath",
"xiang-dong-fu": "Kath",
"will-greenleaf": "Jason",
"kevin-white": "Esther",
"barbara-wold": "Aditi"
}
In [ ]:
consensus_uuids['wrangler'] = consensus_uuids.experiment_lab\
.apply(lambda x: lab_wrangler_lookup.get(x, 'unknown'))
In [ ]:
consensus_uuids.drop_duplicates('accession', keep='first').wrangler.value_counts()
In [ ]:
consensus_uuids['@type'].unique()
In [ ]:
# Possible attachments as defined in the schema.
quality_attachments = {
'IDRQualityMetric': ['IDR_plot_true',
'IDR_plot_rep1_pr',
'IDR_plot_rep2_pr',
'IDR_plot_pool_pr',
'IDR_parameters_true',
'IDR_parameters_rep1_pr',
'IDR_parameters_rep2_pr',
'IDR_parameters_pool_pr'],
'ChipSeqFilterQualityMetric': ['cross_correlation_plot',
'attachment'],
'SamtoolsFlagstatsQualityMetric': ['attachment'],
'StarQualityMetric': ['attachment'],
'GenericQualityMetric': ['attachment'],
'ComplexityXcorrQualityMetric': ['cross_correlation_plot',
'attachment']
}
In [ ]:
query = '&accession='.join(consensus_uuids.accession.unique())
In [ ]:
url = 'https://www.encodeproject.org/'\
'search/?type=File&accession={}'\
'&frame=embedded&format=json&limit=all'.format(query)
In [ ]:
r = requests.get(url, auth=(key.authid, key.authpw))
In [ ]:
results = r.json()['@graph']
In [ ]:
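# For every embedded file result, record each possible attachment of each quality
# metric as (uuid, @type, attachment field, filename, md5sum); metrics missing an
# attachment get 'no_value' placeholders and a note in error_log.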
records = []
error_log = []
for result in results:
for metric in result['quality_metrics']:
uuid = metric['uuid']
metric_type = metric['@type'][0]
for possible_attachment in quality_attachments[metric_type]:
try:
records.append((uuid,
metric_type,
possible_attachment,
metric[possible_attachment]['download'],
metric[possible_attachment]['md5sum']))
except KeyError:
records.append((uuid,
metric_type,
possible_attachment,
'no_value',
'no_value'))
error_log.append("{} has no attachment of type {}.".format(metric_type,
possible_attachment))
In [ ]:
attachment_details = pd.DataFrame(records).rename(columns={0: "uuid",
1: "@type",
2: "attachment_field",
3: "attachment_name",
4: "attachment_md5"})
attachment_details = attachment_details[(attachment_details.uuid.isin(consensus_uuids.uuid.values))]\
.reset_index(drop=True)
In [ ]:
len(attachment_details.uuid.unique())
In [ ]:
len(consensus_uuids.uuid.unique())
In [ ]:
consensus_uuids.shape
In [ ]:
consensus_uuids[consensus_uuids.uuid.isin(attachment_details.uuid.values)].shape
In [ ]:
attachment_details.attachment_field.value_counts()
In [ ]:
consensus_uuids.columns
In [ ]:
attachment_details.columns
In [ ]:
cc = consensus_uuids.merge(attachment_details,
how='left',
on=['uuid', '@type']).groupby(['assay_term_name',
'wrangler',
'experiment_lab',
'accession',
'output_type',
'@type',
'processing_stage',
'expected_number',
'actual_number',
'attachment_field',
'qc_status',
'uuid',
'date_qc_created',
'attachment_name',
'attachment_md5',
'assembly',
'analysis_name',
'file_format',
'date_file_created',
'dataset',
'file_status']).count()\
.sort_index(level=[0, 1, 2, 3, 4, 5, 6, 9, 12, 13, 14])#.to_excel("duplicate_metrics_no_value_06_05_2017.xlsx")
cc
In [ ]:
def most_recent_uuid_with_attachment(uuid_group):
"""Returns most recent quality metric uuid
with a valid attachment MD5sum, given a sorted
group of otherwise redundant metrics. If all
attachments are missing (no_value) then the
most recent uuid will be returned."""
uuid_to_keep = None
# Pick most recent uuid with attachment.
for metric in uuid_group[::-1]:
if uuid_to_keep:
break
if metric[4] != 'no_value':
uuid_to_keep = metric[1]
# If the attachment is missing for all uuids,
# then pick the most recent.
if not uuid_to_keep:
uuid_to_keep = uuid_group[-1][1]
return uuid_to_keep
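In [ ]:
# Illustrative sketch (made-up uuids and md5sums): most_recent_uuid_with_attachment()
# expects records shaped like the selection in the next cell, i.e. the columns
# ['uuid', 'date_qc_created', 'qc_status', 'attachment_md5'] from .to_records(),
# sorted oldest to newest. Here the newest metric has no attachment, so the older
# uuid with a real md5sum is the one kept.
example_group = pd.DataFrame([
    {'uuid': 'uuid-old', 'date_qc_created': '2017-01-01',
     'qc_status': 'released', 'attachment_md5': 'abc123'},
    {'uuid': 'uuid-new', 'date_qc_created': '2017-02-01',
     'qc_status': 'released', 'attachment_md5': 'no_value'}
])[['uuid', 'date_qc_created', 'qc_status', 'attachment_md5']].to_records()
most_recent_uuid_with_attachment(example_group)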
In [ ]:
# Choose the uuid to keep based on the date and whether an attachment exists.
uuids_attachments = cc.reset_index().sort_values(['accession',
'@type',
'date_qc_created']).reset_index(drop=True)
uuids_to_keep = []
for accession in uuids_attachments.accession.unique():
# Pull DataFrame records for each accession.
selected_rows = uuids_attachments[uuids_attachments.accession == accession]
metric_types = selected_rows['@type'].unique()
for metric_type in metric_types:
processing_stages = selected_rows[selected_rows['@type'] == metric_type]\
['processing_stage'].unique()
for processing_stage in processing_stages:
attachment_types = selected_rows[(selected_rows['@type'] == metric_type)
& (selected_rows['processing_stage'] == processing_stage)]\
['attachment_field'].unique()
uuid_set = set()
for attachment_type in attachment_types:
uuid_group = selected_rows[(selected_rows['@type'] == metric_type)
& (selected_rows['processing_stage'] == processing_stage)
& (selected_rows['attachment_field'] == attachment_type)]\
.sort_values('date_qc_created')[['uuid',
'date_qc_created',
'qc_status',
'attachment_md5']]\
.reset_index(drop=True)\
.to_records()
uuid_set.add(most_recent_uuid_with_attachment(uuid_group))
if len(uuid_set) == 1:
uuids_to_keep.append(list(uuid_set)[0])
else:
raise ValueError('Different attachment fields between metrics.'
' Conflicting uuids: {}'.format(uuid_set))
In [ ]:
utk = pd.DataFrame(list(set(uuids_to_keep)))
utk['action'] = 'keep'
utk = utk.rename(columns={0:'uuid'})
In [ ]:
# Proposed action.
action_proposed = consensus_uuids.merge(attachment_details,
how='left', on=['uuid','@type'])\
.merge(utk,
how="left",
on=['uuid'])\
.fillna("delete")\
.groupby(['assay_term_name',
'wrangler',
'experiment_lab',
'accession',
'output_type',
'@type',
'processing_stage',
'expected_number',
'actual_number',
'attachment_field',
'qc_status',
'action',
'uuid',
'date_qc_created',
'attachment_name',
'attachment_md5',
'assembly',
'analysis_name',
'file_format',
'date_file_created',
'dataset',
'file_status']).count()\
.sort_index(level=[0, 1, 2, 3, 4, 5, 6, 9, 13, 14])
action_proposed#.to_excel('delete_keep_all_pipelines_06_06_2017.xlsx')
In [ ]:
action_proposed_flat = action_proposed.reset_index()
action_proposed_flat = consensus_uuids.reset_index()
In [ ]:
# Remove metrics that Esther still needs to decide on.
action_proposed_flat = action_proposed_flat[~(action_proposed_flat.accession.isin(['ENCFF245SEV',
'ENCFF808NFI']))].reset_index(drop=True)
In [ ]:
# Remove metric Seth fixed.
action_proposed_flat = action_proposed_flat[~(action_proposed_flat.accession == 'ENCFF156XRJ')].reset_index(drop=True)
In [ ]:
uuids_to_keep = action_proposed_flat[action_proposed_flat.action == "keep"].uuid.unique()
In [ ]:
uuids_to_keep
In [ ]:
len(uuids_to_keep)
In [ ]:
uuids_to_delete = action_proposed_flat[action_proposed_flat.action == "delete"].uuid.unique()
In [ ]:
len(uuids_to_delete)
In [ ]:
# Make sure no overlap between lists.
set(uuids_to_keep).intersection(set(uuids_to_delete))
In [ ]:
uuids_to_keep = pd.DataFrame(uuids_to_keep)
uuids_to_delete = pd.DataFrame(uuids_to_delete)
In [ ]:
uuids_to_keep['status'] = "released"
uuids_to_delete['status'] = "deleted"
uuids_to_keep = uuids_to_keep.rename(columns={0: "uuid"})
uuids_to_delete = uuids_to_delete.rename(columns={0: "uuid"})
In [ ]:
uuids_to_keep.to_csv("../../uuids_to_keep_06_28_2017.tsv", sep="\t", index=False)
uuids_to_delete.to_csv("../../uuids_to_delete_06_28_2017.tsv", sep="\t", index=False)
In [ ]:
cc_dict = cc.reset_index().to_dict(orient='records')
In [ ]:
cc_dict = action_proposed_flat.to_dict(orient='records')
In [ ]:
# Ugly code to create nested dictionary
accession_grouped = defaultdict(list)
for record in cc_dict:
accession_grouped[record['accession']].append(record)
In [ ]:
accession_type_grouped = defaultdict(list)
for k, v in accession_grouped.items():
type_grouped = defaultdict(list)
for record in v:
type_grouped[record['@type']].append(record)
accession_type_grouped[k].append(type_grouped)
In [ ]:
accession_type_processing_grouped = defaultdict(list)
for k, v in accession_type_grouped.items():
for x in v:
type_processing_grouped = defaultdict(list)
for i, y in x.items():
processing_stage = defaultdict(list)
for t in y:
processing_stage[t['processing_stage']].append(t)
type_processing_grouped[i].append(processing_stage)
accession_type_processing_grouped[k].append(type_processing_grouped)
#accession_type_processing_grouped
In [ ]:
accession_type_processing_attachment_grouped = defaultdict(list)
for k, v in accession_type_processing_grouped.items():
for x in v:
type_processing_attachment_grouped = defaultdict(list)
for o, q in x.items():
for b in q:
processing_attachment_grouped = defaultdict(list)
for i, y in b.items():
attachment_field = defaultdict(list)
for t in y:
attachment_field[t['attachment_field']].append(t)
processing_attachment_grouped[i].append(attachment_field)
type_processing_attachment_grouped[o].append(processing_attachment_grouped)
accession_type_processing_attachment_grouped[k].append(type_processing_attachment_grouped)
#accession_type_processing_attachment_grouped
In [ ]:
In [ ]:
both
In [ ]:
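# Walk the accession -> @type -> processing_stage -> attachment_field grouping and,
# for each group of otherwise duplicate metrics, compare attachment filenames and
# md5sums; groups whose md5sums differ are printed and their accessions collected
# in accession_list.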
duplicate_but_different_info = []
accession_list = []
for k, v in accession_type_processing_attachment_grouped.items():
for x in v:
for i, y in x.items():
for t in y:
for w, r in t.items():
for u in r:
for p, d in u.items():
expected_number = d[0]['expected_number']
attachment_name_set = set()
attachment_md5_set = set()
metric_info = []
for z in d:
attachment_name_set.add(z['attachment_name'])
attachment_md5_set.add(z['attachment_md5'])
metric_info.append((z['uuid'],
z['attachment_name'],
z['attachment_md5'],
z['qc_status'],
z['file_status']))
if (len(attachment_md5_set) > 1):
print("MD5sums do not match.")
print("Kind:", end=' ')
accession_list.append(k)
print(k, i, w, p)
print("Number duplicates: {}".format(len(d)))
print("Items in group:")
for h, e in enumerate(metric_info):
print(h, end=": ")
print(e)
print("Filename set:")
print(attachment_name_set)
print("Md5sum set:")
print(attachment_md5_set)
print()
In [ ]:
len(action_proposed_flat.accession.unique())
In [ ]:
# Different MD5sums
len(accession_list)
In [ ]:
# Different surface values
len(duplicate_but_different.accession.unique())
In [ ]:
# Choose the uuid to keep based on the date and whether an attachment exists.
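# NOTE: `dd` is not created anywhere in this notebook; it is assumed to hold the
# duplicate-but-different metrics (with attachment details) from an earlier session.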
uuids_attachments = dd.reset_index().sort_values(['accession',
'@type',
'date_qc_created']).reset_index(drop=True)
uuids_to_keep = []
for accession in uuids_attachments.accession.unique():
# Pull DataFrame records for each accession.
selected_rows = uuids_attachments[uuids_attachments.accession == accession]
metric_types = selected_rows['@type'].unique()
for metric_type in metric_types:
processing_stages = selected_rows[selected_rows['@type'] == metric_type]\
['processing_stage'].unique()
uuid_set = set()
for processing_stage in processing_stages:
uuid_group = selected_rows[(selected_rows['@type'] == metric_type)
& (selected_rows['processing_stage'] == processing_stage)]\
.sort_values('date_qc_created')[['uuid',
'date_qc_created',
'qc_status']]\
.reset_index(drop=True)\
.to_records()
uuid_set.add(uuid_group[-1][1])
if len(uuid_set) == 1:
uuids_to_keep.append(list(uuid_set)[0])
else:
raise ValueError('Different attachment fields between metrics.'
' Conflicting uuids: {}'.format(uuid_set))
In [ ]:
dduuids_to_keep = pd.DataFrame(uuids_to_keep)
In [ ]:
dduuids_to_keep['status'] = 'released'
In [ ]:
dduuids_to_keep = dduuids_to_keep.rename(columns={0: 'uuid'})
In [ ]:
dd = dd.reset_index().merge(dduuids_to_keep, how='left', on='uuid').fillna('deleted')
In [ ]:
# Remove the ones Seth fixed and remove shRNA RNA-seq.
dd = dd[~(dd.dataset.isin(['/experiments/ENCSR000EGT/', '/experiments/ENCSR000EGU/']))
& ~(dd.assay_term_name=="shRNA knockdown followed by RNA-seq")].reset_index(drop=True)
In [ ]:
dd.groupby(['assay_term_name',
'dataset',
'accession',
'output_type',
'@type',
'processing_stage',
'expected_number',
'actual_number',
'qc_status',
'status',
'uuid',
'date_qc_created',
'assembly',
'analysis_name',
'file_format',
'date_file_created',
'file_status']).count()
In [ ]:
dd_uuids_to_keep = pd.DataFrame(dd[dd.status=='released'].uuid.unique())
dd_uuids_to_keep['status'] = 'released'
dd_uuids_to_keep = dd_uuids_to_keep.rename(columns={0: 'uuid'})
In [ ]:
dd_uuids_to_delete = pd.DataFrame(dd[dd.status=='deleted'].uuid.unique())
dd_uuids_to_delete['status'] = 'deleted'
dd_uuids_to_delete = dd_uuids_to_delete.rename(columns={0: 'uuid'})
In [ ]:
dd_uuids_to_keep.to_csv("dd_uuids_to_keep_06_13_2017.tsv", sep="\t", index=False)
In [ ]:
dd_uuids_to_delete.to_csv("dd_uuids_to_delete_06_13_2017.tsv", sep="\t", index=False)
In [ ]:
exp_values = action_proposed_flat.dataset.unique()
for i in range(0, len(exp_values), 10):
batch = exp_values[i:i+10]
data = action_proposed_flat[action_proposed_flat.dataset.isin(batch)]\
.groupby(['dataset',
'action',
'uuid']).count()[[]].reset_index()
data_to_keep = data[data.action=="keep"]
data_to_delete = data[data.action=="delete"]
uuids_to_keep = pd.DataFrame(data_to_keep.uuid.unique())
uuids_to_delete = pd.DataFrame(data_to_delete.uuid.unique())
uuids_to_keep['status'] = "released"
uuids_to_delete['status'] = "deleted"
uuids_to_keep = uuids_to_keep.rename(columns={0: "uuid"})
uuids_to_delete = uuids_to_delete.rename(columns={0: "uuid"})
print(i)
print(uuids_to_keep.shape)
print(uuids_to_delete.shape)
print(batch)
# uuids_to_keep.to_csv("patch_batch/uuids_to_keep_06_13_2017_batch_{}.tsv".format(i),
# sep="\t",
# index=False)
# uuids_to_delete.to_csv("patch_batch/uuids_to_delete_06_13_2017_batch_{}.tsv".format(i),
# sep="\t",
# index=False)
In [ ]:
td.dtypes
In [ ]:
consensus_uuids
In [ ]:
td["date_qc_created"] = td["date_qc_created"].apply(lambda x: pd.to_datetime(x))
action_all = td
action_all['action'] = 'keep'
action_all['audit'] = 'unique'
action_all = action_all.sort_values("assay_term_name", ascending=False).reset_index(drop=True)
In [ ]:
action_proposed_flat = action_proposed.reset_index()
apf = action_proposed_flat[['uuid', 'action', 'actual_number']].reset_index(drop=True)
apf['audit'] = 'duplicate'
action_all = td.merge(apf, how='left', on=['uuid'])
action_all['action'] = action_all['action'].fillna('keep')
action_all['audit'] = action_all['audit'].fillna('unique')
action_all['actual_number'] = action_all['actual_number'].fillna(1)
aad = action_all[action_all.audit=="duplicate"].reset_index(drop=True)
aau = action_all[action_all.audit=="unique"].reset_index(drop=True)
action_all = action_all.sort_values("assay_term_name", ascending=False).reset_index(drop=True)
In [ ]:
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set(style="ticks", font="Lato")
import datetime
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[12, 15])
sns.stripplot(x="date_qc_created",
y="assay_term_name",
hue="audit",
data=action_all,
size=15,
jitter=True,
split=True,
palette=sns.color_palette(["black",
"#de2d26"]),
alpha=0.3)
for x in fig.gca().findobj(mpl.collections.PathCollection):
edge_color = x.get_facecolor()
x.set_facecolor("white")
x.set_edgecolor(edge_color)
x.set_linewidth(1.5)
x.set_snap(False)
x.set_antialiased(True)
sns.despine(left=True,bottom=True)
fig.gca().xaxis.grid(True, ls='--')
fig.gca().yaxis.grid(True, ls="--")
bottom, top = fig.gca().get_ylim()
for x in np.arange(top, bottom + 1, 1):
fig.gca().axhline(x,
color="black",
linewidth=1)
fig.gca().set_ylim([top - 0.1, bottom + 0.1])
box = fig.gca().get_position()
fig.gca().set_position([box.x0,
box.y0,
box.width * 0.85,
box.height])
plt.legend(loc="upper left",
bbox_to_anchor=(1, 0.5),
frameon=True,
prop={'size':'x-large'})
plt.title('Quality Metrics by Assay and Date (After Patching)',
size=16,
fontweight='bold');
import matplotlib.dates as mdates
fig.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
In [ ]:
action_all[action_all.audit=='duplicate']
In [ ]:
# Order data by date_qc_created and
# convert the field to string for visualization.
td = td.sort_values("date_qc_created").reset_index(drop=True)
td["date_qc_created"] = td["date_qc_created"].astype("str")
In [ ]:
action_proposed_flat = action_proposed.reset_index()
apf = action_proposed_flat[['uuid', 'action', 'actual_number']].reset_index(drop=True)
apf['audit'] = 'duplicate'
action_all = td.merge(apf, how='left', on=['uuid'])
action_all['action'] = action_all['action'].fillna('keep')
action_all['audit'] = action_all['audit'].fillna('unique')
action_all['actual_number'] = action_all['actual_number'].fillna(1)
In [ ]:
aad = action_all[action_all.audit=="duplicate"].reset_index(drop=True)
aau = action_all[action_all.audit=="unique"].reset_index(drop=True)
In [ ]:
action_all = action_all.sort_values("assay_term_name", ascending=False).reset_index(drop=True)
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set(style="ticks", font="Lato")
import datetime
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[12, 15])
sns.stripplot(x="date_qc_created",
y="assay_term_name",
hue="audit",
data=action_all,
size=15,
jitter=True,
split=True,
palette=sns.color_palette(["black",
"#de2d26"]),
alpha=0.3)
for x in fig.gca().findobj(mpl.collections.PathCollection):
edge_color = x.get_facecolor()
x.set_facecolor("white")
x.set_edgecolor(edge_color)
x.set_linewidth(1.5)
x.set_snap(False)
x.set_antialiased(True)
sns.despine(left=True,bottom=True)
fig.gca().xaxis.grid(True, ls='--')
fig.gca().yaxis.grid(True, ls="--")
bottom, top = fig.gca().get_ylim()
for x in np.arange(top, bottom + 1, 1):
fig.gca().axhline(x,
color="black",
linewidth=1)
fig.gca().set_ylim([top - 0.1, bottom + 0.1])
box = fig.gca().get_position()
fig.gca().set_position([box.x0,
box.y0,
box.width * 0.85,
box.height])
plt.legend(loc="upper left",
bbox_to_anchor=(1, 0.5),
frameon=True,
prop={'size':'x-large'})
plt.title('Quality Metrics by Assay and Date (Before Patching)',
size=16,
fontweight='bold');
import matplotlib.dates as mdates
fig.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
In [ ]:
In [ ]:
action_all[action_all['assay_term_name'] == "ATAC-seq"].audit.value_counts()
In [ ]:
action_all
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set(style="ticks")
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[10, 8])
sns.stripplot(x="actual_number",
y="assay_term_name",
hue="audit",
data=action_all,
size=15,
#edgecolor="black",
palette=sns.color_palette(["gray",
"#de2d26",
#"#95a5a6",
#"#e41a1c",
#"#a6cee3",
"black",
"#e74c3c",
"#34495e",
"#3498db",
"#2ecc71"]),
jitter=True,
alpha=0.9)
fig.gca().set_xticks([x for x in sorted(action_all.actual_number.unique())])
for x in fig.gca().findobj(mpl.collections.PathCollection):
edge_color = x.get_facecolor()
x.set_facecolor("None")
x.set_edgecolor(edge_color)
x.set_linewidth(0.15)
x.set_snap(False)
x.set_antialiased(True)
for x in fig.gca().legend_.__dict__['legendHandles']:
x.set_linewidth(1)
x.set_alpha(1)
sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set(style="ticks")
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[10, 8])
sns.stripplot(x="metric_count",
y="file_status",
hue="qc_status",
order=['released',
'in progress',
'archived',
'replaced',
'revoked',
'deleted',
'content error'],
data=td,
size=15,
#edgecolor="black",
palette=sns.color_palette(["black",
"#de2d26",
#"#95a5a6",
#"#e41a1c",
#"#a6cee3",
"black",
"#e74c3c",
"#34495e",
"#3498db",
"#2ecc71"]),
jitter=True,
alpha=0.3)
for x in fig.gca().findobj(mpl.collections.PathCollection):
edge_color = x.get_facecolor()
x.set_facecolor("None")
x.set_edgecolor(edge_color)
x.set_linewidth(0.05)
x.set_snap(False)
x.set_antialiased(True)
for x in fig.gca().legend_.__dict__['legendHandles']:
x.set_linewidth(1)
x.set_alpha(1)
sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
if td["date_qc_created"].dtype == "<M8[ns]":
pass
with sns.plotting_context("notebook", font_scale=1.4):
fig = plt.figure(figsize=[7, 7])
sns.stripplot(x="metric_count",
y="date_qc_created",
data=td,
size=4,
edgecolor="black",
jitter=True)
for x in fig.gca().findobj(mpl.collections.PathCollection):
x.set_facecolor("white")
x.set_edgecolor("black")
x.set_linewidth(1.5)
x.set_snap(False)
x.set_antialiased(True)
sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)
fig.gca().tick_params(labelbottom='on',
labeltop='on')
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[10, 8])
sns.stripplot(x="metric_count",
y="assay_term_name",
data=td,
size=15,
edgecolor="black",
jitter=True)
for x in fig.gca().findobj(mpl.collections.PathCollection):
x.set_facecolor("white")
x.set_edgecolor("black")
x.set_linewidth(1.5)
x.set_snap(False)
x.set_antialiased(True)
sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[10, 8])
sns.stripplot(x="metric_count",
y="@type",
data=td,
size=15,
edgecolor="black",
jitter=True)
for x in fig.gca().findobj(mpl.collections.PathCollection):
x.set_facecolor("white")
x.set_edgecolor("black")
x.set_linewidth(1.5)
x.set_snap(False)
x.set_antialiased(True)
sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[10, 8])
sns.stripplot(x="metric_count",
y="output_type",
data=td,
size=15,
edgecolor="black",
jitter=True)
for x in fig.gca().findobj(mpl.collections.PathCollection):
x.set_facecolor("white")
x.set_edgecolor("black")
x.set_linewidth(1.5)
x.set_snap(False)
x.set_antialiased(True)
sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[14, 8])
sns.boxplot(x="file_format",
y="metric_count",
data=td,
color="black")
for i, y in enumerate(fig.gca().artists):
y.set_facecolor("None")
#y.set_edgecolor("black")
#for line in fig.gca().lines:
#line.set_color("red")
#line.set_mfc("red")
#line.set_mec("red")
#sns.despine(ax=fig.gca())
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
with sns.plotting_context("notebook", font_scale=1):
fig = plt.figure(figsize=[14, 8])
sns.boxplot(x="assembly",
y="metric_count",
data=td,
color="black")
for i, y in enumerate(fig.gca().artists):
y.set_facecolor("None")
#y.set_edgecolor("black")
#for line in fig.gca().lines:
#line.set_color("red")
#line.set_mfc("red")
#line.set_mec("red")
#sns.despine(ax=fig.gca())
#fig.get_dpi()
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[14, 8])
sns.boxplot(x="file_status",
y="metric_count",
data=td,
color="black")
for i, y in enumerate(fig.gca().artists):
y.set_facecolor("None")
#y.set_edgecolor("black")
#for line in fig.gca().lines:
#line.set_color("red")
#line.set_mfc("red")
#line.set_mec("red")
#sns.despine(ax=fig.gca())
In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
with sns.plotting_context("notebook", font_scale=1.5):
fig = plt.figure(figsize=[14, 8])
sns.boxplot(x="qc_status",
y="metric_count",
data=td,
color="black")
for i, y in enumerate(fig.gca().artists):
y.set_facecolor("None")
#y.set_edgecolor("black")
#for line in fig.gca().lines:
#line.set_color("red")
#line.set_mfc("red")
#line.set_mec("red")
#sns.despine(ax=fig.gca())
In [ ]:
df = pd.read_excel('replaced_items_no_redirect_06_12_2017.xlsx')
In [ ]:
df.columns
In [ ]:
df[df.accession=='ENCFF008MOV']
In [ ]:
df[(df.content_md5sum_old == df.content_md5sum_new)
& (df.content_md5sum_old == 'not_available')].shape
In [ ]: