In [ ]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns
from ast import literal_eval
from collections import defaultdict
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50
# `IPython.core.display` is deprecated as an import location for these names
# (since IPython 7.14); the public API lives in `IPython.display`.
from IPython.display import display, HTML
# Widen the notebook container so wide DataFrames render without scrolling.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [ ]:
os.getcwd()

Set up authentication for requests


In [ ]:
# Copied from pyencoded-tools/encodedcc.py to avoid dependency.
class ENC_Key:
    """Credentials and server URL for an ENCODE portal, loaded from a
    keypairs JSON file (or an equivalent already-parsed mapping).

    Parameters
    ----------
    keyfile : str or dict
        Path to a keypairs .json file, or a dict with the same structure:
        {keyname: {'key': ..., 'secret': ..., 'server': ...}}.
    keyname : str
        Which entry in the keypairs mapping to use (e.g. 'prod').
    """
    def __init__(self, keyfile, keyname):
        if os.path.isfile(str(keyfile)):
            # Context manager guarantees the handle is closed even on error.
            with open(keyfile, 'r') as keys_f:
                keys = json.load(keys_f)
        else:
            # Not a file on disk: assume an already-parsed keypairs mapping.
            keys = keyfile
        key_dict = keys[keyname]
        self.authid = key_dict['key']
        self.authpw = key_dict['secret']
        self.server = key_dict['server']
        # Normalize so URL concatenations like self.server + 'search/' work.
        if not self.server.endswith("/"):
            self.server += "/"

            
class ENC_Connection(object):
    """Bundle of request settings (server URL, basic-auth credentials, and
    JSON content/accept headers) derived from an ENC_Key."""
    def __init__(self, key):
        self.server = key.server
        self.auth = (key.authid, key.authpw)
        self.headers = {'content-type': 'application/json',
                        'accept': 'application/json'}

Define class to extract data from JSON


In [ ]:
class ExtractedQCData(object):
    """Flattened view of one ENCODE File search result: file-level
    metadata plus a list of dicts summarizing each attached
    quality_metrics object."""

    def __init__(self, file):
        # File-level metadata copied straight off the search result.
        self.accession = file['accession']
        self.file_status = file['status']
        self.file_format = file['file_format']
        self.assembly = file['assembly']
        self.dataset = file['dataset']
        self.date_file_created = file['date_created']
        self.output_type = file['output_type']
        try:
            self.analysis_name = file['analysis_step_version']['analysis_step']['name']
        except KeyError:
            # Not every file has an analysis_step_version.
            self.analysis_name = np.nan
        self.quality_metrics = [self._extract_metric(file, metric)
                                for metric in file['quality_metrics']]
        self.metric_count = len(self.quality_metrics)

    @staticmethod
    def _extract_metric(file, metric):
        """Pull the fields of interest from one quality_metrics object,
        falling back to file-level data where the metric lacks them."""
        try:
            assay_term_name = metric['assay_term_name']
        except KeyError:
            # Fall back to the first pipeline attached to the analysis step.
            try:
                assay_term_name = file['analysis_step_version']['analysis_step']\
                                      ['pipelines'][0]['assay_term_name']
            except (KeyError, IndexError):
                assay_term_name = np.nan
        return {
            'uuid': metric['uuid'],
            '@type': metric['@type'][0],
            'qc_status': metric['status'],
            'quality_metric_of': list(metric['quality_metric_of']),
            'date_qc_created': metric['date_created'],
            'assay_term_name': assay_term_name,
            # Only SamtoolsFlagstatsQualityMetric objects carry processing_stage.
            'processing_stage': metric.get('processing_stage', 'unknown'),
            }

Pull all Files with quality_metrics


In [ ]:
def get_data(url, output_json='qc_metrics.json', auth=False, force_update=False):
    """Fetch File search results from ENCODE and cache them in output_json;
    if that file already exists, load the cached copy instead. Set
    force_update=True to refetch. WARNING: the cache is NOT refreshed
    automatically when the search URL changes — use force_update=True.
    Pass auth=(key.authid, key.authpw) to see non-public objects.
    Returns a pandas DataFrame."""
    if not output_json.endswith('.json'):
        raise ValueError("Output file must be a .json")
    use_cache = os.path.isfile(output_json) and not force_update
    if use_cache:
        return pd.read_json(output_json)
    response = requests.get(url, auth=auth)
    extracted = [ExtractedQCData(f) for f in response.json()['@graph']]
    data = pd.DataFrame([vars(e) for e in extracted])
    data.to_json(output_json)
    return data

In [ ]:
def tidy_data(data):
    """Explode the quality_metrics column into a tidy DataFrame with one
    row per quality metric. Since several quality_metrics can refer to
    the same file, accession and the other file-level columns are
    repeated on every metric row. Date columns are converted to
    datetime dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Output of get_data(); must have a 'quality_metrics' column of
        lists of dicts plus file-level metadata columns.
    """
    records = data.to_dict(orient='records')
    # Every column except quality_metrics is file-level metadata to carry over.
    meta_columns = [col for col in records[0].keys() if col != 'quality_metrics']
    # pd.json_normalize is the public API; pd.io.json.json_normalize was
    # deprecated in pandas 1.0 and removed in 2.0.
    tidy = pd.json_normalize(records, 'quality_metrics', meta_columns)
    # Convert date information to datetime type.
    date_columns = ['date_qc_created', 'date_file_created']
    tidy[date_columns] = tidy[date_columns].apply(pd.to_datetime)
    return tidy

In [ ]:
# Load portal credentials (only needed to see non-public objects).
# NOTE(review): hardcodes the 'prod' entry of ~/keypairs.json.
key = ENC_Key(os.path.expanduser("~/keypairs.json"), 'prod')
# Search for every File that has at least one quality_metrics object;
# %2A is a URL-encoded '*', i.e. "field is present".
url = 'https://www.encodeproject.org/'\
            'search/?type=File&quality_metrics.uuid=%2A'\
            '&format=json&limit=all'
        
# Append '&frame=object' to the query if object-frame JSON is desired.

In [ ]:
data = get_data(url, auth=(key.authid, key.authpw), force_update=True)

In [ ]:
data.shape

In [ ]:
td = tidy_data(data)
#td['date_qc_created'] = td['date_qc_created'].apply(lambda x: x.strftime('%m-%d-%Y'))
#td['date_qc_created'] = td['date_qc_created'].apply(pd.to_datetime)

In [ ]:
td.shape

In [ ]:
td = td.fillna('unknown')

Value counts for main columns


In [ ]:
td.assay_term_name.value_counts()

In [ ]:
td.analysis_name.value_counts()

In [ ]:
td['@type'].value_counts()

In [ ]:
td.file_format.value_counts()

In [ ]:
data.file_format.value_counts()

In [ ]:
td.processing_stage.value_counts()

In [ ]:
processing_stage_data = td[['accession', 'uuid', 'processing_stage']]

Pull all files that should have quality_metrics but don't (optional)


In [ ]:
# NOTE(review): expected_file_format and expected_output_type are defined in a
# LATER cell of this notebook, so this cell fails under Restart & Run All
# unless the list-definition cell is executed first.
file_format_url = '&file_format='.join([x for x in expected_file_format])
output_type_url = '&output_type='.join([x.replace(' ', '+') for x in expected_output_type])
# quality_metrics.uuid!=* restricts the search to Files with NO quality_metrics.
url = 'https://www.encodeproject.org/search/?type=File'\
      '&file_format={}&output_type={}&quality_metrics.uuid!=*'\
      '&format=json&limit=all'.format(file_format_url,
                                      output_type_url)
url

In [ ]:
# Fetch Files that lack quality_metrics and keep only those produced by an
# analysis step expected to emit QC.
# NOTE(review): uses `url` from the previous cell and expected_analysis_name,
# which is defined in a later cell — run order matters.
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
result_list = []
for file in search_results:
    # Analysis step name; files without an analysis_step_version get 'unknown'.
    try:
        analysis_name = file['analysis_step_version']['analysis_step']['name']
    except KeyError:
        analysis_name = 'unknown'
    # Assay comes from the first pipeline on the analysis step when present.
    try:
        assay_term_name = file['analysis_step_version']['analysis_step']['pipelines'][0]['assay_term_name']
    except (KeyError, IndexError):
        assay_term_name = 'unknown'
    # Some files carry no assembly property.
    try:
        assembly = file['assembly']
    except KeyError:
        assembly = 'unknown'
    extracted_results = {
        'accession': file['accession'],
        'assay_term_name': assay_term_name,
        'lab': file['lab']['title'],
        'file_format': file['file_format'],
        'status': file['status'],
        'assembly': assembly,
        'dataset': file['dataset'],
        'date_file_created': file['date_created'],
        'output_type': file['output_type'],
        'analysis_name': analysis_name
    }
    # Only keep files whose analysis step is expected to produce QC metrics.
    if extracted_results['analysis_name'] in expected_analysis_name:
        result_list.append(extracted_results)

In [ ]:
len(result_list)

In [ ]:
no_qc = pd.DataFrame(result_list)

In [ ]:
no_qc[no_qc.lab == 'ENCODE Processing Pipeline'].drop(['accession',
                                                       'dataset',
                                                       'date_file_created'],
                                                        axis=1).apply(pd.Series.value_counts)\
                                                        .sort_values([x for x in ['analysis_name',
                                                                                  'assembly',
                                                                                  'lab',
                                                                                  'output_type',
                                                                                  'status']], ascending=False)\
                                                                                  .T.stack().astype(int)

In [ ]:
no_qc

In [ ]:
# NOTE(review): these three lists are consumed by the "files without
# quality_metrics" search cells ABOVE — move this cell before them for a
# clean Restart & Run All.
# Files with expected file_format of:
expected_file_format = ['bam',
                        'bed',
                        'bigBed',
                        'tsv',
                        'bigWig']

# Files with expected output_type of:
expected_output_type = ['unfiltered alignments', 
                        'conservative idr thresholded peaks',
                        'hotspots',
                        'optimal idr thresholded peaks',
                        'alignments',
                        'transcriptome alignments',
                        'gene quantifications',
                        'transcription start sites',
                        'signal of unique reads',
                        'peaks',
                        'pseudoreplicated idr thresholded peaks',
                        'methylation state at CHG',
                        'methylation state at CHH',
                        'signal',
                        'methylation state at CpG',
                        'filtered peaks']

# Files with expected analysis_name of:
expected_analysis_name = ['bwa-alignment-step-v-1',
                          'tf-idr-step',
                          'dnase-call-hotspots-v-1',
                          'tf-idr-peaks-to-bigbed-step',
                          'kundaje-lab-atac-seq-trim-align-filter-single-rep-v1',
                          'kundaje-lab-atac-seq-idr-v1', 'dnase-align-bwa-se-v-1',
                          'dnase-filter-se-v-1', 'lrna-pe-star-alignment-step-v-1',
                          'lrna-pe-rsem-quantification-v-1', 'rampage-idr-step-v-1',
                          'dnase-call-hotspots-alt-v-1', 'lrna-pe-tophat-alignment-step-v-1',
                          'ggr_tr1_rna_seq_trimming_mapping_step',
                          'rampage-grit-peak-calling-step-v-1',
                          'kundaje-lab-atac-seq-trim-align-filter-v1',
                          'modern-chip-seq-peaks-to-bigbed-step-v-1',
                          'modern-bwa-alignment-step-v-1',
                          'modern-chip-seq-optimal-idr-step-v-1',
                          'lrna-se-star-alignment-step-v-1',
                          'dme-align-pe-v-1',
                          'rampage-pe-alignment-step-v-1',
                          'bwa-raw-alignment-step-v-1',
                          'lrna-se-rsem-quantification-step-v-1',
                          'dnase-align-bwa-pe-v-1',
                          'dnase-filter-pe-v-1',
                          'small-rna-se-star-alignment-step-v-2',
                          'tf-unreplicated-idr-step',
                          'ggr_tr1_chip_seq_mapping_filtering_step',
                          'dme-extract-se-v-2',
                          'mott-trim-align-bismark-v-1-0',
                          'kundaje-lab-atac-seq-unreplicated-idr-single-rep-v1',
                          'dme-extract-pe-v-1',
                          'tf-unreplicated-idr-peaks-to-bigbed-step',
                          'uknown',  # NOTE(review): presumably a typo for 'unknown' — confirm against portal values
                          'modern-chip-seq-filter-for-optimal-idr-peaks-step-v-1',
                          'lrna-se-tophat-alignment-step-v-1',
                          'modern-chip-seq-optimal-idr-thresholded-peaks-to-bigbed-step-v-1',
                          'kundaje-lab-atac-seq-peak-call-single-rep-v1']

In [ ]:
no_qc_grouped = no_qc.groupby(['assay_term_name',
                               'file_format',
                               'analysis_name',
                               'output_type',
                               'accession']).count()

In [ ]:
no_qc_unstacked = no_qc_grouped[['status']].unstack()

In [ ]:
no_qc_unstacked_transformed = no_qc_unstacked.T

In [ ]:
no_qc_unstacked_summary = no_qc_unstacked_transformed.describe()

In [ ]:
no_qc_unstacked_summary_transformed = no_qc_unstacked_summary.T

In [ ]:
no_qc_dict = no_qc_unstacked_summary_transformed[['mean']].to_dict()

In [ ]:
for i in no_qc_dict['mean'].keys():
    print(i)

In [ ]:
for i in summary_expected_dict.keys():
    print(i)

Calculate expected number of quality_metrics for specified type


In [ ]:
qc_metric_requirements = pd.read_csv("qc_metric_requirements.csv")

In [ ]:
grouped_data = td.groupby(['assay_term_name',
                           'file_format',
                           'analysis_name',
                           'output_type',
                           '@type',       
                           'processing_stage',
                           'accession']).count()

In [ ]:
uuid_grouped_data = grouped_data[['uuid']]

In [ ]:
unstacked_uuid_grouped_data = uuid_grouped_data.unstack()

In [ ]:
unstacked_uuid_grouped_data_transformed = unstacked_uuid_grouped_data.T

In [ ]:
summary_unstacked_uuid_grouped_data_transformed = unstacked_uuid_grouped_data_transformed.describe()

In [ ]:
summary_transformed = summary_unstacked_uuid_grouped_data_transformed.T

In [ ]:
summary_transformed['mean'] = summary_transformed['mean'].apply(lambda x: round(x))

In [ ]:
summary_transformed_selected = summary_transformed[['count', 'min', 'mean', 'max']]

In [ ]:
summary_expected_dict = summary_transformed_selected[['mean']].to_dict()['mean']

In [ ]:
required_number = []
for i, y in summary_expected_dict.items():
    qc_metric_details = {'assay_term_name': i[0],
                          'file_format': i[1],
                          'analysis_name': i[2],
                          'output_type': i[3],
                          '@type': i[4],
                          'processing_stage': i[5],
                          'count': y}
    required_number.append(qc_metric_details)
required_number

In [ ]:
qc_metric_requirements = pd.DataFrame(required_number)[['assay_term_name',
                                                        'file_format',
                                                        'analysis_name',
                                                        'output_type',
                                                        '@type',
                                                        'processing_stage',
                                                        'count']]

In [ ]:
#qc_metric_requirements = pd.read_csv("qc_metric_requirements.csv")
#qc_metric_requirements.to_csv("qc_metric_requirements.csv")

In [ ]:
qc_metric_requirements.groupby(['assay_term_name',
                               'file_format',
                               'analysis_name',
                               'output_type',
                               '@type',
                               'processing_stage',
                               'count']).count()

In [ ]:
for i in summary_expected_dict.keys():
    if 'rampage-pe-alignment-step-v-1' in i:
        print(i)
        print()

In [ ]:
for i, y in summary_expected_dict.items():
    if y > 1:
        print(i)

In [ ]:
assert summary_expected_dict[('CAGE',
                              'bam',
                              'rampage-pe-alignment-step-v-1',
                              'alignments',
                              'SamtoolsFlagstatsQualityMetric',
                              'unknown')] == 1
assert summary_expected_dict[('ChIP-seq',
                              'bam',
                              'bwa-alignment-step-v-1',
                              'alignments',
                              'SamtoolsFlagstatsQualityMetric',
                              'filtered')] == 1

In [ ]:
#td['date_qc_created'] = td['date_qc_created'].apply(lambda x: x.strftime('%m-%d-%Y'))

Find files that have unexpected number of quality_metrics


In [ ]:
# Only files that haven't been revoked/deleted etc.
# NOTE(review): the following cell reassigns active_files unconditionally, so
# this status-filtered version is overwritten on a straight top-to-bottom run.
# Also note reset_index() is applied twice (once in the chain, once below),
# which leaves a leftover integer 'index' column — presumably unintended; verify.
active_files = td[~(td.file_status.isin(['revoked',
                                        'deleted',
                                        'replaced',
                                        'content error']))].groupby(['accession',
                                                                    'assay_term_name',
                                                                    'file_format',
                                                                    'analysis_name',
                                                                    'output_type',
                                                                    '@type',
                                                                    'assembly']).count()[['uuid']].reset_index()
active_files = active_files.reset_index()

In [ ]:
# All file statuses
active_files = td.groupby(['accession',
                        'assay_term_name',
                        'file_format',
                        'analysis_name',
                        'output_type',
                        '@type',
                        'assembly',
                        'processing_stage']).count()[['uuid']]#.reset_index()
active_files = active_files.reset_index()

In [ ]:
active_files['mapping_key'] = list(zip(active_files.assay_term_name,
                                        active_files.file_format,
                                        active_files.analysis_name,
                                        active_files.output_type,
                                        active_files['@type'],
                                        active_files.processing_stage))

In [ ]:
active_files['expected_number'] = active_files['mapping_key'].apply(lambda x: summary_expected_dict[x])

In [ ]:
active_files[active_files.uuid != active_files.expected_number]

In [ ]:
active_files_mismatch = active_files[active_files.uuid != active_files.expected_number]\
                                                .rename(columns={'uuid': 'actual_number'})\
                                                [['accession',
                                                  'assay_term_name',
                                                  'analysis_name',
                                                  'assembly',
                                                  'output_type',
                                                  '@type',
                                                  'processing_stage',
                                                  'expected_number',
                                                  'actual_number']].sort_values(['assembly',
                                                                                 'output_type',
                                                                                 '@type',
                                                                                 'actual_number'], ascending=False)

In [ ]:
afm = active_files_mismatch.merge(td, how='left', on=['accession',
                                                '@type',
                                                'assay_term_name',
                                                'analysis_name',
                                                'assembly',
                                                'output_type',
                                                'processing_stage'])
afm

In [ ]:
afm.groupby(['assay_term_name',
              'accession',
              'output_type',
              '@type',
              'processing_stage',
              'expected_number',
              'actual_number',
              'qc_status',
              'uuid',
              'date_qc_created',
              'assembly',
              'analysis_name',
              'file_format',
              'date_file_created',
              'dataset',
              'file_status']).count()

In [ ]:


In [ ]:
#afm.to_excel("duplicate_quality_metrics_06_13_2017.xlsx")

Filter by Experiment/File status


In [ ]:
experiments = td[td.accession.isin(active_files_mismatch['accession'].values)]\
                        [['accession', 'dataset']].drop_duplicates('accession')

In [ ]:
active_files_mismatch['dataset'] = active_files_mismatch['accession']\
    .apply(lambda x: experiments[experiments.accession == x]['dataset'].values[0])

In [ ]:
active_files_mismatch['@type'].value_counts()

In [ ]:
search_ids = "&@id=".join(active_files_mismatch.dataset.unique())
url = 'https://www.encodeproject.org/search/?type=Experiment&limit=all&@id={}'.format(search_ids)
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
search_id_map = {}
for experiment in search_results:
    search_id_map[experiment['@id']] = experiment['status']

In [ ]:
active_files_mismatch['experiment_status'] = active_files_mismatch['dataset'].apply(lambda x: search_id_map[x])
active_files_mismatch = active_files_mismatch.reset_index(drop=True)
active_files_mismatch['dataset'] = active_files_mismatch['dataset'].apply(lambda x: x.replace("/experiments/", "").replace("/", ""))

In [ ]:
active_files_mismatch.shape

In [ ]:
active_files_mismatch[active_files_mismatch.expected_number < active_files_mismatch.actual_number].shape

In [ ]:
released_data = active_files_mismatch[~(active_files_mismatch.experiment_status.isin(['revoked',
                                                                                      'deleted']))]\
                                                                                .groupby(['assay_term_name',
                                                                                          'dataset',
                                                                                          'accession',
                                                                                          'assembly',
                                                                                          'output_type',
                                                                                          '@type',
                                                                                          'expected_number',
                                                                                          'actual_number',
                                                                                          'experiment_status'])\
                                                                                 .count().unstack()

In [ ]:
released_data

Pull files with redundant audit flag from Idan's branch #4800, find consensus list (optional)


In [ ]:
# Pull Idan's audit files
url = 'https://4800-redundant-qc-6f1aa1cb5-idan.demo.encodedcc.org/search/'\
       '?type=File&audit.INTERNAL_ACTION.category=redundant+quality+metric&format=json&limit=all'
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
audit_flag = []
for file in search_results:
    audit_flag.append(file['accession'])

In [ ]:
len(audit_flag)

In [ ]:
af = pd.DataFrame(audit_flag)
af = af.rename(columns={0:'accession'})

In [ ]:
m = '0 ENCFF860EEZ 1 ENCFF806GBQ 2 ENCFF865OKF 3 ENCFF899TCQ 4 ENCFF011AMS 5 ENCFF414WJD 6 ENCFF663HVJ 7 ENCFF998PWW 8 ENCFF243AGG 9 ENCFF258AQQ 10 ENCFF709UWZ 11 ENCFF023KBS 12 ENCFF385ZPU 13 ENCFF619KWP 14 ENCFF767SOM 15 ENCFF948BIJ 16 ENCFF358RZK 17 ENCFF355CBE 18 ENCFF266EKT 19 ENCFF236AJI 20 ENCFF307AIG 21 ENCFF112PJJ 22 ENCFF694GJR 23 ENCFF591KID 24 ENCFF472SIB 25 ENCFF918TKF 26 ENCFF409NLS 27 ENCFF930IKU 28 ENCFF543CGB 29 ENCFF713UFE 30 ENCFF497JBZ 31 ENCFF496OJU 32 ENCFF278OYY 33 ENCFF526CXQ 34 ENCFF311KIP 35 ENCFF779CWH 36 ENCFF373UYA 37 ENCFF315SPJ 38 ENCFF548IHT 39 ENCFF484BSF 40 ENCFF899PSI 41 ENCFF713BDU 42 ENCFF193TFR 43 ENCFF211ZKG 44 ENCFF041ZEP 45 ENCFF220MHX 46 ENCFF077WNR 47 ENCFF283MNG 48 ENCFF896RNG 49 ENCFF970WRP 50 ENCFF659WGE 51 ENCFF281NFW 52 ENCFF491DDU 53 ENCFF137UAY 54 ENCFF896WQY 55 ENCFF950MBW 56 ENCFF900IQG 57 ENCFF384LXN 58 ENCFF865RQN 59 ENCFF071NEW 60 ENCFF476OYZ 61 ENCFF333VCK 62 ENCFF352PYL 63 ENCFF105VHD 64 ENCFF008RRH 65 ENCFF878BEQ 66 ENCFF078QRE 67 ENCFF150LTG 68 ENCFF576YJD 69 ENCFF501KQJ 70 ENCFF764BNL 71 ENCFF278QIG 72 ENCFF988FAL 73 ENCFF901ONA 74 ENCFF951FBF 75 ENCFF344UBE 76 ENCFF218LOB 77 ENCFF792ZKL 78 ENCFF435EIY 79 ENCFF960TDU 80 ENCFF549UEZ 81 ENCFF507RMK 82 ENCFF998KDQ 83 ENCFF312IYF 84 ENCFF811YFQ 85 ENCFF225TLP 86 ENCFF927TJR 87 ENCFF948MDE 88 ENCFF286YMB 89 ENCFF304XON 90 ENCFF375BIG 91 ENCFF365AWU 92 ENCFF093ZFB 93 ENCFF838BUR 94 ENCFF138SMJ 95 ENCFF032YPC 96 ENCFF689YCR 97 ENCFF540SLS 98 ENCFF546ZGF 99 ENCFF263EQX 100 ENCFF276EKT 101 ENCFF425KXW 102 ENCFF455EXR 103 ENCFF161QXX 104 ENCFF532MZI 105 ENCFF766SWF 106 ENCFF904CGH 107 ENCFF977MOV 108 ENCFF214OJW 109 ENCFF424FKL 110 ENCFF156XRJ 111 ENCFF437YIV 112 ENCFF435IEF 113 ENCFF349GID 114 ENCFF835RGX 115 ENCFF086IJA 116 ENCFF203ZIS 117 ENCFF534ULU 118 ENCFF909LKK 119 ENCFF784MKW 120 ENCFF248AGS 121 ENCFF488HDC 122 ENCFF217ISJ 123 ENCFF715NLX 124 ENCFF177LBB 125 ENCFF360MGZ 126 ENCFF801KEW 127 ENCFF884DZN 128 ENCFF566WGL 129 ENCFF374VWZ 130 ENCFF319FBU 131 
ENCFF279MNV 132 ENCFF448YLM 133 ENCFF199LDJ 134 ENCFF323PIB 135 ENCFF148VQH 136 ENCFF485KSO 137 ENCFF362MSS 138 ENCFF562MPS 139 ENCFF505MGI 140 ENCFF731ZYR 141 ENCFF113DVJ 142 ENCFF198BFX 143 ENCFF388HRC 144 ENCFF731PDS 145 ENCFF378LGJ 146 ENCFF395BPD 147 ENCFF958MPB 148 ENCFF017LYR 149 ENCFF222DAU 150 ENCFF138WSJ 151 ENCFF245SEV 152 ENCFF051XIW 153 ENCFF407MVV 154 ENCFF152JZK 155 ENCFF121WOM 156 ENCFF304VVZ 157 ENCFF207ZRI 158 ENCFF743FCW 159 ENCFF015XIL 160 ENCFF531PVU 161 ENCFF257QND 162 ENCFF623CPY 163 ENCFF384SBC 164 ENCFF740RLD 165 ENCFF783LYO 166 ENCFF112YYK 167 ENCFF084FGO 168 ENCFF426JHB 169 ENCFF822NZG 170 ENCFF670GFY 171 ENCFF588PIS 172 ENCFF860JUD 173 ENCFF618GUI 174 ENCFF926JOK 175 ENCFF363HXH 176 ENCFF456PDQ 177 ENCFF056ESE 178 ENCFF728UAB 179 ENCFF808NFI 180 ENCFF256VYP 181 ENCFF843MEU 182 ENCFF549PGC 183 ENCFF585IAJ 184 ENCFF437YPG'

In [ ]:
import re
# Parse the pasted "index accession index accession ..." dump by splitting on
# the integer indices. Raw string r'\d* ' avoids the invalid '\d' escape in a
# plain string literal (SyntaxWarning on Python 3.12+, error in the future).
z = [x.strip() for x in re.split(r'\d* ', m) if x != '']
z = pd.DataFrame(z).rename(columns={0: 'accession'})
af = z

In [ ]:


In [ ]:


In [ ]:
active_files_mismatch = active_files_mismatch.reset_index(drop=True)

In [ ]:
active_files_mismatch.assay_term_name.value_counts()

In [ ]:
afm = active_files_mismatch[active_files_mismatch.expected_number < active_files_mismatch.actual_number]

In [ ]:
afm.shape

In [ ]:
#af[~(af.accession.isin(afm.accession.values))].shape

In [ ]:
#afm[afm.accession.isin(af.accession.values)]

In [ ]:
afm[afm.accession.isin(af.accession.values)]\
                 .sort_values(['assay_term_name', 'assembly'])\
                 .reset_index(drop=True) #.to_excel('redundant_qc_metrics_06_01_17.xlsx')

In [ ]:
af[(af.accession.isin(afm.accession.values))]

In [ ]:
afm[~(afm.accession.isin(af.accession.values))]\
                   .sort_values(['assay_term_name', 'assembly'])\
                   .reset_index(drop=True) #.to_excel("duplicate_but_different_05_31_2017.xlsx")
              
ddbd = afm[~(afm.accession.isin(af.accession.values))]\
                   .sort_values(['assay_term_name', 'assembly'])\
                   .reset_index(drop=True)

In [ ]:
afm.assay_term_name.value_counts()

Check that all quality_metrics are same for each group of uuids in consensus list


In [ ]:
# Only if af has been defined above
both = afm[afm.accession.isin(af.accession.values)]\
                 .sort_values(['assay_term_name', 'assembly'])\
                 .reset_index(drop=True)

In [ ]:


In [ ]:
both = afm.sort_values(['assay_term_name', 'assembly'])\
          .reset_index(drop=True)
both

In [ ]:
len(both.accession.unique())

In [ ]:
uuid = both.groupby(['assay_term_name',
                      'accession',
                      'output_type',
                      '@type',
                      'processing_stage',
                      'expected_number',
                      'actual_number',
                      'qc_status',
                      'uuid',
                      'date_qc_created',
                      'assembly',
                      'analysis_name',
                      'file_format',
                      'date_file_created',
                      'dataset',
                      'file_status']).count()\
            .drop(['quality_metric_of',
                   'metric_count'], axis=1)\
            .sort_index(level=[0, 1, 2, 3, 4, 9])

In [ ]:
# uuid = both.merge(td,
#            how='left',
#            on=['accession',
#                '@type',
#                'assay_term_name',
#                'analysis_name',
#                'output_type',
#                'assembly',
#                'processing_stage'])\
#             .groupby(['assay_term_name',
#                       'accession',
#                       'output_type',
#                       '@type',
#                       'processing_stage',
#                       'expected_number',
#                       'actual_number',
#                       'qc_status',
#                       'uuid',
#                       'date_qc_created',
#                       'assembly',
#                       'analysis_name',
#                       'file_format',
#                       'date_file_created',
#                       'dataset',
#                       'file_status']).count()\
#             .drop(['quality_metric_of',
#                    'metric_count'], axis=1)\
#             .sort_index(level=[0, 1, 2, 3, 4, 9])#.to_excel("uuids_of_duplicate_metrics_06_01_2017.xlsx")
# uuid

In [ ]:
uuid_dict = uuid.reset_index().to_dict(orient='records')

In [ ]:
uuid_dict

In [ ]:
# From Idan's redundant QC audit code.

# Fields ignored when comparing two quality_metrics objects for redundancy:
# identity/provenance fields and attachments that legitimately differ between
# otherwise-identical metrics.
IGNORED_METRIC_FIELDS = (
    'uuid', '@id', 'date_created', 'audit', 'aliases', 'attachment',
    'IDR_plot_true', 'IDR_plot_rep1_pr', 'IDR_plot_rep2_pr',
    'IDR_plot_pool_pr', 'IDR_parameters_true', 'IDR_parameters_rep1_pr',
    'IDR_parameters_rep2_pr', 'IDR_parameters_pool_pr',
    'cross_correlation_plot', 'quality_metric_of', 'schema_version',
    'status', 'step_run', 'submitted_by',
)


def audit_file_redundant_qc_metrics(quality_metrics):
    """Return the set of distinct metric "fingerprints" (string form of an
    order-normalized dict) after stripping identity fields; redundant
    metrics collapse into a single entry.

    Mutates the metric dicts in place (pops the ignored fields). Returns
    None for an empty/falsy input, matching the original audit code.
    """
    if not quality_metrics:
        return None
    metrics_set = set()
    for metric in quality_metrics:
        # Drop fields that differ between otherwise-identical metrics.
        for field in IGNORED_METRIC_FIELDS:
            metric.pop(field, None)
        # set.add is idempotent, so duplicates collapse automatically.
        metrics_set.add(str(ordered_representation(metric)))
    return metrics_set


def ordered_representation(obj):
    """Recursively sort dicts (as (key, value) tuples) and lists so that
    semantically equal objects have identical representations."""
    if isinstance(obj, dict):
        return sorted((k, ordered_representation(v)) for k, v in obj.items())
    if isinstance(obj, list):
        return sorted(ordered_representation(x) for x in obj)
    return obj

Create dictionary grouped by accession, @type, and processing_stage


In [ ]:
# Ugly code to create nested dictionary
# Stage 1: index the flat uuid records by file accession.
accession_grouped = defaultdict(list)
for record in uuid_dict:
    accession_grouped[record['accession']].append(record)

In [ ]:
# Stage 2: within each accession, sub-group records by metric @type.
# Each accession maps to a one-element list holding its type index.
accession_type_grouped = defaultdict(list)
for k, v in accession_grouped.items():
    type_grouped = defaultdict(list)
    for record in v:
        type_grouped[record['@type']].append(record)
    accession_type_grouped[k].append(type_grouped)

In [ ]:
# Stage 3: within each (accession, @type) bucket, sub-group records by
# processing_stage, keeping the same nested one-element-list shape.
accession_type_processing_grouped = defaultdict(list)
for k, v in accession_type_grouped.items():
    for x in v:
        type_processing_grouped = defaultdict(list)
        for i, y in x.items():
            processing_stage = defaultdict(list)
            for t in y:
                processing_stage[t['processing_stage']].append(t)
            type_processing_grouped[i].append(processing_stage)
        accession_type_processing_grouped[k].append(type_processing_grouped)
accession_type_processing_grouped

Grab JSON for each QC object in group, print groups that are different


In [ ]:
# For every (accession, @type, processing_stage) group of suspected
# duplicates, fetch the embedded QC objects in one search and compare
# their content (identifiers/attachments stripped). Groups whose count
# of unique metrics differs from the expected count are
# "duplicate but different" and are printed for manual review.
duplicate_but_different_accessions = []
for k, v in accession_type_processing_grouped.items():
    for x in v:
        for i, y in x.items():
            for t in y:
                for p, d in t.items():
                    expected_number = d[0]['expected_number']
                    uuid_list = []
                    for z in d:
                        uuid_list.append(z['uuid'])
                    uuids = '&uuid='.join(uuid_list)
                    url = 'https://www.encodeproject.org/search/'\
                          '?type=QualityMetric&uuid={}&format=json'\
                          '&frame=embedded&limit=all'.format(uuids)
                    # `key` comes from the ENC_Key auth setup near the
                    # top of the notebook.
                    r = requests.get(url, auth=(key.authid, key.authpw))
                    search_results = r.json()['@graph']
                    unique_metrics = audit_file_redundant_qc_metrics(search_results)
                    if len(unique_metrics) != expected_number:
                        duplicate_but_different_accessions.append((k, i, p))
                        print(k)
                        print(i)
                        print(p)
                        print()
                        print(*unique_metrics, sep="\n\n")
                        print()
                        print("Number of unique metrics: {}".format(len(unique_metrics)))
                        print("Expected number of metrics: {}".format(expected_number))
                        print("\n\n")

In [ ]:
len(duplicate_but_different_accessions)

In [ ]:
duplicate_but_different_accessions

In [ ]:
# duplicate_but_different = afm[(afm.accession.isin([x[0] for x in duplicate_but_different_accessions]))
#    & (afm['@type'].isin([x[1] for x in duplicate_but_different_accessions]))]

In [ ]:
# Select the flagged rows from the full audit frame `afm`.
# NOTE(review): filtering accession and @type with two independent
# isin() calls admits cross-pairs (accession from one flagged tuple
# combined with @type from another) — confirm the over-selection is
# acceptable here.
ddbd2 = afm[(afm.accession.isin([x[0] for x in duplicate_but_different_accessions]))
   & (afm['@type'].isin([x[1] for x in duplicate_but_different_accessions]))]

In [ ]:
ddbd2

In [ ]:
# Combined duplicate-but-different rows for display; the Excel export
# stays disabled. Fix: the original ended the chain with a trailing
# backslash continuation immediately followed by a comment-only line,
# which Python's explicit line joining does not allow.
pd.concat([ddbd, ddbd2])\
    .sort_values('assay_term_name')\
    .reset_index(drop=True)
    #.to_excel("duplicate_but_different_06_01_2017.xlsx")

In [ ]:
# Join the duplicate-but-different rows with the full metric table `td`
# to expose each duplicate's uuid/date/status, then group into a
# hierarchical index for review.
duplicate_but_different\
    .sort_values('assay_term_name')\
    .reset_index(drop=True)\
    .merge(td,
           how='left',
           on=['accession',
               '@type',
               'assay_term_name',
               'analysis_name',
               'output_type',
               'assembly',
               'processing_stage'])\
    .drop(['quality_metric_of',
           'metric_count'], axis=1)\
    .sort_values(['accession',
                  '@type',
                  'date_qc_created'])\
    .reset_index(drop=True)\
    .groupby(['assay_term_name',
              'accession',
              'output_type',
              '@type',
              'processing_stage',
              'expected_number',
              'actual_number',
              'qc_status',
              'uuid',
              'date_qc_created',
              'assembly',
              'analysis_name',
              'file_format',
              'date_file_created',
              'dataset',
              'file_status']).count()\
        .sort_index(level=[0, 1, 2, 3, 4, 9])

In [ ]:
# Same grouping as the previous cell but over the combined ddbd/ddbd2
# frames; the Excel export is left disabled.
# NOTE(review): near-duplicate of the chain above — a shared helper
# would remove the copy-paste.
pd.concat([ddbd, ddbd2])\
    .sort_values('assay_term_name')\
    .reset_index(drop=True)\
    .merge(td,
           how='left',
           on=['accession',
               '@type',
               'assay_term_name',
               'analysis_name',
               'output_type',
               'assembly',
               'processing_stage'])\
    .drop(['quality_metric_of',
           'metric_count'], axis=1)\
    .sort_values(['accession',
                  '@type',
                  'date_qc_created'])\
    .reset_index(drop=True)\
    .groupby(['assay_term_name',
              'accession',
              'output_type',
              '@type',
              'processing_stage',
              'expected_number',
              'actual_number',
              'qc_status',
              'uuid',
              'date_qc_created',
              'assembly',
              'analysis_name',
              'file_format',
              'date_file_created',
              'dataset',
              'file_status']).count()\
        .sort_index(level=[0, 1, 2, 3, 4, 9])#.to_excel("duplicate_but_different_grouped_06_02_2017.xlsx")

Subtract detected duplicate but different rows from consensus list


In [ ]:
len(both.accession.unique())

In [ ]:
ddbd2

In [ ]:
# Consensus duplicates: drop (accession, @type) combinations flagged as
# duplicate-but-different.
# NOTE(review): the isin() filters are applied per-column (cross-pairs
# are excluded too), and the next cell binds `both`, not `consensus` —
# confirm this frame is actually consumed downstream.
consensus = both[~((both.accession.isin(ddbd2.accession.values))\
                 & (both['@type'].isin(ddbd2['@type'].values)))].reset_index(drop=True)

Map Experiment to Lab


In [ ]:
# NOTE(review): this binds the unfiltered `both` frame, not the
# `consensus` frame computed in the previous cell — confirm which was
# intended.
consensus_uuids = both

In [ ]:
search_ids = "&@id=".join(consensus_uuids.dataset.unique())
url = 'https://www.encodeproject.org/search/'\
      '?type=Experiment&limit=all&frame=embedded&@id={}'.format(search_ids)
r = requests.get(url, auth=(key.authid, key.authpw))
search_results = r.json()['@graph']
search_id_map = {}
for experiment in search_results:
    search_id_map[experiment['@id']] = experiment['lab']['name']

In [ ]:
search_id_map

In [ ]:
consensus_uuids['experiment_lab'] = consensus_uuids.dataset.apply(lambda x: search_id_map[x])

In [ ]:
consensus_uuids.drop_duplicates('accession', keep='first').experiment_lab.value_counts()

Map Lab to wrangler


In [ ]:
# Static lab -> responsible wrangler mapping (snapshot as of mid-2017);
# labs missing from this dict fall back to 'unknown' downstream.
lab_wrangler_lookup = {
    "bradley-bernstein": "Cricket",
    "michael-snyder": "Jason",
    "richard-myers": "Carrie",
    "john-stamatoyannopoulos": "Jason",
    "peggy-farnham": "Esther",
    "brenton-graveley": "Kath",
    "bing-ren": "Kath",
    "xiang-dong-fu": "Kath",
    "will-greenleaf": "Jason",
    "kevin-white": "Esther",
    "barbara-wold": "Aditi"
}

In [ ]:
consensus_uuids['wrangler'] = consensus_uuids.experiment_lab\
                                    .apply(lambda x: lab_wrangler_lookup.get(x, 'unknown'))

In [ ]:
consensus_uuids.drop_duplicates('accession', keep='first').wrangler.value_counts()

Pull all file attachment names and md5sums for duplicate metrics


In [ ]:
consensus_uuids['@type'].unique()

In [ ]:
# Possible attachments as defined in schema: maps each QC metric @type
# to the attachment fields its schema allows.
quality_attachments = {
    'IDRQualityMetric': ['IDR_plot_true',
                         'IDR_plot_rep1_pr',
                         'IDR_plot_rep2_pr',
                         'IDR_plot_pool_pr',
                         'IDR_parameters_true',
                         'IDR_parameters_rep1_pr',
                         'IDR_parameters_rep2_pr',
                         'IDR_parameters_pool_pr'],
    'ChipSeqFilterQualityMetric': ['cross_correlation_plot',
                                   'attachment'],
    'SamtoolsFlagstatsQualityMetric': ['attachment'],
    'StarQualityMetric': ['attachment'],
    'GenericQualityMetric': ['attachment'],
    'ComplexityXcorrQualityMetric': ['cross_correlation_plot',
                                     'attachment']
}

In [ ]:
query = '&accession='.join(consensus_uuids.accession.unique())

In [ ]:
url = 'https://www.encodeproject.org/'\
            'search/?type=File&accession={}'\
            '&frame=embedded&format=json&limit=all'.format(query)

In [ ]:
r = requests.get(url, auth=(key.authid, key.authpw))

In [ ]:
results = r.json()['@graph']

In [ ]:
# Collect (uuid, @type, attachment_field, filename, md5) for every
# possible attachment of every metric. The KeyError branch fires when
# the attachment field is absent OR lacks 'download'/'md5sum' subkeys;
# those rows get 'no_value' placeholders so every field is represented.
records = []
error_log = []
for result in results:
    for metric in result['quality_metrics']:
        uuid = metric['uuid']
        metric_type = metric['@type'][0]
        for possible_attachment in quality_attachments[metric_type]:
            try:
                records.append((uuid,
                               metric_type,
                               possible_attachment,
                               metric[possible_attachment]['download'],
                               metric[possible_attachment]['md5sum']))
            except KeyError:
                records.append((uuid,
                               metric_type,
                               possible_attachment,
                               'no_value',
                               'no_value'))
                error_log.append("{} has no attachment of type {}.".format(metric_type,
                                                                           possible_attachment))

In [ ]:
# Build the attachment table and restrict it to metrics that appear in
# the consensus uuid set.
attachment_details = pd.DataFrame(records).rename(columns={0: "uuid",
                                                           1: "@type",
                                                           2: "attachment_field",
                                                           3: "attachment_name",
                                                           4: "attachment_md5"})

attachment_details = attachment_details[(attachment_details.uuid.isin(consensus_uuids.uuid.values))]\
                                                                              .reset_index(drop=True)

In [ ]:
len(attachment_details.uuid.unique())

In [ ]:
len(consensus_uuids.uuid.unique())

In [ ]:
consensus_uuids.shape

In [ ]:
consensus_uuids[consensus_uuids.uuid.isin(attachment_details.uuid.values)].shape

In [ ]:
attachment_details.attachment_field.value_counts()

In [ ]:
consensus_uuids.columns

In [ ]:
attachment_details.columns

In [ ]:
# Join attachment details onto the consensus uuids and group into a
# hierarchical index for review; the Excel export is left disabled.
cc = consensus_uuids.merge(attachment_details,
                      how='left',
                      on=['uuid', '@type']).groupby(['assay_term_name',
                                                     'wrangler',
                                                     'experiment_lab',
                                                     'accession',
                                                     'output_type',
                                                     '@type',
                                                     'processing_stage',
                                                     'expected_number',
                                                     'actual_number',
                                                     'attachment_field',
                                                     'qc_status',
                                                     'uuid',
                                                     'date_qc_created',
                                                     'attachment_name',
                                                     'attachment_md5',
                                                     'assembly',
                                                     'analysis_name',
                                                     'file_format',
                                                     'date_file_created',
                                                     'dataset',
                                                     'file_status']).count()\
                                            .sort_index(level=[0, 1, 2, 3, 4, 5, 6, 9, 12, 13, 14])#.to_excel("duplicate_metrics_no_value_06_05_2017.xlsx")
cc

Pick most recent UUIDs with attachments to keep.


In [ ]:
def most_recent_uuid_with_attachment(uuid_group):
    """Return the uuid of the newest metric in a group that has a real
    attachment md5sum.

    `uuid_group` is a sequence of records shaped like
    (index, uuid, date_qc_created, qc_status, attachment_md5), sorted
    oldest-first. If every record's md5 field is the 'no_value'
    placeholder, fall back to the newest record's uuid.
    """
    chosen = None
    # Walk newest-first; the first record with a real md5 wins.
    for record in reversed(uuid_group):
        if record[4] != 'no_value':
            chosen = record[1]
            break
    # Every attachment was missing: take the newest uuid instead.
    return chosen if chosen else uuid_group[-1][1]

In [ ]:
# Choose uuid to keep based on date and attachment existing.
# For every (accession, @type, processing_stage) bucket, pick one uuid
# per attachment_field via most_recent_uuid_with_attachment(); all
# attachment fields of a bucket must agree on a single uuid, otherwise
# raise for manual inspection.
uuids_attachments = cc.reset_index().sort_values(['accession',
                                                  '@type',
                                                  'date_qc_created']).reset_index(drop=True)
uuids_to_keep = []
for accession in uuids_attachments.accession.unique():
    # Pull DataFrame records for each accession.
    selected_rows = uuids_attachments[uuids_attachments.accession == accession]
    metric_types = selected_rows['@type'].unique()
    for metric_type in metric_types:
        processing_stages = selected_rows[selected_rows['@type'] == metric_type]\
                                                    ['processing_stage'].unique()
        for processing_stage in processing_stages:
            attachment_types = selected_rows[(selected_rows['@type'] == metric_type)
                                             & (selected_rows['processing_stage'] == processing_stage)]\
                                                                           ['attachment_field'].unique()
            uuid_set = set()
            for attachment_type in attachment_types:
                # Date-sorted records shaped (index, uuid, date, status, md5).
                uuid_group = selected_rows[(selected_rows['@type'] == metric_type)
                                           & (selected_rows['processing_stage'] == processing_stage)
                                           & (selected_rows['attachment_field'] == attachment_type)]\
                                          .sort_values('date_qc_created')[['uuid',
                                                                           'date_qc_created',
                                                                           'qc_status',
                                                                           'attachment_md5']]\
                                                                  .reset_index(drop=True)\
                                                                  .to_records()
                uuid_set.add(most_recent_uuid_with_attachment(uuid_group))
            if len(uuid_set) == 1:
                uuids_to_keep.append(list(uuid_set)[0])
            else:
                raise ValueError('Different attachment fields between metrics.'
                                 ' Conflicting uuids: {}'.format(uuid_set))

In [ ]:
# Keep-list as a one-column frame tagged with the proposed action.
# Fix: the original wrapped the result in pd.DataFrame twice and then
# renamed column 0; a single construction with an explicit column name
# is equivalent.
utk = pd.DataFrame(list(set(uuids_to_keep)), columns=['uuid'])
utk['action'] = 'keep'

In [ ]:
# Proposed action.
# Left-merge the keep list onto the consensus metrics: uuids present in
# utk keep action='keep'; everything else is NaN and becomes 'delete'
# via fillna. The Excel export is left disabled.
action_proposed = consensus_uuids.merge(attachment_details,
                                       how='left', on=['uuid','@type'])\
                                .merge(utk,
                                        how="left",
                                        on=['uuid'])\
                                        .fillna("delete")\
                                        .groupby(['assay_term_name',
                                                  'wrangler',
                                                  'experiment_lab',
                                                  'accession',
                                                  'output_type',
                                                  '@type',
                                                  'processing_stage',
                                                  'expected_number',
                                                  'actual_number',
                                                  'attachment_field',
                                                  'qc_status',
                                                  'action',
                                                  'uuid',
                                                  'date_qc_created',
                                                  'attachment_name',
                                                  'attachment_md5',
                                                  'assembly',
                                                  'analysis_name',
                                                  'file_format',
                                                  'date_file_created',
                                                  'dataset',
                                                  'file_status']).count()\
                                    .sort_index(level=[0, 1, 2, 3, 4, 5, 6, 9, 13, 14])
action_proposed#.to_excel('delete_keep_all_pipelines_06_06_2017.xlsx')

Remove specific accessions dealt with by wranglers and create lists of UUIDs to keep and delete


In [ ]:
action_proposed_flat = action_proposed.reset_index()
# NOTE(review): the line below immediately overwrites the frame built
# above with `consensus_uuids` — the first assignment is a dead store;
# confirm which source was intended.
action_proposed_flat = consensus_uuids.reset_index()

In [ ]:
# Remove metrics Esther has to decide.
action_proposed_flat = action_proposed_flat[~(action_proposed_flat.accession.isin(['ENCFF245SEV',
                                                                                   'ENCFF808NFI']))].reset_index(drop=True)

In [ ]:
# Remove metric Seth fixed.
action_proposed_flat = action_proposed_flat[~(action_proposed_flat.accession == 'ENCFF156XRJ')].reset_index(drop=True)

In [ ]:
uuids_to_keep = action_proposed_flat[action_proposed_flat.action == "keep"].uuid.unique()

In [ ]:
uuids_to_keep

In [ ]:
len(uuids_to_keep)

In [ ]:
uuids_to_delete = action_proposed_flat[action_proposed_flat.action == "delete"].uuid.unique()

In [ ]:
len(uuids_to_delete)

In [ ]:
# Make sure no overlap between lists.
set(uuids_to_keep).intersection(set(uuids_to_delete))

In [ ]:
uuids_to_keep = pd.DataFrame(uuids_to_keep)
uuids_to_delete = pd.DataFrame(uuids_to_delete)

In [ ]:
uuids_to_keep['status'] = "released"
uuids_to_delete['status'] = "deleted"
uuids_to_keep = uuids_to_keep.rename(columns={0: "uuid"})
uuids_to_delete = uuids_to_delete.rename(columns={0: "uuid"})

In [ ]:
uuids_to_keep.to_csv("../../uuids_to_keep_06_28_2017.tsv", sep="\t", index=False)
uuids_to_delete.to_csv("../../uuids_to_delete_06_28_2017.tsv", sep="\t", index=False)

In [ ]:


In [ ]:

Group quality_metrics by accession, type, processing_stage, and attachment field name


In [ ]:
cc_dict = cc.reset_index().to_dict(orient='records')

In [ ]:
# NOTE(review): this overwrites the cc-based cc_dict from the previous
# cell — the grouping below therefore runs on action_proposed_flat.
cc_dict = action_proposed_flat.to_dict(orient='records')

In [ ]:
# Ugly code to create nested dictionary
# NOTE(review): this and the next two cells repeat the grouping code
# from earlier in the notebook — a shared helper function would remove
# the duplication.
accession_grouped = defaultdict(list)
for record in cc_dict:
    accession_grouped[record['accession']].append(record)

In [ ]:
accession_type_grouped = defaultdict(list)
for k, v in accession_grouped.items():
    type_grouped = defaultdict(list)
    for record in v:
        type_grouped[record['@type']].append(record)
    accession_type_grouped[k].append(type_grouped)

In [ ]:
accession_type_processing_grouped = defaultdict(list)
for k, v in accession_type_grouped.items():
    for x in v:
        type_processing_grouped = defaultdict(list)
        for i, y in x.items():
            processing_stage = defaultdict(list)
            for t in y:
                processing_stage[t['processing_stage']].append(t)
            type_processing_grouped[i].append(processing_stage)
        accession_type_processing_grouped[k].append(type_processing_grouped)
#accession_type_processing_grouped

In [ ]:
# Stage 4: sub-group each (accession, @type, processing_stage) bucket
# by attachment_field.
accession_type_processing_attachment_grouped = defaultdict(list)
for k, v in accession_type_processing_grouped.items():
    for x in v:
        type_processing_attachment_grouped = defaultdict(list)
        for o, q in x.items():
            for b in q:
                processing_attachment_grouped = defaultdict(list)
                for i, y in b.items():
                    attachment_field = defaultdict(list)
                    for t in y:
                        attachment_field[t['attachment_field']].append(t)
                    processing_attachment_grouped[i].append(attachment_field)
                type_processing_attachment_grouped[o].append(processing_attachment_grouped)
        # Fix: this append previously sat one level too shallow (outside
        # the `for x in v` loop), so only the LAST x per accession would
        # have been recorded had any v held more than one element. The
        # parallel cell building accession_type_processing_grouped
        # appends at this level. Behavior is unchanged when len(v) == 1,
        # which is how the structures above are built.
        accession_type_processing_attachment_grouped[k].append(type_processing_attachment_grouped)
#accession_type_processing_attachment_grouped

In [ ]:


In [ ]:
both

Check for different MD5sums/filenames of attachments.


In [ ]:
# Walk the accession/@type/processing_stage/attachment_field tree and
# flag groups whose attachment md5sums disagree — true content
# differences, not mere duplicate uploads. Offending groups are
# printed for manual review.
duplicate_but_different_info = []
accession_list = []
for k, v in accession_type_processing_attachment_grouped.items():
    for x in v:
        for i, y in x.items():
            for t in y:
                for w, r in t.items():
                    for u in r:
                        for p, d in u.items():
                            expected_number = d[0]['expected_number']
                            attachment_name_set = set()
                            attachment_md5_set = set()
                            metric_info = []
                            for z in d:
                                attachment_name_set.add(z['attachment_name'])
                                attachment_md5_set.add(z['attachment_md5'])
                                metric_info.append((z['uuid'],
                                                    z['attachment_name'],
                                                    z['attachment_md5'],
                                                    z['qc_status'],
                                                    z['file_status']))
                            if (len(attachment_md5_set) > 1):
                                print("MD5sums do not match.")
                                print("Kind:", end=' ')
                                accession_list.append(k)
                                print(k, i, w, p)
                                print("Number duplicates: {}".format(len(d)))
                                print("Items in group:")
                                for h, e in enumerate(metric_info):
                                    print(h, end=": ")
                                    print(e)
                                print("Filename set:")
                                print(attachment_name_set)
                                print("Md5sum set:")
                                print(attachment_md5_set)
                                print()

In [ ]:
len(action_proposed_flat.accession.unique())

In [ ]:
# Different MD5sums
len(accession_list)

In [ ]:
# Different surface values
len(duplicate_but_different.accession.unique())

Pick duplicate_but_different uuids to keep/delete


In [ ]:
# Choose uuid to keep based on date and attachment existing.
# For the duplicate-but-different frame `dd`: within each
# (accession, @type, processing_stage) bucket keep the newest metric.
uuids_attachments = dd.reset_index().sort_values(['accession',
                                                  '@type',
                                                  'date_qc_created']).reset_index(drop=True)
uuids_to_keep = []
for accession in uuids_attachments.accession.unique():
    # Pull DataFrame records for each accession.
    selected_rows = uuids_attachments[uuids_attachments.accession == accession]
    for metric_type in selected_rows['@type'].unique():
        type_rows = selected_rows[selected_rows['@type'] == metric_type]
        for processing_stage in type_rows['processing_stage'].unique():
            # Fix: uuid_set was initialized once per metric_type but
            # checked once per processing_stage, so a second stage with
            # a different newest uuid accumulated into the same set and
            # would raise spuriously. Reset per stage, matching the
            # analogous selection cell earlier in the notebook.
            uuid_set = set()
            # Date-sorted records shaped (index, uuid, date, status).
            uuid_group = type_rows[type_rows['processing_stage'] == processing_stage]\
                                  .sort_values('date_qc_created')[['uuid',
                                                                   'date_qc_created',
                                                                   'qc_status']]\
                                                          .reset_index(drop=True)\
                                                          .to_records()
            # Newest record in the date-sorted group wins.
            uuid_set.add(uuid_group[-1][1])
            if len(uuid_set) == 1:
                uuids_to_keep.append(list(uuid_set)[0])
            else:
                raise ValueError('Different attachment fields between metrics.'
                                 ' Conflicting uuids: {}'.format(uuid_set))

In [ ]:
dduuids_to_keep = pd.DataFrame(uuids_to_keep)

In [ ]:
dduuids_to_keep['status'] = 'released'

In [ ]:
dduuids_to_keep = dduuids_to_keep.rename(columns={0: 'uuid'})

In [ ]:
dd = dd.reset_index().merge(dduuids_to_keep, how='left', on='uuid').fillna('deleted')

In [ ]:
# Remove ones Seth fixed and remove RNA
dd = dd[~(dd.dataset.isin(['/experiments/ENCSR000EGT/', '/experiments/ENCSR000EGU/']))
   & ~(dd.assay_term_name=="shRNA knockdown followed by RNA-seq")].reset_index(drop=True)

In [ ]:
# Review table: duplicate-but-different metrics grouped with their
# proposed released/deleted status.
dd.groupby(['assay_term_name',
          'dataset',
          'accession',
          'output_type',
          '@type',
          'processing_stage',
          'expected_number',
          'actual_number',
          'qc_status',
          'status',
          'uuid',
          'date_qc_created',
          'assembly',
          'analysis_name',
          'file_format',
          'date_file_created',
          'file_status']).count()

In [ ]:
dd_uuids_to_keep = pd.DataFrame(dd[dd.status=='released'].uuid.unique())
dd_uuids_to_keep['status'] = 'released'
dd_uuids_to_keep = dd_uuids_to_keep.rename(columns={0: 'uuid'})

In [ ]:
dd_uuids_to_delete = pd.DataFrame(dd[dd.status=='deleted'].uuid.unique())
dd_uuids_to_delete['status'] = 'deleted'
dd_uuids_to_delete = dd_uuids_to_delete.rename(columns={0: 'uuid'})

In [ ]:
dd_uuids_to_keep.to_csv("dd_uuids_to_keep_06_13_2017.tsv", sep="\t", index=False)

In [ ]:
dd_uuids_to_delete.to_csv("dd_uuids_to_delete_06_13_2017.tsv", sep="\t", index=False)

In [ ]:


In [ ]:


In [ ]:

Batch uuids by experiment


In [ ]:
# Batch uuids into patch files, 10 experiments (datasets) at a time.
# NOTE(review): this loop rebinds uuids_to_keep / uuids_to_delete,
# clobbering the same-named DataFrames built earlier in the notebook;
# the to_csv export lines are left disabled.
exp_values = action_proposed_flat.dataset.unique()
for i in range(0, len(exp_values), 10):
    batch = exp_values[i:i+10]
    data = action_proposed_flat[action_proposed_flat.dataset.isin(batch)]\
                                                        .groupby(['dataset',
                                                                  'action',
                                                                  'uuid']).count()[[]].reset_index()
    data_to_keep = data[data.action=="keep"]
    data_to_delete = data[data.action=="delete"]
    uuids_to_keep = pd.DataFrame(data_to_keep.uuid.unique())
    uuids_to_delete = pd.DataFrame(data_to_delete.uuid.unique())
    uuids_to_keep['status'] = "released"
    uuids_to_delete['status'] = "deleted"
    uuids_to_keep = uuids_to_keep.rename(columns={0: "uuid"})
    uuids_to_delete = uuids_to_delete.rename(columns={0: "uuid"})
    print(i)
    print(uuids_to_keep.shape)
    print(uuids_to_delete.shape)
    print(batch)
#     uuids_to_keep.to_csv("patch_batch/uuids_to_keep_06_13_2017_batch_{}.tsv".format(i),
#                          sep="\t",
#                          index=False)
#     uuids_to_delete.to_csv("patch_batch/uuids_to_delete_06_13_2017_batch_{}.tsv".format(i),
#                            sep="\t",
#                            index=False)

In [ ]:


In [ ]:


In [ ]:
td.dtypes

In [ ]:
consensus_uuids

In [ ]:
td["date_qc_created"] = td["date_qc_created"].apply(lambda x: pd.to_datetime(x))
action_all = td
action_all['action'] = 'keep'
action_all['audit'] = 'unique'
action_all = action_all.sort_values("assay_term_name", ascending=False).reset_index(drop=True)

In [ ]:
# Merge the proposed actions onto the full metric table; metrics absent
# from the duplicate audit default to action='keep', audit='unique',
# actual_number=1.
# NOTE(review): near-duplicate of the cells at the end of the notebook.
action_proposed_flat = action_proposed.reset_index()
apf = action_proposed_flat[['uuid', 'action', 'actual_number']].reset_index(drop=True)
apf['audit'] = 'duplicate'
action_all = td.merge(apf, how='left', on=['uuid'])
action_all['action'] = action_all['action'].fillna('keep')
action_all['audit'] = action_all['audit'].fillna('unique')
action_all['actual_number'] = action_all['actual_number'].fillna(1)
aad = action_all[action_all.audit=="duplicate"].reset_index(drop=True)
aau = action_all[action_all.audit=="unique"].reset_index(drop=True)
action_all = action_all.sort_values("assay_term_name", ascending=False).reset_index(drop=True)

In [ ]:


In [ ]:
#plt.rcParams['figure.dpi'] = 72
# Strip plot: QC metric creation date per assay, colored by whether the
# metric was flagged duplicate (red) or unique (black) — after patching.
sns.set(style="ticks", font="Lato")
import datetime
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[12, 15])
    # NOTE: split= was renamed dodge= in later seaborn releases; kept
    # as-is for the seaborn version this notebook was written against.
    sns.stripplot(x="date_qc_created",
                  y="assay_term_name",
                  hue="audit",
                  data=action_all,
                  size=15,
                  jitter=True,
                  split=True,
                  palette=sns.color_palette(["black",
                                             "#de2d26"]),
                  alpha=0.3)
# Hollow out the markers: move the face color to the edge, face white.
for x in fig.gca().findobj(mpl.collections.PathCollection):
    edge_color = x.get_facecolor()
    x.set_facecolor("white")
    x.set_edgecolor(edge_color)
    x.set_linewidth(1.5)
    x.set_snap(False)
    x.set_antialiased(True)

sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True, ls='--')
fig.gca().yaxis.grid(True, ls="--")
bottom, top = fig.gca().get_ylim()

# Horizontal separators between assay rows.
for x in np.arange(top, bottom + 1, 1):
    fig.gca().axhline(x,
                      color="black",
                      linewidth=1)

fig.gca().set_ylim([top - 0.1, bottom + 0.1])
# Shrink the axes to make room for the legend on the right.
box = fig.gca().get_position()
fig.gca().set_position([box.x0,
                        box.y0,
                        box.width * 0.85,
                        box.height])

# Fix: sns.plt was an accidental pyplot re-export removed in seaborn
# 0.9; call matplotlib.pyplot (imported as plt) directly instead.
plt.legend(loc="upper left",
           bbox_to_anchor=(1, 0.5),
           frameon=True,
           prop={'size': 'x-large'})

plt.title('Quality Metrics by Assay and Date (After Patching)',
          size=16,
          fontweight='bold');

import matplotlib.dates as mdates
fig.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

In [ ]:
action_all[action_all.audit=='duplicate']

In [ ]:


In [ ]:


In [ ]:


In [ ]:
# Order data by date_qc_created and
# convert the field to string for visualization.
td = td.sort_values("date_qc_created").reset_index(drop=True)
td["date_qc_created"] = td["date_qc_created"].astype("str")

In [ ]:
# Merge proposed actions onto the full metric table; metrics not in the
# duplicate audit default to keep/unique with count 1.
# NOTE(review): duplicates the merge cell earlier in the notebook.
action_proposed_flat = action_proposed.reset_index()
apf = action_proposed_flat[['uuid', 'action', 'actual_number']].reset_index(drop=True)
apf['audit'] = 'duplicate'
action_all = td.merge(apf, how='left', on=['uuid'])
action_all['action'] = action_all['action'].fillna('keep')
action_all['audit'] = action_all['audit'].fillna('unique')
action_all['actual_number'] = action_all['actual_number'].fillna(1)

In [ ]:
aad = action_all[action_all.audit=="duplicate"].reset_index(drop=True)
aau = action_all[action_all.audit=="unique"].reset_index(drop=True)

In [ ]:
action_all = action_all.sort_values("assay_term_name", ascending=False).reset_index(drop=True)

In [ ]:
#plt.rcParams['figure.dpi'] = 72
# Strip plot: QC metric creation date per assay, colored by whether the
# metric was flagged duplicate (red) or unique (black) — before patching.
sns.set(style="ticks", font="Lato")
import datetime
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[12, 15])
    # NOTE: split= was renamed dodge= in later seaborn releases; kept
    # as-is for the seaborn version this notebook was written against.
    sns.stripplot(x="date_qc_created",
                  y="assay_term_name",
                  hue="audit",
                  data=action_all,
                  size=15,
                  jitter=True,
                  split=True,
                  palette=sns.color_palette(["black",
                                             "#de2d26"]),
                  alpha=0.3)
# Hollow out the markers: move the face color to the edge, face white.
for x in fig.gca().findobj(mpl.collections.PathCollection):
    edge_color = x.get_facecolor()
    x.set_facecolor("white")
    x.set_edgecolor(edge_color)
    x.set_linewidth(1.5)
    x.set_snap(False)
    x.set_antialiased(True)

sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True, ls='--')
fig.gca().yaxis.grid(True, ls="--")
bottom, top = fig.gca().get_ylim()

# Horizontal separators between assay rows.
for x in np.arange(top, bottom + 1, 1):
    fig.gca().axhline(x,
                      color="black",
                      linewidth=1)

fig.gca().set_ylim([top - 0.1, bottom + 0.1])
# Shrink the axes to make room for the legend on the right.
box = fig.gca().get_position()
fig.gca().set_position([box.x0,
                        box.y0,
                        box.width * 0.85,
                        box.height])

# Fix: sns.plt was an accidental pyplot re-export removed in seaborn
# 0.9; call matplotlib.pyplot (imported as plt) directly instead.
plt.legend(loc="upper left",
           bbox_to_anchor=(1, 0.5),
           frameon=True,
           prop={'size': 'x-large'})

plt.title('Quality Metrics by Assay and Date (Before Patching)',
          size=16,
          fontweight='bold');

import matplotlib.dates as mdates
fig.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))

In [ ]:


In [ ]:
action_all[action_all['assay_term_name'] == "ATAC-seq"].audit.value_counts()

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
action_all

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set(style="ticks")

# Strip plot: number of identical copies (actual_number) per assay,
# colored by duplicate/unique audit status.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[10, 8])
    sns.stripplot(x="actual_number",
                  y="assay_term_name",
                  hue="audit",
                  data=action_all,
                  size=15,
                  palette=sns.color_palette(["gray",
                                             "#de2d26",
                                             "black",
                                             "#e74c3c",
                                             "#34495e",
                                             "#3498db",
                                             "#2ecc71"]),
                  jitter=True,
                  alpha=0.9)


# One tick per distinct copy count.
fig.gca().set_xticks(sorted(action_all.actual_number.unique()))

# Hollow out the markers: move the fill color to the edge.
for coll in fig.gca().findobj(mpl.collections.PathCollection):
    edge_color = coll.get_facecolor()
    coll.set_facecolor("None")
    coll.set_edgecolor(edge_color)
    coll.set_linewidth(0.15)
    coll.set_snap(False)
    coll.set_antialiased(True)

# FIX: use the public legendHandles attribute instead of poking __dict__.
for handle in fig.gca().legend_.legendHandles:
    handle.set_linewidth(1)
    handle.set_alpha(1)

sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set(style="ticks")

# Strip plot: metric count per file, one row per file status (fixed row
# order), colored by the status of the QC object itself.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[10, 8])
    sns.stripplot(x="metric_count",
                  y="file_status",
                  hue="qc_status",
                  order=['released',
                         'in progress',
                         'archived',
                         'replaced',
                         'revoked',
                         'deleted',
                         'content error'],
                  data=td,
                  size=15,
                  palette=sns.color_palette(["black",
                                             "#de2d26",
                                             "black",
                                             "#e74c3c",
                                             "#34495e",
                                             "#3498db",
                                             "#2ecc71"]),
                  jitter=True,
                  alpha=0.3)

# Hollow out the markers: move the fill color to the edge.
for coll in fig.gca().findobj(mpl.collections.PathCollection):
    edge_color = coll.get_facecolor()
    coll.set_facecolor("None")
    coll.set_edgecolor(edge_color)
    coll.set_linewidth(0.05)
    coll.set_snap(False)
    coll.set_antialiased(True)

# FIX: use the public legendHandles attribute instead of poking __dict__.
for handle in fig.gca().legend_.legendHandles:
    handle.set_linewidth(1)
    handle.set_alpha(1)

sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")
# FIX: removed a dead `if td["date_qc_created"].dtype == "<M8[ns]": pass`
# guard that had no effect.

# Strip plot of metric count over QC creation date.
with sns.plotting_context("notebook", font_scale=1.4):
    fig = plt.figure(figsize=[7, 7])
    sns.stripplot(x="metric_count",
                  y="date_qc_created",
                  data=td,
                  size=4,
                  edgecolor="black",
                  jitter=True)

# Hollow white markers with black outlines.
for coll in fig.gca().findobj(mpl.collections.PathCollection):
    coll.set_facecolor("white")
    coll.set_edgecolor("black")
    coll.set_linewidth(1.5)
    coll.set_snap(False)
    coll.set_antialiased(True)

sns.despine(left=True, bottom=True)
fig.gca().xaxis.grid(True)
fig.gca().yaxis.grid(True)
# FIX: booleans instead of the deprecated 'on'/'off' strings.
fig.gca().tick_params(labelbottom=True,
                      labeltop=True)

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")

# Metric count per file, one strip per assay.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[10, 8])
    sns.stripplot(x="metric_count",
                  y="assay_term_name",
                  data=td,
                  size=15,
                  edgecolor="black",
                  jitter=True)

ax = fig.gca()
# Render the points as hollow white circles with black outlines.
for collection in ax.findobj(mpl.collections.PathCollection):
    collection.set_facecolor("white")
    collection.set_edgecolor("black")
    collection.set_linewidth(1.5)
    collection.set_snap(False)
    collection.set_antialiased(True)

sns.despine(left=True, bottom=True)
ax.xaxis.grid(True)
ax.yaxis.grid(True)

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")

# Metric count per file, one strip per QC object @type.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[10, 8])
    sns.stripplot(x="metric_count",
                  y="@type",
                  data=td,
                  size=15,
                  edgecolor="black",
                  jitter=True)

ax = fig.gca()
# Render the points as hollow white circles with black outlines.
for collection in ax.findobj(mpl.collections.PathCollection):
    collection.set_facecolor("white")
    collection.set_edgecolor("black")
    collection.set_linewidth(1.5)
    collection.set_snap(False)
    collection.set_antialiased(True)

sns.despine(left=True, bottom=True)
ax.xaxis.grid(True)
ax.yaxis.grid(True)

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")

# Metric count per file, one strip per output type.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[10, 8])
    sns.stripplot(x="metric_count",
                  y="output_type",
                  data=td,
                  size=15,
                  edgecolor="black",
                  jitter=True)

ax = fig.gca()
# Render the points as hollow white circles with black outlines.
for collection in ax.findobj(mpl.collections.PathCollection):
    collection.set_facecolor("white")
    collection.set_edgecolor("black")
    collection.set_linewidth(1.5)
    collection.set_snap(False)
    collection.set_antialiased(True)

sns.despine(left=True, bottom=True)
ax.xaxis.grid(True)
ax.yaxis.grid(True)

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")

# Box plot of metric count per file format, drawn unfilled.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[14, 8])
    sns.boxplot(x="file_format",
                y="metric_count",
                data=td,
                color="black")

    # FIX: dropped the unused enumerate() index and dead commented-out code.
    for artist in fig.gca().artists:
        artist.set_facecolor("None")

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")

# Box plot of metric count per genome assembly, drawn unfilled.
with sns.plotting_context("notebook", font_scale=1):
    fig = plt.figure(figsize=[14, 8])
    sns.boxplot(x="assembly",
                y="metric_count",
                data=td,
                color="black")

    # FIX: dropped the unused enumerate() index and dead commented-out code.
    for artist in fig.gca().artists:
        artist.set_facecolor("None")

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")

# Box plot of metric count per file status, drawn unfilled.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[14, 8])
    sns.boxplot(x="file_status",
                y="metric_count",
                data=td,
                color="black")

    # FIX: dropped the unused enumerate() index and dead commented-out code.
    for artist in fig.gca().artists:
        artist.set_facecolor("None")

In [ ]:
#plt.rcParams['figure.dpi'] = 72
sns.set_style("ticks")

# Box plot of metric count per QC object status, drawn unfilled.
with sns.plotting_context("notebook", font_scale=1.5):
    fig = plt.figure(figsize=[14, 8])
    sns.boxplot(x="qc_status",
                y="metric_count",
                data=td,
                color="black")

    # FIX: dropped the unused enumerate() index and dead commented-out code.
    for artist in fig.gca().artists:
        artist.set_facecolor("None")

In [ ]:
df = pd.read_excel('replaced_items_no_redirect_06_12_2017.xlsx')

In [ ]:
df.columns

In [ ]:
df[df.accession=='ENCFF008MOV']

In [ ]:
# Rows where old and new content md5sums "match" only because both are
# recorded as 'not_available'; .shape reports how many there are.
both_not_available = ((df.content_md5sum_old == df.content_md5sum_new)
                      & (df.content_md5sum_old == 'not_available'))
df.loc[both_not_available].shape

In [ ]: