Converts the current IDR metadata format into a datapackage-based json file. On loading the file using the jsonschema-pandas backend, both the library and processed data files are automatically converted to DataFrames.


In [1]:
study="idr0001-study.txt"

In [2]:
from fileinput import input
from fileinput import close

In [3]:
sections = [{}]
close()  # just in case
print ">>>> Unused lines:"
for line in input([study]):
    line = line.strip()
    if "Screen Number" in line:
        sections.append({})
        section = int(line[13:].strip())
        assert len(sections)-1 == section
        sections[-1]["Screen Number"] = section
    else:
        if "\t" in line:
            parts = line.split("\t", 1)
            sections[-1][parts[0]] = parts[1]
        elif line.strip():
            print "  >>", line[0:80]


>>>> Unused lines:
  >> "# Section with generic information about the study including title, description
  >> # Study
  >> Study Public Release Date
  >> # Study Publication
  >> Study PMC ID
  >> # Study Contacts
  >> "# Section containing all information relative to each screen in the study inclu
  >> # Screen; this section should be repeated if a study contains multiple screens
  >> "# Library section. The library file should be supplied separately and it should
  >> Library Experimental Conditions Term Source REF
  >> Library Experimental Conditions Term Accession
  >> Quality Control Description
  >> # Protocols
  >> # Phenotypes
  >> # Raw Data Files
  >> # Feature Level Data Files (give individual file details unless there is one fil
  >> #  Processed Data Files

In [4]:
import datapackage as dp

In [5]:
myDP = dp.DataPackage()
for k, v in sections[0].items():  # Top-level
    myDP.descriptor[k] = v
myDP.descriptor['resources'] = []
print myDP.to_json()[0:80]


{"Study Screens Number": "1", "Study PubMed ID": "25373780", "Study Organism Ter

In [6]:
from os.path import exists
from os.path import join
from jsontableschema import infer

import csv
import io

for section in sections[1:]:
    num = section["Screen Number"]
    isn = section["Comment[IDR Screen Name]"]
    isp = isn.split("/")[1]
    lib = section["Library File Name"].replace("txt", "tsv")
    pdf = section["Processed Data File Name"].replace("txt", "tsv")
    
    if not (exists(join(isp, lib)) and exists(join(isp, pdf))):
        raise Exception("Could not find in %s: %s and %s" % (isp, lib, pdf))
    for name, path in (("library", lib), ("processed data", pdf)):
        with io.open(join(isp, path)) as stream:
            headers = stream.readline().rstrip('\n').split('\t')
            values = csv.reader(stream, dialect="excel", delimiter="\t")
            schema = infer(headers, values)
            for field in schema['fields']:
                if field['type'] == 'geojson':
                    del field['type']
        myDP.descriptor['resources'].append(
            {
                "name": "%s %s file" % (isp, name),
                "path": join(isp, path),
                "schema": schema,
            }
        )

In [7]:
with open(study.replace("txt", "json"), "w") as f:
    f.write(myDP.to_json())

In [8]:
copyDP = dp.DataPackage(study.replace("txt", "json"))

In [9]:
# Requires: pip install jsontableschema-pandas
storage = dp.push_datapackage(descriptor=study.replace("txt", "json"), backend='pandas')

In [10]:
processed = storage[storage.buckets[0]]
library = storage[storage.buckets[1]]

In [11]:
processed.describe()


Out[11]:
Plate Well Number Well Characteristics [Organism] Term Source 1 REF Term Source 1 Accession Characteristics [Strain] Gene Identifier Gene Symbol Reagent Design Gene Annotation Build Analysis Gene Annotation Build Control Type Control Comments Channels Replicate Group Plate Issues
count 18720 18720 18720 18720 18720 18720 18720 18720 18720 18720 18720 18720 18720 18720 18720 18720
unique 195 96 96 2 2 2 3010 3006 1881 1 1 5 4 1 3 2
top JL_130116_J6_6 5 A8 Schizosaccharomyces pombe NCBITaxon NCBITaxon_4896 MS1404 GFP:endogenous alpha tubulin 2;Cascade blue:gr... 3
freq 96 195 195 17550 17550 17550 2473 3830 8569 18720 18720 13549 16203 18720 8832 18432

In [12]:
library.describe()


Out[12]:
Gene Identifier Gene Symbol Reproducibility of Shape Hits Reproducibility of Microtubule Hits Reproducibility of Cell Cycle Progression Hits Visual Shape Hit Visual Microtubule Hit Conservation in S. cerevisiae Conservation in Vertebrates Conservation in H. sapiens ... Phenotype 10 Phenotype 11 Phenotype 12 Phenotype 13 Phenotype 14 Phenotype 15 Phenotype 16 Phenotype 17 Phenotype 18 Phenotype 19
count 262 262 262 262 262 262 262 262 262 262 ... 262 262 262 262 262 262 262 262 262 262
unique 262 199 41 41 12 2 2 2 2 2 ... 2 2 2 2 2 2 2 2 2 2
top SPCC790.02 yes yes yes ...
freq 1 64 119 76 227 227 228 232 204 131 ... 253 261 261 233 259 261 258 260 236 260

4 rows × 32 columns


In [19]:
import pandas as pd
annotations = pd.merge(processed, library, how='outer', on=['Gene Identifier', 'Gene Symbol'])
annotations.describe()


Out[19]:
Plate Well Number Well Characteristics [Organism] Term Source 1 REF Term Source 1 Accession Characteristics [Strain] Gene Identifier Gene Symbol Reagent Design Gene Annotation Build ... Phenotype 10 Phenotype 11 Phenotype 12 Phenotype 13 Phenotype 14 Phenotype 15 Phenotype 16 Phenotype 17 Phenotype 18 Phenotype 19
count 18720 18720 18720 18720 18720 18720 18720 18731 18731 18720 ... 3620 3620 3620 3620 3620 3620 3620 3620 3620 3620
unique 195 96 96 2 2 2 3010 3006 1882 1 ... 2 2 2 2 2 2 2 2 2 2
top JL_130116_J6_6 5 A8 Schizosaccharomyces pombe NCBITaxon NCBITaxon_4896 MS1404 ...
freq 96 195 195 17550 17550 17550 2473 3830 8579 18720 ... 3514 3602 3602 3329 3205 3610 3580 3215 3359 3600

4 rows × 46 columns


In [21]:
annotations.to_csv("test.csv", index=False)

In [ ]: