Converts the current IDR metadata format into a datapackage-based json file. On loading the file using the jsonschema-pandas backend, both the library and processed data files are automatically converted to DataFrames.



In [1]:

    
study="idr0001-study.txt"



In [2]:

    
from fileinput import input
from fileinput import close



In [3]:

    
sections = [{}]
close()  # just in case
print ">>>> Unused lines:"
for line in input([study]):
    line = line.strip()
    if "Screen Number" in line:
        sections.append({})
        section = int(line[13:].strip())
        assert len(sections)-1 == section
        sections[-1]["Screen Number"] = section
    else:
        if "\t" in line:
            parts = line.split("\t", 1)
            sections[-1][parts[0]] = parts[1]
        elif line.strip():
            print "  >>", line[0:80]









    



>>>> Unused lines:
  >> "# Section with generic information about the study including title, description
  >> # Study
  >> Study Public Release Date
  >> # Study Publication
  >> Study PMC ID
  >> # Study Contacts
  >> "# Section containing all information relative to each screen in the study inclu
  >> # Screen; this section should be repeated if a study contains multiple screens
  >> "# Library section. The library file should be supplied separately and it should
  >> Library Experimental Conditions Term Source REF
  >> Library Experimental Conditions Term Accession
  >> Quality Control Description
  >> # Protocols
  >> # Phenotypes
  >> # Raw Data Files
  >> # Feature Level Data Files (give individual file details unless there is one fil
  >> #  Processed Data Files



In [4]:

    
import datapackage as dp



In [5]:

    
myDP = dp.DataPackage()
for k, v in sections[0].items():  # Top-level
    myDP.descriptor[k] = v
myDP.descriptor['resources'] = []
print myDP.to_json()[0:80]









    



{"Study Screens Number": "1", "Study PubMed ID": "25373780", "Study Organism Ter



In [6]:

    
from os.path import exists
from os.path import join
from jsontableschema import infer

import csv
import io

for section in sections[1:]:
    num = section["Screen Number"]
    isn = section["Comment[IDR Screen Name]"]
    isp = isn.split("/")[1]
    lib = section["Library File Name"].replace("txt", "tsv")
    pdf = section["Processed Data File Name"].replace("txt", "tsv")
    
    if not (exists(join(isp, lib)) and exists(join(isp, pdf))):
        raise Exception("Could not find in %s: %s and %s" % (isp, lib, pdf))
    for name, path in (("library", lib), ("processed data", pdf)):
        with io.open(join(isp, path)) as stream:
            headers = stream.readline().rstrip('\n').split('\t')
            values = csv.reader(stream, dialect="excel", delimiter="\t")
            schema = infer(headers, values)
            for field in schema['fields']:
                if field['type'] == 'geojson':
                    del field['type']
        myDP.descriptor['resources'].append(
            {
                "name": "%s %s file" % (isp, name),
                "path": join(isp, path),
                "schema": schema,
            }
        )



In [7]:

    
with open(study.replace("txt", "json"), "w") as f:
    f.write(myDP.to_json())



In [8]:

    
copyDP = dp.DataPackage(study.replace("txt", "json"))



In [9]:

    
# Requires: pip install jsontableschema-pandas
storage = dp.push_datapackage(descriptor=study.replace("txt", "json"), backend='pandas')



In [10]:

    
processed = storage[storage.buckets[0]]
library = storage[storage.buckets[1]]



In [11]:

    
processed.describe()









    Out[11]:






  
    
      
      Plate
      Well Number
      Well
      Characteristics [Organism]
      Term Source 1 REF
      Term Source 1 Accession
      Characteristics [Strain]
      Gene Identifier
      Gene Symbol
      Reagent Design Gene Annotation Build
      Analysis Gene Annotation Build
      Control Type
      Control Comments
      Channels
      Replicate Group
      Plate Issues
    
  
  
    
      count
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18720
    
    
      unique
      195
      96
      96
      2
      2
      2
      3010
      3006
      1881
      1
      1
      5
      4
      1
      3
      2
    
    
      top
      JL_130116_J6_6
      5
      A8
      Schizosaccharomyces pombe
      NCBITaxon
      NCBITaxon_4896
      MS1404
      
      
      
      
      
      
      GFP:endogenous alpha tubulin 2;Cascade blue:gr...
      3
      
    
    
      freq
      96
      195
      195
      17550
      17550
      17550
      2473
      3830
      8569
      18720
      18720
      13549
      16203
      18720
      8832
      18432



In [12]:

    
library.describe()









    Out[12]:






  
    
      
      Gene Identifier
      Gene Symbol
      Reproducibility of Shape Hits
      Reproducibility of Microtubule Hits
      Reproducibility of Cell Cycle Progression Hits
      Visual Shape Hit
      Visual Microtubule Hit
      Conservation in S. cerevisiae
      Conservation in Vertebrates
      Conservation in H. sapiens
      ...
      Phenotype 10
      Phenotype 11
      Phenotype 12
      Phenotype 13
      Phenotype 14
      Phenotype 15
      Phenotype 16
      Phenotype 17
      Phenotype 18
      Phenotype 19
    
  
  
    
      count
      262
      262
      262
      262
      262
      262
      262
      262
      262
      262
      ...
      262
      262
      262
      262
      262
      262
      262
      262
      262
      262
    
    
      unique
      262
      199
      41
      41
      12
      2
      2
      2
      2
      2
      ...
      2
      2
      2
      2
      2
      2
      2
      2
      2
      2
    
    
      top
      SPCC790.02
      
      
      
      
      
      
      yes
      yes
      yes
      ...
      
      
      
      
      
      
      
      
      
      
    
    
      freq
      1
      64
      119
      76
      227
      227
      228
      232
      204
      131
      ...
      253
      261
      261
      233
      259
      261
      258
      260
      236
      260
    
  

4 rows × 32 columns



In [19]:

    
import pandas as pd
annotations = pd.merge(processed, library, how='outer', on=['Gene Identifier', 'Gene Symbol'])
annotations.describe()









    Out[19]:






  
    
      
      Plate
      Well Number
      Well
      Characteristics [Organism]
      Term Source 1 REF
      Term Source 1 Accession
      Characteristics [Strain]
      Gene Identifier
      Gene Symbol
      Reagent Design Gene Annotation Build
      ...
      Phenotype 10
      Phenotype 11
      Phenotype 12
      Phenotype 13
      Phenotype 14
      Phenotype 15
      Phenotype 16
      Phenotype 17
      Phenotype 18
      Phenotype 19
    
  
  
    
      count
      18720
      18720
      18720
      18720
      18720
      18720
      18720
      18731
      18731
      18720
      ...
      3620
      3620
      3620
      3620
      3620
      3620
      3620
      3620
      3620
      3620
    
    
      unique
      195
      96
      96
      2
      2
      2
      3010
      3006
      1882
      1
      ...
      2
      2
      2
      2
      2
      2
      2
      2
      2
      2
    
    
      top
      JL_130116_J6_6
      5
      A8
      Schizosaccharomyces pombe
      NCBITaxon
      NCBITaxon_4896
      MS1404
      
      
      
      ...
      
      
      
      
      
      
      
      
      
      
    
    
      freq
      96
      195
      195
      17550
      17550
      17550
      2473
      3830
      8579
      18720
      ...
      3514
      3602
      3602
      3329
      3205
      3610
      3580
      3215
      3359
      3600
    
  

4 rows × 46 columns



In [21]:

    
annotations.to_csv("test.csv", index=False)



In [ ]:

	Plate	Well Number	Well	Characteristics [Organism]	Term Source 1 REF	Term Source 1 Accession	Characteristics [Strain]	Gene Identifier	Gene Symbol	Reagent Design Gene Annotation Build	Analysis Gene Annotation Build	Control Type	Control Comments	Channels	Replicate Group	Plate Issues
count	18720	18720	18720	18720	18720	18720	18720	18720	18720	18720	18720	18720	18720	18720	18720	18720
unique	195	96	96	2	2	2	3010	3006	1881	1	1	5	4	1	3	2
top	JL_130116_J6_6	5	A8	Schizosaccharomyces pombe	NCBITaxon	NCBITaxon_4896	MS1404							GFP:endogenous alpha tubulin 2;Cascade blue:gr...	3
freq	96	195	195	17550	17550	17550	2473	3830	8569	18720	18720	13549	16203	18720	8832	18432

	Gene Identifier	Gene Symbol	Reproducibility of Shape Hits	Reproducibility of Microtubule Hits	Reproducibility of Cell Cycle Progression Hits	Visual Shape Hit	Visual Microtubule Hit	Conservation in S. cerevisiae	Conservation in Vertebrates	Conservation in H. sapiens	...	Phenotype 10	Phenotype 11	Phenotype 12	Phenotype 13	Phenotype 14	Phenotype 15	Phenotype 16	Phenotype 17	Phenotype 18	Phenotype 19
count	262	262	262	262	262	262	262	262	262	262	...	262	262	262	262	262	262	262	262	262	262
unique	262	199	41	41	12	2	2	2	2	2	...	2	2	2	2	2	2	2	2	2	2
top	SPCC790.02							yes	yes	yes	...
freq	1	64	119	76	227	227	228	232	204	131	...	253	261	261	233	259	261	258	260	236	260