Converts the current IDR metadata format into a datapackage-based JSON file. On loading the file using the jsontableschema-pandas backend, both the library and processed data files are automatically converted to DataFrames.
In [1]:
# Path of the IDR study metadata file that will be converted.
study = "idr0001-study.txt"
In [2]:
from fileinput import input
from fileinput import close
In [3]:
# Parse the tab-separated study file into a list of dicts:
# sections[0] holds the study-wide (top-level) metadata; every line
# containing "Screen Number" starts a new per-screen section. Non-blank
# lines without a tab cannot be parsed and are echoed for manual review.
sections = [{}]
close()  # just in case a previous fileinput iteration is still open
print(">>>> Unused lines:")
for line in input([study]):
    line = line.strip()
    if "Screen Number" in line:
        sections.append({})
        # Everything after the 13-char "Screen Number" label is the index.
        section = int(line[13:].strip())
        # Screens are expected to appear in order, starting at 1.
        assert len(sections) - 1 == section
        sections[-1]["Screen Number"] = section
    else:
        if "\t" in line:
            # "key<TAB>value" pair; split only once so the value may
            # itself contain further tabs.
            parts = line.split("\t", 1)
            sections[-1][parts[0]] = parts[1]
        elif line.strip():
            # Unparseable non-empty line: show its first 80 characters.
            print(" >> %s" % line[0:80])
In [4]:
import datapackage as dp
In [5]:
# Build a Data Package whose top-level descriptor carries the study-wide
# metadata (sections[0]); per-screen resources are appended further down.
myDP = dp.DataPackage()
for key, value in sections[0].items():  # Top-level
    myDP.descriptor[key] = value
myDP.descriptor['resources'] = []
# Sanity check: show the first 80 characters of the serialised package.
print(myDP.to_json()[0:80])
In [6]:
from os.path import exists
from os.path import join
from jsontableschema import infer
import csv
import io
# For each screen, infer a JSON Table Schema for its library and
# processed-data TSV files and register both as datapackage resources.
for section in sections[1:]:
    isn = section["Comment[IDR Screen Name]"]
    # Screen names look like "idr0001-.../screenX"; the second element is
    # the directory holding the data files — TODO confirm for all studies.
    isp = isn.split("/")[1]
    # NOTE(review): .replace swaps the first "txt" occurring anywhere in
    # the name, not only the extension — fine for current filenames.
    lib = section["Library File Name"].replace("txt", "tsv")
    pdf = section["Processed Data File Name"].replace("txt", "tsv")
    if not (exists(join(isp, lib)) and exists(join(isp, pdf))):
        raise Exception("Could not find in %s: %s and %s" % (isp, lib, pdf))
    for name, path in (("library", lib), ("processed data", pdf)):
        with io.open(join(isp, path)) as stream:
            # First line carries the column headers; the rest is data.
            headers = stream.readline().rstrip('\n').split('\t')
            values = csv.reader(stream, dialect="excel", delimiter="\t")
            # infer() consumes the reader, so it must run inside the with.
            schema = infer(headers, values)
        # 'geojson' is a spurious inference for free-text columns; drop
        # the type so the schema default applies instead.
        for field in schema['fields']:
            if field['type'] == 'geojson':
                del field['type']
        myDP.descriptor['resources'].append(
            {
                "name": "%s %s file" % (isp, name),
                "path": join(isp, path),
                "schema": schema,
            }
        )
In [7]:
# Serialise the assembled datapackage next to the study file,
# e.g. idr0001-study.txt -> idr0001-study.json.
out_path = study.replace("txt", "json")
with open(out_path, "w") as handle:
    handle.write(myDP.to_json())
In [8]:
# Round-trip check: re-load the descriptor that was just written.
descriptor_path = study.replace("txt", "json")
copyDP = dp.DataPackage(descriptor_path)
In [9]:
# Requires: pip install jsontableschema-pandas
# Load every resource of the written datapackage into in-memory pandas
# tables via the pandas storage backend.
json_descriptor = study.replace("txt", "json")
storage = dp.push_datapackage(descriptor=json_descriptor, backend='pandas')
In [10]:
# NOTE(review): assumes buckets[0] is the processed-data table and
# buckets[1] the library table — verify the backend's bucket ordering.
first_bucket = storage.buckets[0]
second_bucket = storage.buckets[1]
processed = storage[first_bucket]
library = storage[second_bucket]
In [11]:
# Notebook display: summary statistics of the processed-data table.
processed.describe()
Out[11]:
In [12]:
# Notebook display: summary statistics of the library table.
library.describe()
Out[12]:
In [19]:
import pandas as pd
# Outer-join the processed-data and library tables on the two gene
# columns, keeping rows that appear in either table.
annotations = processed.merge(
    library, how='outer', on=['Gene Identifier', 'Gene Symbol'])
annotations.describe()
Out[19]:
In [21]:
# Export the merged annotations, omitting the pandas index column.
annotations.to_csv("test.csv", index=False)
In [ ]: