In [1]:
import sys, os
from bl.vl.kb import KnowledgeBase
OME_HOST = os.getenv('OME_HOST', 'localhost')
OME_USER = os.getenv('OME_USER', 'test')
OME_PASSWD = os.getenv('OME_PASSWD', 'test')
CHECK_OME_VERSION = os.getenv('CHECK_OME_VERSION', "True") == "True"
BaseProxy = KnowledgeBase(driver='omero')
class Proxy(BaseProxy):
def get_objects_dict(self, klass):
return dict((o.label, o) for o in super(Proxy, self).get_objects(klass))
kb = Proxy(OME_HOST, OME_USER, OME_PASSWD, check_ome_version=CHECK_OME_VERSION)
def cleanup():
print "# disconnecting the kb"
kb.disconnect()
sys.exitfunc = cleanup
print
print "### KB ENV PRELOADED ###"
print "# connected to %s" % OME_HOST
print "# knowledge base: kb"
print "# extra method: kb.get_objects_dict"
print "########################"
Prepare all we need to read the Illumina annotation file.
In [3]:
from bl.core.io.illumina import IllSNPReader
The function below is to map what will be generated by the illumina file reader to what is expected by create_markers_array.
In [4]:
def marker_stream(fname):
for i, r in enumerate(IllSNPReader(open(fname))):
yield {'label': r['IlmnID'], 'mask': r['TopGenomicSeq'],
'index': i, 'permutation' : False}
We will use a study to provide a context for our actions. We will create a new one if we cannot find a study of label study_label in kb
In [5]:
study_label = 'test01'
study = kb.get_study(study_label)
if study is None:
study = kb.factory.create(kb.Study, {'label': study_label }).save()
For many of the operations below, we will need an action. In general, actions are used to record how a given object has produced. Here we just need a place holder to fill a slot in API calls.
In [6]:
action = kb.create_an_action(study)
In [8]:
label, maker, model, version = 'human_exom-V1-test', 'illumina', 'human_exom12', 'v1_a'
fname = '/var/tmp/humanexome-12v1_a.csv'
mset = kb.genomics.get_markers_array(label)
if mset is None:
stream = marker_stream(fname)
mset = kb.genomics.create_markers_array(label, maker, model, version,
stream, action)
In [14]:
ref_gen_label = 'GRCh37.1'
reference_genome = kb.get_by_label(kb.ReferenceGenome, ref_gen_label)
if reference_genome is None:
conf = {'nChroms' : 26,
'maker': 'GRC',
'model': 'h37',
'release' : '1',
'label': ref_gen_label,
'status' : kb.DataSampleStatus.USABLE,
'action': action}
reference_genome = kb.factory.create(kb.ReferenceGenome, conf).save()
In [15]:
def chrom_label_fix(chr):
if chr == 'X': return 23
if chr == 'Y': return 24
if chr == 'MT': return 25
if chr == 'XY': return 26
return int(chr)
Now we should be ready to generate a VariantCallSupport object. Note that we need to explicitly set the label of the vcs, otherwise it will be a unique random string. This will be fixed in the future.
In [16]:
vcs_label = mset.label + '+' + reference_genome.label
vcs = kb.genomics.get_vcs_by_label(vcs_label)
if vcs is None:
print 'need to create'
nodes = np.array([(chrom_label_fix(r['Chr']), int(r['MapInfo']))
for r in IllSNPReader(open(fname))], kb.VariantCallSupport.NODES_DTYPE)
vcs = kb.genomics.create_vcs(mset, reference_genome, nodes, action)
vcs.label = vcs_label
vcs.save()
In [18]:
len(vcs)
Out[18]:
Note that we until we registered vcs in the biobank, it only existed in our RAM.
In [19]:
duplicates = vcs.get_multiple_origins_nodes()
In [20]:
duplicates.shape
Out[20]:
In [21]:
duplicates[:, 0:10]
Out[21]:
In [22]:
origin = vcs.get_field('origin')
dup_origin = origin[origin['index'] == 80]
In [23]:
dup_origin
Out[23]:
In [24]:
rows= kb.genomics.get_markers_array_rows(mset, [122974, 190025])
In [26]:
rows
Out[26]:
In [27]:
rows['mask'][0] == rows['mask'][1]
Out[27]: