A new Markers Array and a VariantCallSupport from an Illumina annotation file


In [1]:
import sys, os
from bl.vl.kb import KnowledgeBase

OME_HOST = os.getenv('OME_HOST', 'localhost')
OME_USER = os.getenv('OME_USER', 'test')
OME_PASSWD = os.getenv('OME_PASSWD', 'test')
CHECK_OME_VERSION = os.getenv('CHECK_OME_VERSION', "True") == "True"

BaseProxy = KnowledgeBase(driver='omero')

class Proxy(BaseProxy):
  def get_objects_dict(self, klass):
    return dict((o.label, o) for o in super(Proxy, self).get_objects(klass))

kb = Proxy(OME_HOST, OME_USER, OME_PASSWD, check_ome_version=CHECK_OME_VERSION)

def cleanup():
  print "# disconnecting the kb"
  kb.disconnect()

sys.exitfunc = cleanup

print
print "### KB ENV PRELOADED ###"
print "# connected to %s" % OME_HOST
print "# knowledge base: kb"
print "# extra method: kb.get_objects_dict"
print "########################"


### KB ENV PRELOADED ###
# connected to biobank04.crs4.it
# knowledge base: kb
# extra method: kb.get_objects_dict
########################

Prepare all we need to read the Illumina annotation file.


In [3]:
from bl.core.io.illumina import IllSNPReader

The function below is to map what will be generated by the illumina file reader to what is expected by create_markers_array.


In [4]:
def marker_stream(fname):
    for i, r in enumerate(IllSNPReader(open(fname))):
        yield {'label': r['IlmnID'], 'mask': r['TopGenomicSeq'], 
               'index': i, 'permutation' : False}

We will use a study to provide a context for our actions. We will create a new one if we cannot find a study of label study_label in kb


In [5]:
study_label = 'test01'
study = kb.get_study(study_label)
if study is None:
    study = kb.factory.create(kb.Study, {'label': study_label }).save()

For many of the operations below, we will need an action. In general, actions are used to record how a given object has produced. Here we just need a place holder to fill a slot in API calls.


In [6]:
action = kb.create_an_action(study)

In [8]:
label, maker, model, version = 'human_exom-V1-test', 'illumina', 'human_exom12', 'v1_a'
fname = '/var/tmp/humanexome-12v1_a.csv'
mset = kb.genomics.get_markers_array(label)
if mset is None:
    stream = marker_stream(fname)
    mset = kb.genomics.create_markers_array(label, maker, model, version,
                                            stream, action)

In [14]:
ref_gen_label = 'GRCh37.1'
reference_genome = kb.get_by_label(kb.ReferenceGenome, ref_gen_label)
if reference_genome is None:
    conf = {'nChroms' : 26, 
            'maker': 'GRC',
            'model': 'h37',
            'release' : '1',
            'label': ref_gen_label,
            'status' : kb.DataSampleStatus.USABLE,
            'action': action}
    reference_genome = kb.factory.create(kb.ReferenceGenome, conf).save()

In [15]:
def chrom_label_fix(chr):
    if chr == 'X': return 23
    if chr == 'Y': return 24
    if chr == 'MT': return 25
    if chr == 'XY': return 26
    return int(chr)

Now we should be ready to generate a VariantCallSupport object. Note that we need to explicitly set the label of the vcs, otherwise it will be a unique random string. This will be fixed in the future.


In [16]:
vcs_label = mset.label + '+' + reference_genome.label
vcs = kb.genomics.get_vcs_by_label(vcs_label)
if vcs is None:
    print 'need to create'
    nodes = np.array([(chrom_label_fix(r['Chr']), int(r['MapInfo'])) 
                       for r in IllSNPReader(open(fname))], kb.VariantCallSupport.NODES_DTYPE)
    vcs = kb.genomics.create_vcs(mset, reference_genome, nodes, action)
    vcs.label = vcs_label
    vcs.save()


need to create

In [18]:
len(vcs)


Out[18]:
247039

Note that we until we registered vcs in the biobank, it only existed in our RAM.


In [19]:
duplicates = vcs.get_multiple_origins_nodes()

In [20]:
duplicates.shape


Out[20]:
(2, 831)

In [21]:
duplicates[:, 0:10]


Out[21]:
array([[  80,  689,  692, 1036, 1162, 1472, 1612, 1815, 2154, 2447],
       [   2,    2,    2,    2,    2,    2,    2,    2,    2,    2]])

In [22]:
origin = vcs.get_field('origin')
dup_origin = origin[origin['index'] == 80]

In [23]:
dup_origin


Out[23]:
array([(80, 'V09FC739CAA22D4A2CB1C9E34E42AA3CD9', 122974),
       (80, 'V09FC739CAA22D4A2CB1C9E34E42AA3CD9', 190025)], 
      dtype=[('index', '<i8'), ('vid', 'S34'), ('vpos', '<i8')])

In [24]:
rows= kb.genomics.get_markers_array_rows(mset, [122974, 190025])

In [26]:
rows


Out[26]:
array([ ('exm2253593-0_B_R_1975255069', 122974, 'GGACGGATGGTTGTACGCCGTGGGGGGTAACGACGGTAGCTCCAGCCTCAACTCCATCGA[A/G]AAGTACAACCCGAGGACCAACAAGTGGGTGGCCGCATCCTGCATGTTCACCCGGCGCAGC', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D'),
       ('exm596-0_T_F_1921553921', 190025, 'GGACGGATGGTTGTACGCCGTGGGGGGTAACGACGGTAGCTCCAGCCTCAACTCCATCGA[A/G]AAGTACAACCCGAGGACCAACAAGTGGGTGGCCGCATCCTGCATGTTCACCCGGCGCAGC', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')], 
      dtype=[('label', 'S128'), ('index', '<i8'), ('mask', 'S1024'), ('permutation', 'i1'), ('op_vid', 'S34')])

In [27]:
rows['mask'][0] == rows['mask'][1]


Out[27]:
True