We will load a VCF and check how many snp are already present in an exome microarray. The specific dataset is from the Corpas Family publicly available dataset. See below.
In [5]:
from IPython.display import HTML
HTML('<iframe src=http://manuelcorpas.com/tag/exome/ width=1000 height=200></iframe>')
Out[5]:
In [1]:
import sys, os
from bl.vl.kb import KnowledgeBase
OME_HOST = os.getenv('OME_HOST', 'localhost')
OME_USER = os.getenv('OME_USER', 'test')
OME_PASSWD = os.getenv('OME_PASSWD', 'test')
CHECK_OME_VERSION = os.getenv('CHECK_OME_VERSION', "True") == "True"
BaseProxy = KnowledgeBase(driver='omero')
class Proxy(BaseProxy):
def get_objects_dict(self, klass):
return dict((o.label, o) for o in super(Proxy, self).get_objects(klass))
kb = Proxy(OME_HOST, OME_USER, OME_PASSWD, check_ome_version=CHECK_OME_VERSION)
def cleanup():
print "# disconnecting the kb"
kb.disconnect()
sys.exitfunc = cleanup
print
print "### KB ENV PRELOADED ###"
print "# connected to %s" % OME_HOST
print "# knowledge base: kb"
print "# extra method: kb.get_objects_dict"
print "########################"
In [7]:
import vcf # PyVCF V6
FIXME: we could put here an http slurp or something like that. Also, I had to upcase 'contig' to 'CONTIG' in the header.
In [8]:
vcf_reader = vcf.Reader(open('/var/tmp/Son_s_VCF_file_v1.vcf', 'r'))
In [9]:
contigs = {}
def chrom_label_fix(chr):
if chr == 'X': return 23
if chr == 'Y': return 24
if chr == 'MT': return 25
if chr == 'XY': return 26
if chr.isdigit(): return int(chr)
if chr not in contigs:
contigs[chr] = 26 + len(contigs)
return contigs[chr]
In [10]:
selected = np.array([(chrom_label_fix(r.CHROM), r.POS, i, r.REF, str(r.ALT)) for i, r in enumerate(vcf_reader) if r.is_snp],
dtype=[('chrom', 'i8'), ('pos', 'i8'), ('vpos', 'i8'), ('ref', '|S10'), ('alt', '|S2000')])
In [11]:
nodes = np.zeros(len(selected), dtype=kb.VariantCallSupport.NODES_DTYPE)
In [12]:
nodes['chrom'] = selected['chrom']
nodes['pos'] = selected['pos']
In [13]:
origin = np.zeros(len(selected), dtype=kb.VariantCallSupport.ATTR_ORIGIN_DTYPE)
origin['vpos'] = selected['vpos']
origin['vid'] = 'V03033333' # this would be the VID of the DataSample that points to the VCF file.
origin['index'] = np.arange(len(origin))
In [14]:
ref_genome = kb.get_by_label(kb.ReferenceGenome, 'GRCh37.1')
We are assuming that the VCF data has been aligned on GRCh37.
In [15]:
study_label = 'test01'
study = kb.get_study(study_label)
In [16]:
action = kb.create_an_action(study)
In [17]:
conf = {'action' : action, 'label': 'son_snps', 'status': kb.DataSampleStatus.USABLE, 'referenceGenome' : ref_genome}
vcs_son = kb.factory.create(kb.VariantCallSupport, conf)
In [18]:
vcs_son.define_support(nodes)
vcs_son.define_field('origin', origin)
In [19]:
vcss = kb.get_objects_dict(kb.VariantCallSupport)
In [20]:
vcss
Out[20]:
In [21]:
vcs_exom = kb.genomics.get_vcs_by_label('human_exom-V1-test+GRCh37.1')
In [22]:
vcs_intersect = vcs_exom.intersection(vcs_son)
In [23]:
len(vcs_intersect)
Out[23]:
In [24]:
intersect_orig = vcs_intersect.get_field('origin')
In [25]:
intersect_orig[0:4]
Out[25]:
In [26]:
mset = kb.genomics.get_markers_array(vid='V09FC739CAA22D4A2CB1C9E34E42AA3CD9')
In [27]:
rows_on_intersect = kb.genomics.get_markers_array_rows(mset, indices=intersect_orig['vpos'][intersect_orig['vid'] == mset.id])
In [28]:
rows_on_intersect[0]
Out[28]:
In [52]:
selected[selected['vpos'] == 4]
Out[52]:
In [30]:
rows_on_intersect[1]
Out[30]:
In [31]:
selected[selected['vpos'] == 9]
Out[31]:
Which is interesting.
In [33]:
for i in range(10):
print selected[selected['vpos'] == intersect_orig[2*i][2]]
print rows_on_intersect[i]