Compare a VCF exome file with an exome microarray

We will load a VCF and check how many snp are already present in an exome microarray. The specific dataset is from the Corpas Family publicly available dataset. See below.


In [5]:
from IPython.display import HTML
HTML('<iframe src=http://manuelcorpas.com/tag/exome/ width=1000 height=200></iframe>')


Out[5]:

In [1]:
import sys, os
from bl.vl.kb import KnowledgeBase

OME_HOST = os.getenv('OME_HOST', 'localhost')
OME_USER = os.getenv('OME_USER', 'test')
OME_PASSWD = os.getenv('OME_PASSWD', 'test')
CHECK_OME_VERSION = os.getenv('CHECK_OME_VERSION', "True") == "True"

BaseProxy = KnowledgeBase(driver='omero')

class Proxy(BaseProxy):
  def get_objects_dict(self, klass):
    return dict((o.label, o) for o in super(Proxy, self).get_objects(klass))

kb = Proxy(OME_HOST, OME_USER, OME_PASSWD, check_ome_version=CHECK_OME_VERSION)

def cleanup():
  print "# disconnecting the kb"
  kb.disconnect()

sys.exitfunc = cleanup

print
print "### KB ENV PRELOADED ###"
print "# connected to %s" % OME_HOST
print "# knowledge base: kb"
print "# extra method: kb.get_objects_dict"
print "########################"


### KB ENV PRELOADED ###
# connected to biobank04.crs4.it
# knowledge base: kb
# extra method: kb.get_objects_dict
########################

In [7]:
import vcf # PyVCF V6

FIXME: we could put here an http slurp or something like that. Also, I had to upcase 'contig' to 'CONTIG' in the header.


In [8]:
vcf_reader = vcf.Reader(open('/var/tmp/Son_s_VCF_file_v1.vcf', 'r'))

In [9]:
contigs = {}
def chrom_label_fix(chr):
    if chr == 'X': return 23
    if chr == 'Y': return 24
    if chr == 'MT': return 25
    if chr == 'XY': return 26
    if chr.isdigit(): return int(chr)
    if chr not in contigs:
        contigs[chr] = 26 + len(contigs)
    return contigs[chr]

In [10]:
selected = np.array([(chrom_label_fix(r.CHROM), r.POS, i, r.REF, str(r.ALT)) for i, r in enumerate(vcf_reader) if r.is_snp], 
                    dtype=[('chrom', 'i8'), ('pos', 'i8'), ('vpos', 'i8'), ('ref', '|S10'), ('alt', '|S2000')])

In [11]:
nodes = np.zeros(len(selected), dtype=kb.VariantCallSupport.NODES_DTYPE)

In [12]:
nodes['chrom'] = selected['chrom']
nodes['pos'] = selected['pos']

In [13]:
origin = np.zeros(len(selected), dtype=kb.VariantCallSupport.ATTR_ORIGIN_DTYPE)
origin['vpos'] = selected['vpos']
origin['vid'] = 'V03033333' # this would be the VID of the DataSample that points to the VCF file.
origin['index'] = np.arange(len(origin))

In [14]:
ref_genome = kb.get_by_label(kb.ReferenceGenome, 'GRCh37.1')

We are assuming that the VCF data has been aligned on GRCh37.


In [15]:
study_label = 'test01'
study = kb.get_study(study_label)

In [16]:
action = kb.create_an_action(study)

In [17]:
conf = {'action' : action, 'label': 'son_snps', 'status': kb.DataSampleStatus.USABLE, 'referenceGenome' : ref_genome}
vcs_son = kb.factory.create(kb.VariantCallSupport, conf)

In [18]:
vcs_son.define_support(nodes)
vcs_son.define_field('origin', origin)

In [19]:
vcss = kb.get_objects_dict(kb.VariantCallSupport)

In [20]:
vcss


Out[20]:
{'human_exom-V1-test+GRCh37.1': <bl.vl.kb.drivers.omero.variant_call_support.VariantCallSupport at 0x509e910>}

In [21]:
vcs_exom = kb.genomics.get_vcs_by_label('human_exom-V1-test+GRCh37.1')

In [22]:
vcs_intersect = vcs_exom.intersection(vcs_son)

In [23]:
len(vcs_intersect)


Out[23]:
4296

In [24]:
intersect_orig = vcs_intersect.get_field('origin')

In [25]:
intersect_orig[0:4]


Out[25]:
array([(0, 'V03033333', 4),
       (0, 'V09FC739CAA22D4A2CB1C9E34E42AA3CD9', 122961),
       (1, 'V03033333', 9),
       (1, 'V09FC739CAA22D4A2CB1C9E34E42AA3CD9', 215079)], 
      dtype=[('index', '<i8'), ('vid', 'S34'), ('vpos', '<i8')])

In [26]:
mset = kb.genomics.get_markers_array(vid='V09FC739CAA22D4A2CB1C9E34E42AA3CD9')

In [27]:
rows_on_intersect = kb.genomics.get_markers_array_rows(mset, indices=intersect_orig['vpos'][intersect_orig['vid'] == mset.id])

In [28]:
rows_on_intersect[0]


Out[28]:
('exm2253575-0_B_R_1975256659', 122961, 'TGATCTCCAGCTGGATCTCCCGGTCACGCAGCTTGCGCCAGTGGCTGTAGTACAAGGTCA[A/G]GGGTGTCCCCTCTTCCCGGGTCAGCTTCTCCCAGGCTTCCTGGGGGGTTGGGGGAGTTCA', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')

In [52]:
selected[selected['vpos'] == 4]


Out[52]:
array([(1, 881627, 4, 'G', '[A]')], 
      dtype=[('chrom', '<i8'), ('pos', '<i8'), ('vpos', '<i8'), ('ref', 'S10'), ('alt', 'S2000')])

In [30]:
rows_on_intersect[1]


Out[30]:
('exm773-0_B_F_1921422091', 215079, 'GGGTCAGAGGCAGGCACAGACACAGGCACCGAGGGGAGTGGGCTCGAGGGCGTGGGTCCA[C/G]GTGCACTCTGCATGGCACTGAGGAACTGCAGGGAGAGCCAGGGCTGGGGCCTGGGTGGCA', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')

In [31]:
selected[selected['vpos'] == 9]


Out[31]:
array([(1, 909238, 9, 'G', '[C]')], 
      dtype=[('chrom', '<i8'), ('pos', '<i8'), ('vpos', '<i8'), ('ref', 'S10'), ('alt', 'S2000')])

Which is interesting.


In [33]:
for i in range(10):
    print selected[selected['vpos'] == intersect_orig[2*i][2]]
    print rows_on_intersect[i]


[(1, 881627, 4, 'G', '[A]')]
('exm2253575-0_B_R_1975256659', 122961, 'TGATCTCCAGCTGGATCTCCCGGTCACGCAGCTTGCGCCAGTGGCTGTAGTACAAGGTCA[A/G]GGGTGTCCCCTCTTCCCGGGTCAGCTTCTCCCAGGCTTCCTGGGGGGTTGGGGGAGTTCA', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 909238, 9, 'G', '[C]')]
('exm773-0_B_F_1921422091', 215079, 'GGGTCAGAGGCAGGCACAGACACAGGCACCGAGGGGAGTGGGCTCGAGGGCGTGGGTCCA[C/G]GTGCACTCTGCATGGCACTGAGGAACTGCAGGGAGAGCCAGGGCTGGGGCCTGGGTGGCA', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 1266738, 28, 'G', '[A]')]
('exm2745-0_T_F_1919151215', 142253, 'CCCAGTCGGGGCAGCCACCTGCCGTGCCTGTTGGAAGTTGCCTCTGCCATGCTGGGCCCT[A/G]CTGTCCTGGGCCTCAGCCTCTGGGCTCTCCTGCACCCTGGGACGGGGGCCCCATTGTGCC', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 1551927, 38, 'T', '[C]')]
('exm3962-0_B_F_1921383699', 160161, 'GCCGTCCAGTCGCCCGCCGGGCCGCCTTGAGGCTCCTGGGCTGCAGCCCTGATGCCTGGA[A/G]ACTTTGGGACTGGCCTCTAGCCTCGCTGGGCTTCCAACCCTGTGGCCGAGGGGAGATGCC', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 1650787, 54, 'T', '[C]')]
('exm4399-0_B_F_1921512567', 166994, 'CTCGGAAAGAAAAAGTTCATCACAGAAAAGATGAAAAGAGAAAAGAAAAATGTAGGCATC[A/G]TAGCCATTCAGCAGAAGGGGGTACAGAAGCATTTACTTTCTTTGACATCATGTTCAAGTG', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 1900232, 81, 'T', '[C]')]
('exm4957-0_B_F_1921459951', 175316, 'TGAGAAGCGACTCCACGCAGGGCCTTTGAGGAGGAGCAGAAGCTCAGAAAGCAGGAGATC[A/G]TCAGTCGGATTCTGAAAGAGGAGGCTGAGGAGGAAAAGAGGAAGAAACAGCATCCCCCCA', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 3329051, 104, 'G', '[A]')]
('exm6605-0_T_F_1919138473', 198747, 'GCCCTCGCCCACAACTTGCTGGTCAAGGCCGAGCCAAAGTCACCCCGGGACGCCCTCAAG[A/G]TGGGCGGCCCCAGTGCCGAGTGCCCCTTTGATCTCACCACCAAGCCCAAAGACGTGAAGC', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 3421897, 109, 'A', '[G]')]
('exm6998-0_B_R_1919190104', 204435, 'TCACAGGCCACGCCCACTGGGCAGGTGCATGCCTGCCAGCACCCCGGCCCAAAGTAGCCC[A/G]GCTCACACTCTGCAGGGCGTGAGAGAGGGGTGGGTGGGGTTAACCGACCCTGGCGCCCCC', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 5935162, 128, 'A', '[T]')]
('exm8262-0_T_R_1921578292', 222466, 'ATAGCCCCTTGTGGAGGAGAGTGTGCCTGGGACCCTGTCCACCCATGTCCTCTTGTCTGC[A/T]GGCGCAGCAGAGCGTCCGCACACAGCACTTGCGGGACCTACAGGTCATCGCCGCCTACCG', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')
[(1, 6278414, 145, 'A', '[G]')]
('exm8990-0_T_F_1921490279', 233158, 'AGGCACCCGTGGATGAGCAGTCAGAGAGTCTACAGAACACGCACGACGACAGCAGGAACA[A/G]CGCGGCCTCAGCCAGGTAAAGCAAGTCTCTCCACTGGAGAGTGTGCACGCGATGTGGCTT', 0, 'V034C742A8C79148B28CFE8D0EDAEE772D')