The goal of this example is to show how genotyping information held in the PLINK ped format can be loaded into omero.biobank.
We will first do some experiments with the data, just to understand its structure, and then build the appropriate omero.biobank objects.
At the end of the exercise, we will show how to extract ped files for sub-pedigrees and subsets of markers.
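For reference, here is a minimal sketch of the two file layouts involved; the sample lines below are made up for illustration, not taken from the actual dataset. Each .map line carries chromosome, rs label, genetic distance and base-pair position; each .ped line starts with six pedigree columns (family, individual, father, mother, sex, affection status) followed by one allele pair per marker.
In [ ]:
# Hypothetical example lines (not real data):
map_line = '20\trs6422346\t0\t123456'    # chrom, rs label, cM, bp position
ped_line = 'FAM1 IND1 0 0 1 2 A G C C'   # 6 pedigree columns + allele pairs
print map_line.split('\t')
print ped_line.split()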
In [1]:
ped_fname='/var/tmp/T2D_28102013/T2D_28102013_CogoniQC.ped'
map_fname='/var/tmp/T2D_28102013/T2D_28102013_CogoniQC.map'
fam_fname='/var/tmp/T2D_28102013/T2D_28102013_CogoniQC.fam'
pheno_fname='/var/tmp/T2D_28102013/T2DPheno_28102013.txt'
We will be using the ped reading utilities present in bl/vl/genotype/io.
In [2]:
from bl.vl.genotype.io import MapReader, PedLineParser
import csv
class PedReader(object):
    def __init__(self, map_fname, ped_fname):
        self.ped_fname = ped_fname
        self.map_data = [x for x in MapReader(open(map_fname))]
        # we assume 1 column for affection status
        # and then len(map_data) markers
        dat_types = ['A'] + ['M'] * len(self.map_data)
        self.pedline_parser = PedLineParser(dat_types)
    def __iter__(self):
        labels = [x[1] for x in self.map_data]
        with open(self.ped_fname) as f:
            for l in f:
                data = self.pedline_parser.parse(l)
                yield data[0:6], dict(zip(labels, data[6:]))
In [3]:
ped_reader = PedReader(map_fname, ped_fname)
In [4]:
for i, d in enumerate(ped_reader):
    if i > 2:
        break
    print [(k, val) for k, val in d[1].iteritems()][:3]
Next, we will check the rs labels against dbSNP139. This is done with an external program/Galaxy application. FIXME it would be nice if we could invoke the workflow directly from here.
In [5]:
rs_label_fname = '/var/tmp/rs_list.tsv'
rs_extra_fname = '/var/tmp/rs_extracted.tsv'
with open(rs_label_fname, 'w') as f:
    writer = csv.DictWriter(f, fieldnames=['label'], delimiter='\t')
    writer.writeheader()
    for r in ped_reader.map_data:
        writer.writerow({'label': r[1]})
In [ ]:
# python ~/work/vs/git/biobank/tools/snp_manager manage_db --db-file dbSNP139.db \
#   --join /var/tmp/rs_list.tsv > /var/tmp/rs_extracted.tsv
In [6]:
def read_rs_labels(fname):
    with open(fname) as f:
        reader = csv.DictReader(f, delimiter='\t')
        rs_labels = [r['label'] for r in reader]
    return rs_labels
rs_list = set(read_rs_labels(rs_label_fname))
rs_ext = set(read_rs_labels(rs_extra_fname))
len(rs_list), len(rs_ext)
Out[6]:
It appears that only a subset of the rs labels actually exists in dbSNP139.
In [7]:
[x for x in (rs_list - rs_ext)][:4]
Out[7]:
In [8]:
from IPython.display import HTML
HTML("""<iframe src=http://www.ncbi.nlm.nih.gov/snp/?term=rs6422346
width=1000 height=400></iframe>""")
Out[8]:
In rs_extra_fname we now have compact, absolute marker definitions that we will use to define the markers array in omero.biobank.
In [8]:
import itertools as it
In [9]:
with open(rs_extra_fname) as f:
    for l in it.islice(f, 3):
        print l,
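Before going further, it can be useful to peek at the column names of the extracted file; the code below relies on (at least) the 'label' and 'mask' fields being present.
In [ ]:
# Show the header of the extracted file.
with open(rs_extra_fname) as f:
    print csv.DictReader(f, delimiter='\t').fieldnames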
The allele order in the mask defines which allele is A and which is B. We now need to extract the allele order from each mask so that we can correctly convert the genotype calls to probabilities.
In [10]:
import bl.vl.utils.snp as snp_utils
alleles_of_rs = {}
ordered_rs_labels = []
prob_profiles = []
with open(rs_extra_fname) as f:
    for r in csv.DictReader(f, delimiter='\t'):
        _, alleles, _ = snp_utils.split_mask(r['mask'])
        alleles_of_rs[r['label']] = alleles
        ordered_rs_labels.append(r['label'])
        prob_profiles.append({(alleles[0], alleles[0]): [1, 0],
                              (alleles[0], alleles[1]): [0, 0],
                              (alleles[1], alleles[0]): [0, 0],
                              (alleles[1], alleles[1]): [0, 1],
                              ('0', '0'): [1/3., 1/3.]})
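As a quick sanity check of the convention, we can split a made-up mask directly; this assumes the usual FLANK[A/B]FLANK mask notation, consistently with how split_mask is used above.
In [ ]:
# split_mask returns (left flank, (allele_A, allele_B), right flank);
# the mask string here is a hypothetical example.
lflank, alleles, rflank = snp_utils.split_mask('ACGT[A/G]TTCA')
print alleles  # the first element is the A allele, the second the B allele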
With this information we can now convert the calls to the GDO (probability arrays) format. FIXME Note that we are setting the confidence to 0.
In [11]:
import numpy as np

def convert_to_array(rs_labels, prob_profiles, data):
    N = len(rs_labels)
    prob = np.zeros((2, N), dtype=np.float32)
    conf = np.zeros((N,), dtype=np.float32)
    for i in xrange(N):
        prob[:, i] = prob_profiles[i][tuple(data[rs_labels[i]])]
    return prob, conf

ped_reader = PedReader(map_fname, ped_fname)
for r in it.islice(ped_reader, 3):
    prob, conf = convert_to_array(ordered_rs_labels, prob_profiles, r[1])
    print prob[:, :10]
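A small check on the encoding: prob[0] holds the probability of a homozygous A call, prob[1] that of a homozygous B call, and the heterozygous probability is the implicit remainder, so each column should sum to at most 1.
In [ ]:
# For called genotypes P(AB) = 1 - prob[0] - prob[1]; for no-calls the
# remaining mass is 1/3 (see the prob_profiles definition above).
print (prob.sum(axis=0) <= 1).all()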
Ok, now we are ready to upload data into biobank.
In [39]:
import sys, os
from bl.vl.kb import KnowledgeBase
OME_HOST = os.getenv('OME_HOST', 'localhost')
OME_USER = os.getenv('OME_USER', 'test')
OME_PASSWD = os.getenv('OME_PASSWD', 'test')
CHECK_OME_VERSION = os.getenv('CHECK_OME_VERSION', "True") == "True"
BaseProxy = KnowledgeBase(driver='omero')
class Proxy(BaseProxy):
    def get_objects_dict(self, klass):
        return dict((o.label, o) for o in super(Proxy, self).get_objects(klass))
kb = Proxy(OME_HOST, OME_USER, OME_PASSWD, check_ome_version=CHECK_OME_VERSION)
kb.connect()
kb.start_keep_alive()
def cleanup():
print "# disconnecting the kb"
kb.disconnect()
sys.exitfunc = cleanup
print
print "### KB ENV PRELOADED ###"
print "# connected to %s" % OME_HOST
print "# knowledge base: kb"
print "# extra method: kb.get_objects_dict"
print "########################"
In the following, we will define a new set of individuals and enroll them in a study. Note that studyCode is the code assigned to each individual within a specific study. After the definition, we will run a consistency check. NOTE: this is not the fastest way to load individuals; see the FIXME importer code for a more efficient implementation.
In [13]:
def family_reader(N=None):
    fieldnames = ['FID', 'IID', 'Father', 'Mother', 'sex', 'T2D']
    reader = csv.DictReader(open(fam_fname), fieldnames=fieldnames, delimiter='\t')
    return reader if N is None else it.islice(reader, 0, N)
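A quick look at the first record, just to confirm the field mapping; this only consumes one line of the .fam file.
In [ ]:
for r in family_reader(1):
    print r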
In [14]:
def load_family_in_biobank(action, study, reader):
    gender_map = {'1': kb.Gender.MALE, '2': kb.Gender.FEMALE}
    by_label = {}
    for r in reader:
        conf = {'gender': gender_map[r['sex']], 'action': action}
        if r['Father'] != '0':
            conf['father'] = by_label[r['Father']]
        if r['Mother'] != '0':
            conf['mother'] = by_label[r['Mother']]
        i = kb.factory.create(kb.Individual, conf).save()
        by_label[r['IID']] = i
        conf = {'study': study, 'individual': i,
                'studyCode': r['IID']}
        kb.factory.create(kb.Enrollment, conf).save()
    return by_label
Rather than loading the whole dataset, we will try with a small subset of only 10 individuals. Note that load_family_in_biobank assumes that parents appear in the .fam file before their children, since it resolves father and mother references through by_label as it scans.
In [15]:
study_label = 'ped_file_round_trip'
reader = family_reader(10)
study = kb.get_study(study_label)
if study is None:
    # we assume that if we do not have a study, we don't have the individuals.
    study = kb.factory.create(kb.Study, {'label': study_label}).save()
    action = kb.create_an_action()
    by_label = load_family_in_biobank(action, study, reader)
else:
    by_label = dict([(e.studyCode, e.individual) for e in kb.get_enrolled(study)])
    action = (by_label.values()[0]).action  # hack !@@@@!!!
In [16]:
import uuid
def add_genotype_data_sample(mset, action, probs, confs):
    conf = {'label': uuid.uuid4().hex,
            'status': kb.DataSampleStatus.USABLE,
            'action': action,
            'snpMarkersSet': mset}
    sample = kb.factory.create(kb.GenotypeDataSample, conf).save()
    kb.genomics.add_gdo_data_object(action, sample, probs, confs)

def convert_to_gdo(mset, ped_reader, rs_labels, by_label):
    for r in ped_reader:
        probs, confs = convert_to_array(rs_labels, prob_profiles, r[1])
        action = kb.create_an_action(target=by_label[r[0][1]])
        add_genotype_data_sample(mset, action, probs, confs)
In [17]:
N = 10
label, maker, model, version = ('T2D_28102013',
                                'ICL', 'Cogoni', 'v1_a')
ped_reader = it.islice(PedReader(map_fname, ped_fname), N)
mset = kb.genomics.get_markers_array(label)
if mset is None:
    stream = csv.DictReader(open(rs_extra_fname), delimiter='\t')
    mset = kb.genomics.create_markers_array(
        label, maker, model, version, stream, action)
convert_to_gdo(mset, ped_reader, ordered_rs_labels, by_label)
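As a sanity check, the markers array should contain one row per extracted marker definition; this only uses the get_markers_array_rows call that we also need further below.
In [ ]:
# The two counts should match.
rows = kb.genomics.get_markers_array_rows(mset)
print len(rows), len(ordered_rs_labels)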
In [18]:
def create_support_nodes(mset, ped_reader):
    pos_by_rs = dict([(x[1], (x[0], x[3])) for x in ped_reader.map_data])
    rows = kb.genomics.get_markers_array_rows(mset)
    return np.array([pos_by_rs[r['label']] for r in rows],
                    kb.VariantCallSupport.NODES_DTYPE)

def get_vcs(mset, ped_reader, ref_genome, action):
    vcs_label = mset.label + '+' + ref_genome.label
    vcs = kb.genomics.get_vcs_by_label(vcs_label)
    if vcs is None:
        print 'need to create'
        nodes = create_support_nodes(mset, ped_reader)
        vcs = kb.genomics.create_vcs(mset, ref_genome, nodes, action)
        vcs.label = vcs_label
        vcs.save()
    return vcs

def get_ref_genome(ref_gen_label, action):
    ref_genome = kb.get_by_label(kb.ReferenceGenome, ref_gen_label)
    if not ref_genome:
        conf = {'nChroms': 26,
                'maker': 'GRC', 'model': 'h37', 'release': '1',
                'label': ref_gen_label,
                'status': kb.DataSampleStatus.USABLE,
                'action': action}
        ref_genome = kb.factory.create(kb.ReferenceGenome, conf).save()
    return ref_genome
In [19]:
ref_gen_label = 'GRCh37.1'
ped_reader = PedReader(map_fname, ped_fname)
# we are re-using action because this is a mock-up
# do not do it in real code!
ref_genome = get_ref_genome(ref_gen_label, action)
vcs = get_vcs(mset, ped_reader, ref_genome, action)
We can now restrict the variant call support to a genomic window; here we select the nodes that fall on chromosome 20 (positions 1 through 100000000).
In [20]:
vcs_selected = vcs.selection(((20, 1), (20, 100000000)))
In [21]:
len(vcs_selected)
Out[21]:
The 'origin' field records, for each node of the support, where it came from (here, the markers array and the corresponding row index).
In [22]:
origin = vcs_selected.get_field('origin')
In [23]:
origin[:10]
Out[23]:
In [24]:
data_sample_by_id = dict(
    [(e.individual.id,
      kb.get_data_samples(e.individual,
                          data_sample_klass_name='GenotypeDataSample').next())
     for e in kb.get_enrolled(study)])
In [25]:
data_sample_by_id
Out[25]:
In [26]:
from bl.vl.genotype.io import PedWriter
ped_writer = PedWriter(vcs_selected, base_path='foo_ped')
ped_writer.write_map()
ped_writer.write_family(study.label,
                        [e.individual for e in kb.get_enrolled(study)],
                        data_sample_by_id=data_sample_by_id)
ped_writer.close()
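Assuming PedWriter derives its output file names from base_path (an assumption on our part, not something verified here), the generated files should now be visible on disk.
In [ ]:
# List whatever the writer produced under the 'foo_ped' prefix.
import glob
print glob.glob('foo_ped*')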
In [27]:
def pheno_reader(N=None):
    reader = csv.DictReader(open(pheno_fname), delimiter='\t')
    return reader if N is None else it.islice(reader, 0, N)
In [32]:
for r in pheno_reader(3):
    print r
Here we have a problem: the phenotype data has an 'age' column, and it is unclear whether it is the age of onset of the disease or simply the individual's age when the sample was collected. For the time being, we will only record whether the individual has a diagnosis of type 2 diabetes. In a later example we will consider BMI and age details.
In [37]:
DIAGNOSIS = 'openEHR-EHR-EVALUATION.problem-diagnosis.v1'
DIAGNOSIS_TERM = 'at0002.1'
DIABETES_TYPE_2 = 'icd10-cm:E11'
DIAGNOSIS_AGE_AT_ONSET_TERM = 'at0004'
#--
EXCLUSION = 'openEHR-EHR-EVALUATION.exclusion-problem_diagnosis.v1'
EXCLUSION_FIELD = 'at0002.1' # ????
NO_SIGNIFICANT_MEDICAL_HISTORY = 'local:at0.3' # This is a LOCAL code
import time
def save_clinical_record(individual, archetype, fields):
    # hack! fill in a minimal timestamp and action just to get things going
    timestamp = int(time.time() * 1000)
    action = kb.create_an_action(target=individual).save()
    kb.add_ehr_record(action, timestamp, archetype, fields)

affected_code = '2'

def save_clinical_records(reader, by_label):
    for r in reader:
        if r['IID'] in by_label:
            if r['T2D'] == affected_code:
                archetype = DIAGNOSIS
                fields = {DIAGNOSIS_TERM: DIABETES_TYPE_2}
            else:
                archetype = EXCLUSION
                fields = {EXCLUSION_FIELD: NO_SIGNIFICANT_MEDICAL_HISTORY}
            save_clinical_record(by_label[r['IID']], archetype, fields)
In [41]:
for e in kb.get_enrolled(study):
    assert e.individual == by_label[e.studyCode]
In [43]:
save_clinical_records(pheno_reader(), by_label)
In [51]:
ehr = kb.get_ehr(by_label.values()[0])
ehr.recs
Out[51]:
In [55]:
pheno_by_id = {}
for k, i in by_label.iteritems():
    ehr = kb.get_ehr(i)
    pheno_by_id[i.id] = '2' if ehr.matches(archetype=DIAGNOSIS,
                                           field=DIAGNOSIS_TERM,
                                           value=DIABETES_TYPE_2) else '0'
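A quick distribution check on the phenotype labels we just computed.
In [ ]:
# Count affected ('2') vs. unaffected ('0') individuals.
from collections import Counter
print Counter(pheno_by_id.values())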
In [56]:
ped_writer = PedWriter(vcs_selected, base_path='foo_ped_with_pheno')
ped_writer.write_map()
ped_writer.write_family(study.label,
                        [e.individual for e in kb.get_enrolled(study)],
                        data_sample_by_id=data_sample_by_id,
                        phenotype_by_id=pheno_by_id)
ped_writer.close()
This example shows how one can use vl to select a cohort of individuals.
The basic idea is that the selected individuals, chosen e.g. by phenotype and age, are enrolled in an ad-hoc study.
For instance, in this example we will select an affected group and a control group with the same male/female proportion (see the sketch at the end of this section).
FIXME extend the example with age at onset.
In [ ]:
DIAGNOSIS = 'openEHR-EHR-EVALUATION.problem-diagnosis.v1'
DIAGNOSIS_TERM = 'at0002.1'
DIABETES_TYPE_1 = 'icd10-cm:E10'
#--
EXCLUSION = 'openEHR-EHR-EVALUATION.exclusion-problem_diagnosis.v1'
EXCLUSION_FIELD = 'at0002.1' # ????
def get_ehr_iterator():
    inds = kb.get_objects(kb.Individual)
    inds_by_vid = dict([(i.id, i) for i in inds])
    for e in kb.get_ehr_iterator():
        if e[0] not in inds_by_vid:
            # FIXME needed for potential stray records left behind by testing
            continue
        yield (inds_by_vid[e[0]], e[1])

# A more sophisticated example would keep only the records where the
# age at onset is between 10y and 15y:
# for i, ehr in get_ehr_iterator():
#     if (ehr.matches(DIAGNOSIS, DIAGNOSIS_TERM, DIABETES_TYPE_1)
#             and ehr.matches(DIAGNOSIS, DIAGNOSIS_AGE_AT_ONSET_TERM,
#                             ("10y", "15y"))):
#         affected.append(i)
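To close the loop on the male/female balancing promised above, here is a minimal sketch. The use of Individual.gender and its comparison against kb.Gender values is an assumption about the model, and the balancing policy (downsampling each gender to the smaller group's count) is just one possible choice.
In [ ]:
import random

def select_balanced_groups():
    # Partition individuals into affected and unaffected using their
    # EHR records (same archetype constants as above).
    affected, unaffected = [], []
    for i, ehr in get_ehr_iterator():
        if ehr.matches(archetype=DIAGNOSIS, field=DIAGNOSIS_TERM,
                       value=DIABETES_TYPE_1):
            affected.append(i)
        else:
            unaffected.append(i)
    # Split each group by gender; comparing i.gender with kb.Gender.MALE
    # is an assumption about the Individual model.
    def split(group):
        males = [i for i in group if i.gender == kb.Gender.MALE]
        females = [i for i in group if i.gender != kb.Gender.MALE]
        return males, females
    a_m, a_f = split(affected)
    u_m, u_f = split(unaffected)
    # Keep the same number of males (and of females) in both groups, so
    # that the two groups end up with the same male/female proportion.
    n_m = min(len(a_m), len(u_m))
    n_f = min(len(a_f), len(u_f))
    return (random.sample(a_m, n_m) + random.sample(a_f, n_f),
            random.sample(u_m, n_m) + random.sample(u_f, n_f))
The two returned lists could then be enrolled in an ad-hoc study, exactly as done with kb.Enrollment earlier in this example.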
In [ ]: