Python for Bioinformatics

This Jupyter notebook is intented to be used alongside the book Python for Bioinformatics

Chapter 22: Drawing Marker Positions Using Data Stored in a Database

Note: Before opening the file, this file should be accesible from this Jupyter notebook. In order to do so, the following commands will download these files from Github and extract them into a directory called samples.


In [1]:
!curl https://raw.githubusercontent.com/Serulab/Py4Bio/master/samples/samples.tar.bz2 -o samples.tar.bz2
!mkdir samples
!tar xvfj samples.tar.bz2 -C samples


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 16.5M  100 16.5M    0     0  15.6M      0  0:00:01  0:00:01 --:--:-- 15.6M
mkdir: cannot create directory 'samples': File exists
BLAST_output.xml
TAIR7_Transcripts_by_map_position.gz
pMOSBlue.txt
fishbacteria.csv
UniVec_Core.nsq
t3beta.fasta
PythonU.db
input4align.dnd
pdb1apk.ent.gz
readme.txt
contig1.ace
example.aln
hsc1.fasta
bioinfo/seqs/15721870.fasta
primers.txt
bioinfo/seqs/4586830.fasta
bioinfo/seqs/7638455.fasta
GSM188012.CEL
3seqs.fas
sampleX.fas
sampleXblast.xml
B1.csv
phd1
conglycinin.phy
bioinfo/seqs/218744616.fasta
spfile.txt
bioinfo/seqs/513419.fasta
bioinfo/seqs/513710.fasta
prot.fas
cas9align.fasta
seqA.fas
bioinfo/seqs/
bioinfo/
pdbaa
other.xml
vectorssmall.fasta
t3.fasta
a19.gp
data.csv
input4align.fasta
B1IXL9.txt
fasta22.fas
bioinfo/seqs/7415878.fasta
bioinfo/seqs/513718.fasta
bioinfo/seqs/513719.fasta
bioinfo/seqs/6598312.fasta
UniVec_Core.nin
Q5R5X8.fas
bioinfo/seqs/513717.fasta
BcrA.gp
bioinfo/seqs/2623545.fasta
bioinfo/seqs/63108399.fasta
conglycinin.dnd
NC2033.txt
fishdata.csv
uniprotrecord.xml
BLAST_output.html
Q9JJE1.xml
test3.csv
UniVec_Core.nhr
sampledata.xlsx
UniVec_Core
NC_006581.gb
conglycinin.multiple.phy
conglycinin.fasta

The following programs needs a MongoDB database in order to run. Enter your DB parameters in the line starting with CONNECTION_STRING.

Listing 22.1: createdb.py: Convert the data from a CSV file to insert it into MongoDB


In [1]:
import csv
import gzip
import os

from pymongo import MongoClient, TEXT

FILE_NAME = 'samples/TAIR7_Transcripts_by_map_position.gz'
CONNECTION_STRING = os.getenv('MONGODB_CS', 'localhost:27017')

# Get a file handler of an uncompressed file:
with gzip.open(FILE_NAME, "rt", newline="") as f_unzip:
    rows = csv.reader(f_unzip, delimiter='\t')
    next(rows) # Skip the header
    # Dictionary for storing markers and associated information:
    at_d = {}
    # Load the dictionary using the data in the file:
    for row in rows:
        if row[0] in at_d:
            chromosome, left_val, right_val = at_d[row[0]]
            c7 = int(row[7])
            left = c7 if c7<int(left_val) else left_val
            c8 = int(row[8])
            right = c8 if c8>int(right_val) else right_val
            at_d[row[0]] = (int(chromosome), left, right)
        else:
            at_d[row[0]] = (int(row[5]), int(row[7]), int(row[8]))
# Make a list with dictionaries to be stored as documents in
# MongoDB
markers = []
for marker in at_d:
    markers.append({'marker_id': marker, 'chromosome':
                    at_d[marker][0], 'start': at_d[marker][1],
                    'end': at_d[marker][2]})
client = MongoClient(CONNECTION_STRING)
db = client.pr4
collection = db.markers_map4
collection.create_index([('marker_id', TEXT)])
collection.insert_many(markers)


---------------------------------------------------------------------------
ServerSelectionTimeoutError               Traceback (most recent call last)
<ipython-input-1-f976de1277d9> in <module>()
     35 db = client.pr4
     36 collection = db.markers_map4
---> 37 collection.create_index([('marker_id', TEXT)])
     38 collection.insert_many(markers)

~/anaconda3_431/lib/python3.6/site-packages/pymongo/collection.py in create_index(self, keys, **kwargs)
   1385         keys = helpers._index_list(keys)
   1386         name = kwargs.setdefault("name", helpers._gen_index_name(keys))
-> 1387         self.__create_index(keys, kwargs)
   1388         return name
   1389 

~/anaconda3_431/lib/python3.6/site-packages/pymongo/collection.py in __create_index(self, keys, index_options)
   1292         index.update(index_options)
   1293 
-> 1294         with self._socket_for_writes() as sock_info:
   1295             cmd = SON([('createIndexes', self.name), ('indexes', [index])])
   1296             try:

~/anaconda3_431/lib/python3.6/contextlib.py in __enter__(self)
     80     def __enter__(self):
     81         try:
---> 82             return next(self.gen)
     83         except StopIteration:
     84             raise RuntimeError("generator didn't yield") from None

~/anaconda3_431/lib/python3.6/site-packages/pymongo/mongo_client.py in _get_socket(self, selector)
    760     @contextlib.contextmanager
    761     def _get_socket(self, selector):
--> 762         server = self._get_topology().select_server(selector)
    763         try:
    764             with server.get_socket(self.__all_credentials) as sock_info:

~/anaconda3_431/lib/python3.6/site-packages/pymongo/topology.py in select_server(self, selector, server_selection_timeout, address)
    208         return random.choice(self.select_servers(selector,
    209                                                  server_selection_timeout,
--> 210                                                  address))
    211 
    212     def select_server_by_address(self, address,

~/anaconda3_431/lib/python3.6/site-packages/pymongo/topology.py in select_servers(self, selector, server_selection_timeout, address)
    184                 if server_timeout == 0 or now > end_time:
    185                     raise ServerSelectionTimeoutError(
--> 186                         self._error_message(selector))
    187 
    188                 self._ensure_opened()

ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused

Listing 22.2: drawmarker.py: Draw markers in chromosomes from data extracted from a MongoDB database


In [2]:
import os
import re

from pymongo import MongoClient
from Bio.Graphics import BasicChromosome
from reportlab.lib import colors
from reportlab.lib.units import cm

CONNECTION_STRING = os.getenv('MONGODB_CS', 'localhost:27017')

def sortmarkers(crms,end):
    """ Sort markers into chromosomes """
    i = 0
    crms_o = [[] for r in range(len(end))]
    crms_fo = [[] for r in range(len(end))]
    for crm in crms:
        for marker in crm:
            # add the marker start position at each chromosome.
            crms_fo[i].append(marker[1])
        crms_fo[i].sort() # Sort the marker positions.
        i += 1
    i = 0
    for order in crms_fo:
        # Using the marker order set in crms_fo, fill crms_o
        # with all the marker information
        for pos in order:
            for mark in crms[i]:
                if pos==mark[1]:
                    crms_o[i].append(mark)
        i += 1
    return crms_o

def getchromo(crms_o, end):
    """ From an ordered list of markers, generate chromosomes.
    """
    chromo = [[] for r in range(len(end))]
    i = 0
    for crm_o in crms_o:
        j = 0
        if len(crm_o)>1:
            for mark in crm_o:
                if mark==crm_o[0]: #first marker
                    chromo[i].append(('', None, mark[1]))
                    chromo[i].append((mark[0], colors.red,
                                      mark[2]-mark[1]))
                    ant = mark[2]
                elif mark==crm_o[-1]: #last marker
                    chromo[i].append(('', None, mark[1]-ant))
                    chromo[i].append((mark[0], colors.red,
                                      mark[2]-mark[1]))
                    chromo[i].append(('', None, end[i]-mark[2]))
                else:
                    chromo[i].append(('', None, mark[1]-ant))
                    chromo[i].append((mark[0], colors.red,
                                      mark[2]-mark[1]))
                    ant=mark[2]
        elif len(crm_o)==1: # For chromosomes with one marker
            chromo[i].append(('', None, crm_o[0][1]))
            chromo[i].append((crm_o[0][0], colors.red,
                              crm_o[0][2]-crm_o[0][1]))
            chromo[i].append(('', None, end[i]-crm_o[0][2]))
        else:
            # For chromosomes without markers
            # Add 3% of each chromosome.
            chromo[i].append(('', None, int(0.03*end[i])))
            chromo[i].append(('', None, end[i]))
            chromo[i].append(('', None, int(0.03*end[i])))
        i += 1
        j += 1
    return chromo

def addends(chromo):
    """ Adds a 3% of blank region at both ends for better
        graphic output.
    """
    size = 0
    for x in chromo:
        size += x[2]
    # get 3% of size of each chromosome:
    endsize = int(float(size)*.03)
    # add this size to both ends in chromo:
    chromo.insert(0,('', None, endsize))
    chromo.append(('', None, endsize))
    return chromo

def load_chrom(chr_name):
    """ Generate a chromosome with information
    """
    cur_chromosome = BasicChromosome.Chromosome(chr_name[0])
    chr_segment_info = chr_name[1]

    for seg_info_num in range(len(chr_segment_info)):
        label, color, scale = chr_segment_info[seg_info_num]
        # make the top and bottom telomeres
        if seg_info_num == 0:
            cur_segment = BasicChromosome.TelomereSegment()
        elif seg_info_num == len(chr_segment_info) - 1:
            cur_segment = BasicChromosome.TelomereSegment(1)
        # otherwise, they are just regular segments
        else:
            cur_segment = BasicChromosome.ChromosomeSegment()
        cur_segment.label = label
        cur_segment.label_size = 12
        cur_segment.fill_color = color
        cur_segment.scale = scale
        cur_chromosome.add(cur_segment)

    cur_chromosome.scale_num = max(END) + (max(END)*.04)
    return cur_chromosome

def dblookup(atgids):
    """ Code to retrieve all marker data from name using MongoDB
    """
    client = MongoClient(CONNECTION_STRING)
    db = client.pr4
    collection = db.markers_map4
    markers = []
    for marker in atgids:
        mrk = collection.find_one({'marker_id': marker})
        if mrk:
            markers.append((marker, (mrk['chromosome'],
                            mrk['start'], mrk['end'])))
        else:
            print('Marker {0} is not in the DB'.format(marker))
    return markers

# Size of each chromosome:
END = (30427563, 19696817, 23467989, 18581571, 26986107)
gids = []
rx_rid = re.compile('^AT[1-5]G\d{5}$')
print('''Enter AT ID or press 'enter' to stop entering IDs.
Valid IDs:
AT2G28000
AT3G03020
Also you can enter DBDEMO to use predefined set of markers
fetched from a MongoDB database. Enter NODBDEMO to use a
predefined set of markers without database access.''')
while True:
    rid = input('Enter Gene ID: ')
    if not rid:
        break
    if rid=='DBDEMO':
        gids = ['AT3G14890','AT1G66160','AT3G55260','AT5G59570',
                'AT4G32551','AT1G01430','AT4G26000','AT2G28000',
                'AT5G21090','AT5G10470']
        break
    elif rid=='NODBDEMO':
        samplemarkers=[('AT3G14890', (3, 5008749, 5013275)),
                       ('AT1G66160', (1, 24640827, 24642411)),
                       ('AT3G55260', (3, 20500225, 20504056)),
                       ('AT1G10960', (1, 3664385, 3665040)),
                       ('AT5G23350', (5, 7857646, 7859280)),
                       ('AT5G15250', (5, 4950414, 4952780)),
                       ('AT1G55700', (1, 20825263, 20827306)),
                       ('AT5G21090', (5, 7164583, 7167257)),
                       ('AT5G10470', (5, 3289228, 3297249)),
                       ('AT2G28000', (2, 11933524, 11936523)),
                       ('AT3G03020', (3, 680920, 682009)),
                       ('AT4G26000', (4, 13197255, 13199845)),
                       ('AT4G32551', (4, 15707516, 15713587))]
        break
    if rx_rid.match(rid):
        gids.append(rid)
    else:
        print("Bad format, please enter it again")

if rid!='NODBDEMO':
    samplemarkers = dblookup(gids)

crms = [[] for r in range(len(END))]
for x in samplemarkers:
    crms[int(x[1][0])-1].append((x[0], x[1][1], x[1][2]))

crms_o = sortmarkers(crms, END)
chromo = getchromo(crms_o, END)
all_chr_info = [('Chr I', chromo[0]), ('Chr II', chromo[1]),
                ('Chr III', chromo[2]), ('Chr IV', chromo[3]),
                ('Chr V', chromo[4])]

organism = BasicChromosome.Organism()
organism.page_size = (29.7*cm, 21*cm) #A4 landscape
for chr_info in all_chr_info:
    newcrom = (chr_info[0], addends(chr_info[1]))
    organism.add(load_chrom(newcrom))

organism.draw('at.pdf','Arabidopsis thaliana')


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
~/anaconda3_431/lib/python3.6/site-packages/Bio/Graphics/__init__.py in <module>()
     10 try:
---> 11     import reportlab as r
     12     del r

ModuleNotFoundError: No module named 'reportlab'

During handling of the above exception, another exception occurred:

MissingPythonDependencyError              Traceback (most recent call last)
<ipython-input-2-5319f3a06955> in <module>()
      3 
      4 from pymongo import MongoClient
----> 5 from Bio.Graphics import BasicChromosome
      6 from reportlab.lib import colors
      7 from reportlab.lib.units import cm

~/anaconda3_431/lib/python3.6/site-packages/Bio/Graphics/__init__.py in <module>()
     14     from Bio import MissingPythonDependencyError
     15     raise MissingPythonDependencyError(
---> 16         "Please install ReportLab if you want "
     17         "to use Bio.Graphics. You can find ReportLab at "
     18         "http://www.reportlab.com/software/opensource/")

MissingPythonDependencyError: Please install ReportLab if you want to use Bio.Graphics. You can find ReportLab at http://www.reportlab.com/software/opensource/