XMAP plotter

Imports


In [172]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import operator
import re
from collections import defaultdict

%pylab inline


Populating the interactive namespace from numpy and matplotlib

Definitions


In [173]:
fileUrl = "../S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt"
MIN_CONF = 10.0
FULL_FIG_W , FULL_FIG_H  = 16, 8
CHROM_FIG_W, CHROM_FIG_H = FULL_FIG_W, 20

Setup

Figure sizes controller


In [174]:
class size_controller(object):
    """Context manager that temporarily overrides the default figure size."""
    def __init__(self, w, h):
        self.w = w
        self.h = h

    def __enter__(self):
        # remember the current default, then install the temporary size
        self.o = plt.rcParams['figure.figsize']
        plt.rcParams['figure.figsize'] = self.w, self.h
        return None

    def __exit__(self, exc_type, exc_value, traceback):
        # restore the original default, even if the block raised
        plt.rcParams['figure.figsize'] = self.o
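
A minimal usage sketch: plotting done inside the with block uses the temporary figure size, and the previous default is restored on exit.


In [ ]:
# Hypothetical usage of size_controller with the sizes defined above.
with size_controller(FULL_FIG_W, FULL_FIG_H):
    plt.plot([0, 1], [0, 1])   # placeholder plot drawn at the temporary size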

Column type definition


In [175]:
col_type_int = np.int64
col_type_flo = np.float64
col_type_str = object      # np.object is deprecated; plain object is equivalent here
col_info = [
    [ "XmapEntryID" , col_type_int ],
    [ "QryContigID" , col_type_int ],
    [ "RefContigID" , col_type_int ],
    [ "QryStartPos" , col_type_flo ],
    [ "QryEndPos"   , col_type_flo ],
    [ "RefStartPos" , col_type_flo ],
    [ "RefEndPos"   , col_type_flo ],
    [ "Orientation" , col_type_str ],
    [ "Confidence"  , col_type_flo ],
    [ "HitEnum"     , col_type_str ],
    [ "QryLen"      , col_type_flo ],
    [ "RefLen"      , col_type_flo ],
    [ "LabelChannel", col_type_str ],
    [ "Alignment"   , col_type_str ],
]

col_names = [cf[0] for cf in col_info]
col_types = dict(col_info)
col_types


Out[175]:
{'Alignment': object,
 'Confidence': numpy.float64,
 'HitEnum': object,
 'LabelChannel': object,
 'Orientation': object,
 'QryContigID': numpy.int64,
 'QryEndPos': numpy.float64,
 'QryLen': numpy.float64,
 'QryStartPos': numpy.float64,
 'RefContigID': numpy.int64,
 'RefEndPos': numpy.float64,
 'RefLen': numpy.float64,
 'RefStartPos': numpy.float64,
 'XmapEntryID': numpy.int64}

In [176]:
# CONVERTERS is not used by the read_csv call below; filter_conv was defined
# elsewhere in the original notebook, so the dict is kept only as a reference.
#CONVERTERS = {
#    'info': filter_conv
#}
SKIP_ROWS = 9      # number of '#' header lines in the XMAP file
NROWS     = None   # limit rows for quick tests; None reads everything
gffData = pd.read_csv(fileUrl, names=col_names, index_col='XmapEntryID', dtype=col_types, header=None, skiprows=SKIP_ROWS, delimiter="\t", comment="#", verbose=True, nrows=NROWS)
gffData.head()


Tokenization took: 8.72 ms
Type conversion took: 20.52 ms
Parser memory cleanup took: 0.01 ms
Out[176]:
QryContigID RefContigID QryStartPos QryEndPos RefStartPos RefEndPos Orientation Confidence HitEnum QryLen RefLen LabelChannel Alignment
XmapEntryID
1 141 1 528400.6 571697.5 10672.0 54237.5 + 6.65 4M2D2M 1439123.5 21805821 1 (1,34)(2,34)(3,35)(4,36)(5,37)(6,38)(8,38)(9,39)
2 174 1 21236.5 1568390.0 10672.0 1553561.0 + 79.35 2M3D1M1D1M1D4M1I2M1D2M1D1M2I2D9M3I3M1D6M1D2M2D... 1568410.0 21805821 1 (1,2)(2,2)(3,3)(6,4)(7,4)(9,5)(11,6)(12,7)(13,...
3 1129 1 444677.7 340490.1 52972.0 154578.0 - 9.08 4M1D2M1D2M 650802.9 21805821 1 (6,27)(8,27)(9,26)(10,25)(11,24)(13,23)(14,22)...
4 42 1 1439722.2 1522476.2 54237.5 135860.0 + 7.53 6M1D1M 2153950.0 21805821 1 (7,99)(9,99)(10,100)(11,101)(12,102)(13,103)(1...
5 1752 1 210082.7 158973.4 55635.0 106448.0 - 6.59 5M 703838.0 21805821 1 (10,15)(11,14)(12,13)(13,12)(14,11)
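
SKIP_ROWS is hard-coded to 9, which matches this particular file. If the number of '#' header lines ever varies between XMAP versions, it can be counted instead; a minimal sketch, assuming every header line starts with '#' (count_header_lines is not part of the original notebook).


In [ ]:
# Hypothetical helper: count the leading '#' header lines of the XMAP file
# so skiprows does not have to be hard-coded.
def count_header_lines(path, comment="#"):
    n = 0
    with open(path) as fh:
        for line in fh:
            if line.startswith(comment):
                n += 1
            else:
                break
    return n

# SKIP_ROWS = count_header_lines(fileUrl)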

Add length column


In [177]:
gffData['qry_match_len'] = abs(gffData['QryEndPos'] - gffData['QryStartPos'])
gffData['ref_match_len'] = abs(gffData['RefEndPos'] - gffData['RefStartPos'])
gffData['match_prop'   ] = gffData['qry_match_len'] / gffData['ref_match_len']
gffData = gffData[gffData['Confidence'] >= MIN_CONF]
del gffData['LabelChannel']
gffData.head()


Out[177]:
QryContigID RefContigID QryStartPos QryEndPos RefStartPos RefEndPos Orientation Confidence HitEnum QryLen RefLen Alignment qry_match_len ref_match_len match_prop
XmapEntryID
2 174 1 21236.5 1568390.0 10672 1553561 + 79.35 2M3D1M1D1M1D4M1I2M1D2M1D1M2I2D9M3I3M1D6M1D2M2D... 1568410.0 21805821 (1,2)(2,2)(3,3)(6,4)(7,4)(9,5)(11,6)(12,7)(13,... 1547153.5 1542889 1.002764
32 430 1 20.0 651018.8 1493902 2123769 + 48.60 9M1D1M4D2M1I1D4M2D3M1I1D1M1D1M1I1D6M1D6M1D4M1D... 651038.8 21805821 (161,1)(162,2)(163,3)(164,4)(165,5)(166,6)(167... 650998.8 629867 1.033550
49 877 1 185486.6 20.0 2289130 2470486 - 13.00 3M1I1D2M2D6M2D2M1I1D2M 803876.2 21805821 (236,17)(237,16)(238,15)(240,13)(241,12)(244,1... 185466.6 181356 1.022666
53 563 1 614521.1 20.0 2470486 3076619 - 39.72 2M1I1D1M1I5M1D5M1D1M1D4M2D1M1D3M1D3M1D2M1D1M1D... 614541.2 21805821 (256,43)(257,42)(259,40)(260,38)(261,37)(262,3... 614501.1 606133 1.013806
63 276 1 599215.1 211191.6 3450761 3837771 - 18.32 1M1I3M3D5M2D1M4D2M1D2M1I2D2M2D3M1D4M 1006781.1 21805821 (335,47)(336,45)(337,44)(338,43)(342,42)(343,4... 388023.5 387010 1.002619
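
A quick check that query and reference match lengths broadly agree: match_prop should sit close to 1 for well-behaved alignments.


In [ ]:
# Distribution of the query/reference match-length ratio after filtering.
gffData['match_prop'].describe()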

In [178]:
re_matches    = re.compile(r"(\d+)M")
re_insertions = re.compile(r"(\d+)I")
re_deletions  = re.compile(r"(\d+)D")
def process_cigar(cigar):
    """
    2M3D1M1D1M1D4M1I2M1D2M1D1M2I2D9M3I3M1D6M1D2M2D1M1D6M1D1M1D1M2D2M2D1M1I1D1M1D5M2D4M2D1M2D2M1D2M1D3M1D1M1D2M3I3D1M1D1M3D2M3D1M2I1D1M2D1M1D1M1I2D3M2I1M1D2M1D1M1D1M2I3D3M3D1M2D1M1D1M1D5M2D12M
    """
    assert(set([x for x in cigar]) <= set(['M', 'D', 'I', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']))

    cigar_matches    = 0
    cigar_insertions = 0
    cigar_deletions  = 0

    i_matches = re_matches   .finditer(cigar)
    i_inserts = re_insertions.finditer(cigar)
    i_deletes = re_deletions .finditer(cigar)

    for i in i_matches:
        n                 = i.group(1)
        cigar_matches    += int(n)

    for i in i_inserts:
        n                 = i.group(1)
        cigar_insertions += int(n)

    for i in i_deletes:
        n                 = i.group(1)
        cigar_deletions  += int(n)

    return cigar_matches, cigar_insertions, cigar_deletions

gffData[['cigar_matches', 'cigar_insertions', 'cigar_deletions']] = gffData['HitEnum'].apply(process_cigar).apply(pd.Series)
del gffData['HitEnum']

gffData.head()


Out[178]:
QryContigID RefContigID QryStartPos QryEndPos RefStartPos RefEndPos Orientation Confidence QryLen RefLen Alignment qry_match_len ref_match_len match_prop cigar_matches cigar_insertions cigar_deletions
XmapEntryID
2 174 1 21236.5 1568390.0 10672 1553561 + 79.35 1568410.0 21805821 (1,2)(2,2)(3,3)(6,4)(7,4)(9,5)(11,6)(12,7)(13,... 1547153.5 1542889 1.002764 105 17 63
32 430 1 20.0 651018.8 1493902 2123769 + 48.60 651038.8 21805821 (161,1)(162,2)(163,3)(164,4)(165,5)(166,6)(167... 650998.8 629867 1.033550 52 4 15
49 877 1 185486.6 20.0 2289130 2470486 - 13.00 803876.2 21805821 (236,17)(237,16)(238,15)(240,13)(241,12)(244,1... 185466.6 181356 1.022666 15 2 6
53 563 1 614521.1 20.0 2470486 3076619 - 39.72 614541.2 21805821 (256,43)(257,42)(259,40)(260,38)(261,37)(262,3... 614501.1 606133 1.013806 41 2 19
63 276 1 599215.1 211191.6 3450761 3837771 - 18.32 1006781.1 21805821 (335,47)(336,45)(337,44)(338,43)(342,42)(343,4... 388023.5 387010 1.002619 23 2 15
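
A quick sanity check on process_cigar, using the HitEnum of the first raw alignment (4M2D2M): 4 + 2 matched labels, no insertions, two deletions.


In [ ]:
# Expected result: (6, 0, 2) -> matches, insertions, deletions
process_cigar("4M2D2M")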

In [179]:
re_alignment  = re.compile(r"\((\d+),(\d+)\)")

def process_alignment(alignment):
    """
     Alignment (4862,48)(4863,48)(4864,47)(4865,46)(4866,45)(4867,44)(4870,43)(4873,42)(4874,41)(4875,40)(4877,40)(4878,39)(4879,38)(4880,37)(4883,36)(4884,36)(4885,35)(4886,34)(4887,33)(4888,33)(4889,32)(4890,30)(4891,30)(4892,29)(4893,28)(4894,28)(4899,27)(4900,26)(4901,25)(4902,24)(4903,23)(4904,22)(4906,21)(4907,21)(4908,20)(4910,19)(4911,18)(4912,17)(4913,16)(4915,15)(4917,14)(4918,13)(4919,12)(4920,11)(4922,10)(4923,9)(4925,8)(4927,7)(4930,6)(4931,5)(4932,3)(4933,2)(4934,1)
    """
    assert(set([x for x in alignment]) <= set(['(', ')', ',', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']))

    count_refs             = defaultdict(int)
    count_queries          = defaultdict(int)
    count_refs_colapses    = 0
    count_queries_colapses = 0

    i_alignment = re_alignment.finditer(alignment)
    for i in i_alignment:
        c_r           = int(i.group(1))
        c_q           = int(i.group(2))

        count_refs   [c_r] += 1
        count_queries[c_q] += 1

    count_refs_colapses    = sum([count_refs[   x] for x in count_refs    if count_refs[   x] > 1])
    count_queries_colapses = sum([count_queries[x] for x in count_queries if count_queries[x] > 1])

    return len(count_refs), len(count_queries), count_refs_colapses, count_queries_colapses

gffData[['len_count_refs', 'len_count_queries', 'count_refs_colapses', 'count_queries_colapses']] = gffData['Alignment'].apply(process_alignment).apply(pd.Series)
del gffData['Alignment']
gffData.head()


Out[179]:
QryContigID RefContigID QryStartPos QryEndPos RefStartPos RefEndPos Orientation Confidence QryLen RefLen qry_match_len ref_match_len match_prop cigar_matches cigar_insertions cigar_deletions len_count_refs len_count_queries count_refs_colapses count_queries_colapses
XmapEntryID
2 174 1 21236.5 1568390.0 10672 1553561 + 79.35 1568410.0 21805821 1547153.5 1542889 1.002764 105 17 63 119 105 0 28
32 430 1 20.0 651018.8 1493902 2123769 + 48.60 651038.8 21805821 650998.8 629867 1.033550 52 4 15 54 52 0 4
49 877 1 185486.6 20.0 2289130 2470486 - 13.00 803876.2 21805821 185466.6 181356 1.022666 15 2 6 16 15 0 2
53 563 1 614521.1 20.0 2470486 3076619 - 39.72 614541.2 21805821 614501.1 606133 1.013806 41 2 19 45 41 0 8
63 276 1 599215.1 211191.6 3450761 3837771 - 18.32 1006781.1 21805821 388023.5 387010 1.002619 23 2 15 26 23 0 6
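
The same kind of check for process_alignment, on the first three pairs of the first raw alignment, (1,34)(2,34)(3,35): three distinct reference labels, two distinct query labels, no reference collapses, and query label 34 is hit twice, so the query collapse count is 2.


In [ ]:
# Expected result: (3, 2, 0, 2)
process_alignment("(1,34)(2,34)(3,35)")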

More stats


In [180]:
ref_qry           = gffData[['RefContigID','QryContigID']]
ref_qry           = ref_qry.sort_values('RefContigID')
print(ref_qry.head())

ref_qry_grpby_ref = ref_qry.groupby('RefContigID', sort=True)
ref_qry_grpby_ref.head()


             RefContigID  QryContigID
XmapEntryID                          
2                      1          174
168                    1          308
122                    1          165
114                    1           41
104                    1          473
Out[180]:
RefContigID QryContigID
RefContigID XmapEntryID
1 2 1 174
168 1 308
122 1 165
114 1 41
104 1 473
2 665 2 670
714 2 1008
705 2 539
695 2 283
694 2 1303
3 1222 3 812
1202 3 77
1196 3 340
1191 3 301
1184 3 261
4 1686 4 21
1661 4 517
1684 4 504
1682 4 554
1669 4 139
5 2185 5 238
2191 5 970
2201 5 900
2179 5 1362
2195 5 406
6 2740 6 969
2722 6 1190
2735 6 1101
2733 6 1325
2730 6 1154
7 3138 7 1167
3125 7 482
3123 7 49
3092 7 1254
3093 7 548
8 3641 8 728
3596 8 1300
3637 8 881
3633 8 1031
3626 8 584
9 4158 9 306
4155 9 576
4147 9 417
4140 9 2566
4134 9 867
10 4616 10 240
4579 10 234
4609 10 398
4603 10 388
4599 10 586
11 5204 11 163
5183 11 1308
5178 11 404
5175 11 1019
5173 11 985
12 5652 12 438
5584 12 89
5600 12 171
5603 12 745
5613 12 893
13 6178 13 965
6150 13 623
6168 13 344
6160 13 253
6154 13 1977
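
If all that is needed is how many distinct query contigs hit each chromosome, the grouped frame can be summarised directly; a small sketch on top of the groupby above.


In [ ]:
# Number of distinct query contigs aligned to each reference chromosome.
ref_qry_grpby_ref['QryContigID'].nunique()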

In [181]:
qry_ref           = gffData[['QryContigID','RefContigID']]
qry_ref           = qry_ref.sort_values('QryContigID')
print(qry_ref.head())

qry_ref_grpby_qry = qry_ref.groupby('QryContigID', sort=True)
qry_ref_grpby_qry.head()


             QryContigID  RefContigID
XmapEntryID                          
1467                   1            4
4003                   1            9
2920                   2            7
552                    2            2
937                    2            3
Out[181]:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 806 entries, (1, 1467) to (2770, 854)
Data columns:
QryContigID    806  non-null values
RefContigID    806  non-null values
dtypes: int64(2)
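
The mirror view helps spot query contigs that align to more than one chromosome, the same quantity that _meta_num_qry_matches captures further down; a minimal sketch.


In [ ]:
# Query contigs whose alignments span more than one reference chromosome.
chroms_per_qry = qry_ref_grpby_qry['RefContigID'].nunique()
chroms_per_qry[chroms_per_qry > 1].head()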

In [182]:
def stats_from_data_vals(RefContigID, QryContigID, groups, indexer, data, data_vals, valid_data_poses):
    ref_lens             = [ ( x["RefStartPos"], x["RefEndPos"] ) for x in data_vals ]
    qry_lens             = [ ( x["QryStartPos"], x["QryEndPos"] ) for x in data_vals ]

    num_qry_matches = []
    for RefContigID_l in groups["QryContigID_RefContigID"][QryContigID]:
        for match_pos in groups["QryContigID_RefContigID"][QryContigID][RefContigID_l]:
            if match_pos in valid_data_poses:
                num_qry_matches.append(RefContigID_l)

    #num_qry_matches      = len( groups["QryContigID_RefContigID"][QryContigID] )
    num_qry_matches      = len( set(num_qry_matches)                       )
    num_orientations     = len( set([x["Orientation"] for x in data_vals]) )

    ref_no_gap_len       = sum( [ max(x)-min(x) for x in ref_lens ] )
    ref_min_coord        = min( [ min(x)        for x in ref_lens ] )
    ref_max_coord        = max( [ max(x)        for x in ref_lens ] )
    ref_gap_len          = ref_max_coord - ref_min_coord

    qry_no_gap_len       = sum( [ max(x)-min(x) for x in qry_lens ] )
    qry_min_coord        = min( [ min(x)        for x in qry_lens ] )
    qry_max_coord        = max( [ max(x)        for x in qry_lens ] )
    qry_gap_len          = qry_max_coord - qry_min_coord

    XmapEntryIDs         = groups["QryContigID_XmapEntryID"][QryContigID].keys()

    Confidences          = []
    for XmapEntryID in XmapEntryIDs:
        data_pos = list(indexer["XmapEntryID"][XmapEntryID])[0]
        if data_pos not in valid_data_poses:
            continue
        Confidences.append( [ data[data_pos]["Confidence"], data[data_pos]["RefContigID"] ] )

    max_confidence       = max([ x[0] for x in Confidences ])
    max_confidence_chrom = [ x[1] for x in Confidences if x[0] == max_confidence][0]

    stats = {}
    stats["_meta_is_max_confidence_for_qry_chrom"      ] = max_confidence_chrom == RefContigID

    stats["_meta_len_ref_match_gapped"                 ] = ref_gap_len
    stats["_meta_len_ref_match_no_gap"                 ] = ref_no_gap_len
    stats["_meta_len_qry_match_gapped"                 ] = qry_gap_len
    stats["_meta_len_qry_match_no_gap"                 ] = qry_no_gap_len

    stats["_meta_max_confidence_for_qry"               ] = max_confidence
    stats["_meta_max_confidence_for_qry_chrom"         ] = max_confidence_chrom

    stats["_meta_num_orientations"                     ] = num_orientations
    stats["_meta_num_qry_matches"                      ] = num_qry_matches
    stats["_meta_qry_matches"                          ] = ','.join( [ str(x) for x in sorted(list(set([ x[1] for x in Confidences ]))) ] )

    stats["_meta_proportion_sizes_gapped"              ] = (ref_gap_len    * 1.0)/ qry_gap_len
    stats["_meta_proportion_sizes_no_gap"              ] = (ref_no_gap_len * 1.0)/ qry_no_gap_len

    return stats
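
The next cell iterates over QryContigIDs and looks rows up through groups, indexer and data, none of which are built in this notebook (hence the NameError below); reporter and valid_fields come from the same missing setup. The following is only a sketch of how those lookup structures could be assembled from the alignment table, with the layout inferred from how they are used here; build_xmap_index is a hypothetical helper name.


In [ ]:
# Hypothetical reconstruction of the lookup structures used in the next cell:
#   data    : list of per-alignment dicts
#   indexer : column name -> value -> set of positions in data
#   groups  : "ColA_ColB" -> value of ColA -> value of ColB -> set of positions
# Note: the loop below also reads HitEnum and Alignment, which were dropped
# from gffData earlier, so the index would have to be built before those
# columns are deleted.
def build_xmap_index(frame):
    data    = [row.to_dict() for _, row in frame.reset_index().iterrows()]
    indexer = defaultdict(lambda: defaultdict(set))
    groups  = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    for pos, row in enumerate(data):
        for col in ("XmapEntryID", "QryContigID", "RefContigID"):
            indexer[col][row[col]].add(pos)
        groups["RefContigID_QryContigID"][row["RefContigID"]][row["QryContigID"]].add(pos)
        groups["QryContigID_RefContigID"][row["QryContigID"]][row["RefContigID"]].add(pos)
        groups["QryContigID_XmapEntryID"][row["QryContigID"]][row["XmapEntryID"]].add(pos)
    return data, indexer, groups

# data, indexer, groups = build_xmap_index(gffData)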

In [183]:
for QryContigID in sorted(QryContigIDs):
    data_poses           = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
    all_data_poses       = list(indexer["QryContigID"][QryContigID])
    data_vals            = [ data[x] for x in data_poses ]

    stats                = stats_from_data_vals(RefContigID, QryContigID, groups, indexer, data, data_vals, all_data_poses)

    #print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )
    for data_val in data_vals:
        cigar                = data_val["HitEnum"]
        cigar_matches, cigar_insertions, cigar_deletions = process_cigar(cigar)

        Alignment            = data_val["Alignment"]
        alignment_count_refs, alignment_count_queries, alignment_count_refs_colapses, alignment_count_queries_colapses = process_alignment(Alignment)

        for stat in stats:
            data_val[stat] = stats[stat]


        data_val["_meta_proportion_query_len_gapped"          ] = (data_val['_meta_len_qry_match_gapped'] * 1.0)/ data_val["QryLen"]
        data_val["_meta_proportion_query_len_no_gap"          ] = (data_val['_meta_len_qry_match_no_gap'] * 1.0)/ data_val["QryLen"]

        #print " ", " ".join( ["%s %s" % (x, str(data_val[x])) for x in sorted(data_val)] )
        reporter.write("\t".join([str(data_val[x]) for x in valid_fields['names']]) + "\n")


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-183-640d7ffc0f00> in <module>()
----> 1 for QryContigID in sorted(QryContigIDs):
      2     data_poses           = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
      3     all_data_poses       = list(indexer["QryContigID"][QryContigID])
      4     data_vals            = [ data[x] for x in data_poses ]
      5 

NameError: name 'QryContigIDs' is not defined

Good part

Column types


In [ ]:
gffData.dtypes

Global statistics


In [ ]:
gffData[['Confidence', 'QryLen', 'qry_match_len', 'ref_match_len', 'match_prop']].describe()

List of chromosomes


In [ ]:
chromosomes = np.unique(gffData['RefContigID'].values)
chromosomes

Quality distribution


In [ ]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
    bq = gffData.boxplot(column='Confidence')

Quality distribution per chromosome


In [ ]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
    bqc = gffData.boxplot(column='Confidence', by='RefContigID')

Position distribution


In [ ]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hs = gffData['RefStartPos'].hist()

Position distribution per chromosome




In [45]:
hsc = gffData['RefStartPos'].hist(by=gffData['RefContigID'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))


Length distribution


In [46]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hl = gffData['qry_match_len'].hist()


Length distribution per chromosome


In [47]:
hlc = gffData['qry_match_len'].hist(by=gffData['RefContigID'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))