In [172]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import matplotlib as plt
#plt.use('TkAgg')
import operator
import re
from collections import defaultdict
import pylab
pylab.show()
%pylab inline
In [173]:
# Configuration: input path, filtering threshold, and figure geometry.
fileUrl = "../S_lycopersicum_chromosomes.2.50.BspQI_to_EXP_REFINEFINAL1_xmap.txt"

# Alignments below this confidence are dropped from the analysis.
MIN_CONF = 10.0

# Default (whole-dataset) figure size, in inches.
FULL_FIG_W = 16
FULL_FIG_H = 8

# Per-chromosome panel figures share the width but are much taller.
CHROM_FIG_W = FULL_FIG_W
CHROM_FIG_H = 20
In [174]:
class size_controller(object):
    """Context manager that temporarily overrides matplotlib's default
    figure size (``figure.figsize``) and restores the previous value on exit.

    Uses ``plt.rcParams`` explicitly instead of the bare ``rcParams`` name
    injected into the namespace by ``%pylab`` (fragile star-import).
    """

    def __init__(self, w, h):
        # Target figure width / height, in inches.
        self.w = w
        self.h = h

    def __enter__(self):
        # Remember the current setting so __exit__ can restore it.
        self.o = plt.rcParams['figure.figsize']
        plt.rcParams['figure.figsize'] = self.w, self.h
        return None

    def __exit__(self, exc_type, exc_value, traceback):
        # Restore the saved size even if the body raised.
        # (Renamed `type` -> `exc_type` to avoid shadowing the builtin.)
        plt.rcParams['figure.figsize'] = self.o
In [175]:
# Column schema of the XMAP file (name, dtype), in file order.
# String columns use the builtin `object`: the `np.object` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
col_type_int = np.int64
col_type_flo = np.float64
col_type_str = object
col_info = [
    ["XmapEntryID" , col_type_int],
    ["QryContigID" , col_type_int],
    ["RefContigID" , col_type_int],
    ["QryStartPos" , col_type_flo],
    ["QryEndPos"   , col_type_flo],
    ["RefStartPos" , col_type_flo],
    ["RefEndPos"   , col_type_flo],
    ["Orientation" , col_type_str],
    ["Confidence"  , col_type_flo],
    ["HitEnum"     , col_type_str],
    ["QryLen"      , col_type_flo],
    ["RefLen"      , col_type_flo],
    ["LabelChannel", col_type_str],
    ["Alignment"   , col_type_str],
]
# Names in file order, and a name -> dtype mapping for read_csv.
col_names = [name for name, _ in col_info]
col_types = {name: dtype for name, dtype in col_info}
col_types
Out[175]:
In [176]:
# Load the XMAP table.
# NOTE(review): the original cell also defined a CONVERTERS dict pointing
# at an undefined `filter_conv` function (NameError on Restart & Run All)
# and never passed it to read_csv; it has been removed.
SKIP_ROWS = 9   # header/comment lines before the first data row
NROWS = None    # set to an int for fast iteration; None = read everything
gffData = pd.read_csv(
    fileUrl,
    names=col_names,
    index_col='XmapEntryID',
    dtype=col_types,
    header=None,
    skiprows=SKIP_ROWS,
    delimiter="\t",
    comment="#",
    verbose=True,
    nrows=NROWS,
)
gffData.head()
Out[176]:
In [177]:
# Derived per-alignment lengths: span of the match on query and reference,
# and their ratio (query span / reference span).
gffData['qry_match_len'] = (gffData['QryEndPos'] - gffData['QryStartPos']).abs()
gffData['ref_match_len'] = (gffData['RefEndPos'] - gffData['RefStartPos']).abs()
gffData['match_prop'] = gffData['qry_match_len'] / gffData['ref_match_len']

# Keep only confident alignments.
gffData = gffData[gffData['Confidence'] >= MIN_CONF]

# LabelChannel is not used anywhere downstream.
del gffData['LabelChannel']
gffData.head()
Out[177]:
In [178]:
# Pre-compiled patterns for the run lengths of each CIGAR operation.
# Raw strings avoid the invalid-escape-sequence warning "\d" raises in
# Python 3 string literals.
re_matches    = re.compile(r"(\d+)M")
re_insertions = re.compile(r"(\d+)I")
re_deletions  = re.compile(r"(\d+)D")

def process_cigar(cigar, **kwargs):
    """Sum the match (M), insertion (I) and deletion (D) run lengths of a
    CIGAR-like hit string.

    Example input: ``"2M3D1M1D1M1D4M1I2M..."``

    Parameters
    ----------
    cigar : str
        String containing only digits and the letters M/I/D.
    **kwargs
        Ignored; absorbs extra keyword arguments forwarded by
        ``Series.apply`` at the call sites (e.g. ``axis=1``).

    Returns
    -------
    tuple of int
        ``(total_matches, total_insertions, total_deletions)``
    """
    # Fail fast on unexpected characters before parsing.
    assert set(cigar) <= set('MDI0123456789')
    cigar_matches    = sum(int(m.group(1)) for m in re_matches.finditer(cigar))
    cigar_insertions = sum(int(m.group(1)) for m in re_insertions.finditer(cigar))
    cigar_deletions  = sum(int(m.group(1)) for m in re_deletions.finditer(cigar))
    return cigar_matches, cigar_insertions, cigar_deletions
# Expand the (matches, insertions, deletions) tuples into three columns.
# The original `.apply(process_cigar, axis=1).apply(pd.Series, 1)` was
# fragile: Series.apply has no `axis` parameter (it was silently swallowed
# by process_cigar's **kwargs) and the positional `1` targeted the
# deprecated/removed `convert_dtype` argument on modern pandas.
cigar_cols = ['cigar_matches', 'cigar_insertions', 'cigar_deletions']
gffData[cigar_cols] = pd.DataFrame(
    gffData['HitEnum'].map(process_cigar).tolist(),
    index=gffData.index,
    columns=cigar_cols,
)
del gffData['HitEnum']  # raw CIGAR string no longer needed
gffData.head()
Out[178]:
In [179]:
# Matches one "(ref,qry)" label pair of an XMAP Alignment string.
# Raw string avoids Python 3 invalid-escape warnings for "\(" / "\d".
re_alignment = re.compile(r"\((\d+),(\d+)\)")

def process_alignment(alignment, **kwargs):
    """Count distinct reference/query label sites in an Alignment string
    such as ``"(4862,48)(4863,48)(4864,47)..."``.

    Parameters
    ----------
    alignment : str
        Concatenated ``(ref,qry)`` integer pairs.
    **kwargs
        Ignored; absorbs extra keyword arguments forwarded by
        ``Series.apply`` at the call sites.

    Returns
    -------
    tuple of int
        ``(n_distinct_ref_sites, n_distinct_qry_sites,
           ref_colapsed_pair_count, qry_colapsed_pair_count)``
        where the "colapses" counts sum the multiplicities of sites hit
        more than once.
    """
    # Fail fast on unexpected characters before parsing.
    assert set(alignment) <= set('(),0123456789')
    count_refs = defaultdict(int)
    count_queries = defaultdict(int)
    for m in re_alignment.finditer(alignment):
        count_refs[int(m.group(1))] += 1
        count_queries[int(m.group(2))] += 1
    # Total number of pairs landing on a site that is used more than once.
    count_refs_colapses = sum(c for c in count_refs.values() if c > 1)
    count_queries_colapses = sum(c for c in count_queries.values() if c > 1)
    return len(count_refs), len(count_queries), count_refs_colapses, count_queries_colapses
# Expand the 4-tuples returned by process_alignment into columns, using
# the same explicit construction as for the CIGAR columns (avoids the
# invalid Series.apply(axis=1) / positional convert_dtype usage).
aln_cols = ['len_count_refs', 'len_count_queries',
            'count_refs_colapses', 'count_queries_colapses']
gffData[aln_cols] = pd.DataFrame(
    gffData['Alignment'].map(process_alignment).tolist(),
    index=gffData.index,
    columns=aln_cols,
)
del gffData['Alignment']  # raw alignment string no longer needed
gffData.head()
Out[179]:
In [180]:
# Reference -> query contig pairs, grouped by reference contig.
# DataFrame.sort() was removed in pandas 0.20: use sort_values().
# The Python 2 `print expr` statement is replaced by the print() function.
ref_qry = gffData[['RefContigID', 'QryContigID']]
ref_qry = ref_qry.sort_values('RefContigID')
print(ref_qry.head())
ref_qry_grpby_ref = ref_qry.groupby('RefContigID', sort=True)
ref_qry_grpby_ref.head()
Out[180]:
In [181]:
# Query -> reference contig pairs, grouped by query contig.
# Same fixes as the previous cell: sort() -> sort_values(), print().
qry_ref = gffData[['QryContigID', 'RefContigID']]
qry_ref = qry_ref.sort_values('QryContigID')
print(qry_ref.head())
qry_ref_grpby_qry = qry_ref.groupby('QryContigID', sort=True)
qry_ref_grpby_qry.head()
Out[181]:
In [182]:
def stats_from_data_vals(RefContigID, QryContigID, groups, indexer, data, data_vals, valid_data_poses):
    """Build a dict of "_meta_*" summary statistics for one
    (RefContigID, QryContigID) pair.

    NOTE(review): the structures below are inferred from usage only — the
    cell that builds `groups`, `indexer` and `data` is not in this
    notebook; confirm against it.
      - data_vals: list of row-dicts with RefStartPos/RefEndPos,
        QryStartPos/QryEndPos, Orientation keys.
      - groups: nested dicts keyed "QryContigID_RefContigID" and
        "QryContigID_XmapEntryID", mapping IDs to sets of row positions.
      - indexer["XmapEntryID"]: XmapEntryID -> iterable of row positions
        into `data` (only the first position is used).
      - valid_data_poses: row positions that passed upstream filtering.
    """
    # Matched interval endpoints on reference and query for every row.
    ref_lens = [ ( x["RefStartPos"], x["RefEndPos"] ) for x in data_vals ]
    qry_lens = [ ( x["QryStartPos"], x["QryEndPos"] ) for x in data_vals ]
    # Distinct reference contigs this query matches, counting only match
    # positions that survived filtering.
    num_qry_matches = []
    for RefContigID_l in groups["QryContigID_RefContigID"][QryContigID]:
        for match_pos in groups["QryContigID_RefContigID"][QryContigID][RefContigID_l]:
            if match_pos in valid_data_poses:
                num_qry_matches.append(RefContigID_l)
    #num_qry_matches = len( groups["QryContigID_RefContigID"][QryContigID] )
    num_qry_matches = len( set(num_qry_matches) )
    # Number of distinct strand orientations among the matches.
    num_orientations = len( set([x["Orientation"] for x in data_vals]) )
    # "no_gap": sum of individual match spans; "gapped": outermost extent.
    ref_no_gap_len = sum( [ max(x)-min(x) for x in ref_lens ] )
    ref_min_coord = min( [ min(x) for x in ref_lens ] )
    ref_max_coord = max( [ max(x) for x in ref_lens ] )
    ref_gap_len = ref_max_coord - ref_min_coord
    qry_no_gap_len = sum( [ max(x)-min(x) for x in qry_lens ] )
    qry_min_coord = min( [ min(x) for x in qry_lens ] )
    qry_max_coord = max( [ max(x) for x in qry_lens ] )
    qry_gap_len = qry_max_coord - qry_min_coord
    # Collect (confidence, chromosome) for every surviving alignment of
    # this query, across all chromosomes.
    XmapEntryIDs = groups["QryContigID_XmapEntryID"][QryContigID].keys()
    Confidences = []
    for XmapEntryID in XmapEntryIDs:
        data_pos = list(indexer["XmapEntryID"][XmapEntryID])[0]
        if data_pos not in valid_data_poses:
            continue
        Confidences.append( [ data[data_pos]["Confidence"], data[data_pos]["RefContigID"] ] )
    # Best confidence for the query and the chromosome achieving it
    # (first one wins on ties).
    max_confidence = max([ x[0] for x in Confidences ])
    max_confidence_chrom = [ x[1] for x in Confidences if x[0] == max_confidence][0]
    stats = {}
    # True when the current chromosome is the query's best-scoring one.
    stats["_meta_is_max_confidence_for_qry_chrom" ] = max_confidence_chrom == RefContigID
    stats["_meta_len_ref_match_gapped" ] = ref_gap_len
    stats["_meta_len_ref_match_no_gap" ] = ref_no_gap_len
    stats["_meta_len_qry_match_gapped" ] = qry_gap_len
    stats["_meta_len_qry_match_no_gap" ] = qry_no_gap_len
    stats["_meta_max_confidence_for_qry" ] = max_confidence
    stats["_meta_max_confidence_for_qry_chrom" ] = max_confidence_chrom
    stats["_meta_num_orientations" ] = num_orientations
    stats["_meta_num_qry_matches" ] = num_qry_matches
    # Comma-joined sorted list of chromosomes this query matches.
    stats["_meta_qry_matches" ] = ','.join( [ str(x) for x in sorted(list(set([ x[1] for x in Confidences ]))) ] )
    # NOTE(review): divides by qry_gap_len / qry_no_gap_len — will raise
    # ZeroDivisionError for a zero-length query span; confirm that cannot
    # happen upstream.
    stats["_meta_proportion_sizes_gapped" ] = (ref_gap_len * 1.0)/ qry_gap_len
    stats["_meta_proportion_sizes_no_gap" ] = (ref_no_gap_len * 1.0)/ qry_no_gap_len
    return stats
In [183]:
# NOTE(review): this cell references names defined nowhere in this
# notebook (QryContigIDs, RefContigID, groups, indexer, data, reporter,
# valid_fields) — presumably from cells that were deleted or from another
# notebook. It cannot run under Restart & Run All as-is.
for QryContigID in sorted(QryContigIDs):
    # Row positions of this (RefContigID, QryContigID) pair, and all
    # positions for the query across every chromosome.
    data_poses = list(groups["RefContigID_QryContigID"][RefContigID][QryContigID])
    all_data_poses = list(indexer["QryContigID"][QryContigID])
    data_vals = [ data[x] for x in data_poses ]
    stats = stats_from_data_vals(RefContigID, QryContigID, groups, indexer, data, data_vals, all_data_poses)
    #print "RefContigID %4d QryContigID %6d" % ( RefContigID, QryContigID )
    for data_val in data_vals:
        cigar = data_val["HitEnum"]
        cigar_matches, cigar_insertions, cigar_deletions = process_cigar(cigar)
        Alignment = data_val["Alignment"]
        # NOTE(review): process_alignment returns (refs, queries,
        # refs_colapses, queries_colapses) but the first two names below
        # are swapped. The values are unused afterwards, so behavior is
        # unaffected, but confirm intent before reusing these variables.
        alignment_count_queries, alignment_count_refs, alignment_count_refs_colapses, alignment_count_queries_colapses = process_alignment(Alignment)
        # Attach the per-pair stats to every row of the pair.
        for stat in stats:
            data_val[stat] = stats[stat]
        data_val["_meta_proportion_query_len_gapped" ] = (data_val['_meta_len_qry_match_gapped'] * 1.0)/ data_val["QryLen"]
        data_val["_meta_proportion_query_len_no_gap" ] = (data_val['_meta_len_qry_match_no_gap'] * 1.0)/ data_val["QryLen"]
        #print " ", " ".join( ["%s %s" % (x, str(data_val[x])) for x in sorted(data_val)] )
        # Emit one tab-separated line per row to the open report handle.
        reporter.write( "\t".join( [ str(data_val[x]) for x in valid_fields['names' ] ] ) + "\n" )
Reference links used for the plotting sections below:

- pandas visualization guide (dev): http://pandas.pydata.org/pandas-docs/dev/visualization.html
- Basic data plotting with matplotlib, part 3 — histograms: https://bespokeblog.wordpress.com/2011/07/11/basic-data-plotting-with-matplotlib-part-3-histograms/
- seaborn: plotting distributions (nbviewer): http://nbviewer.ipython.org/github/mwaskom/seaborn/blob/master/examples/plotting_distributions.ipynb
- Exploratory graphs notebook (nbviewer): http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/week3/exploratory_graphs.ipynb
- pandas visualization guide (0.15.0): http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html
- Working with pandas DataFrames: http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
In [ ]:
# Column dtypes after the derived-column and filtering steps above.
gffData.dtypes
In [ ]:
# Summary statistics of the confidence and length columns.
gffData[['Confidence', 'QryLen', 'qry_match_len', 'ref_match_len', 'match_prop']].describe()
In [ ]:
# Sorted array of distinct reference contig (chromosome) IDs present in
# the filtered data; used to size the per-chromosome panel layouts below.
chromosomes = np.sort(gffData['RefContigID'].unique())
chromosomes
In [ ]:
# Boxplot of alignment confidence over the whole dataset.
with size_controller(FULL_FIG_W, FULL_FIG_H):
    bq = gffData.boxplot(column='Confidence')
In [ ]:
# Boxplot of alignment confidence split per reference contig (chromosome).
with size_controller(FULL_FIG_W, FULL_FIG_H):
    bqc = gffData.boxplot(column='Confidence', by='RefContigID')
In [ ]:
# Histogram of reference start positions across all chromosomes.
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hs = gffData['RefStartPos'].hist()
In [45]:
# Query match length distribution, one panel per chromosome (stacked vertically).
hsc = gffData['qry_match_len'].hist(by=gffData['RefContigID'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))
In [45]:
# Reference start position distribution, one panel per chromosome.
hsc = gffData['RefStartPos'].hist(by=gffData['RefContigID'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))
In [46]:
# Overall histogram of query match lengths.
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hl = gffData['qry_match_len'].hist()
In [47]:
# NOTE(review): duplicates the earlier per-chromosome qry_match_len
# histogram cell (In [45]); consider deleting one of the two.
hlc = gffData['qry_match_len'].hist(by=gffData['RefContigID'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))