Setup


In [1]:
%run ../../shared_setup.ipynb


docker image cggh/biipy:v1.6.0

In [2]:
def tabulate(f):
    class Tabulated(etl.Table):
        def __init__(self, *args, **kwargs):
            self.args = args
            self.kwargs = kwargs
        def __iter__(self):
            return f(*self.args, **self.kwargs)
    return Tabulated

In [3]:
@tabulate
def tabulate_core_windows(window_size):
    yield 'chrom', 'start', 'stop'
    for rec in tbl_regions_1b.eq('region_type', 'Core').records():
        for start in range(rec.region_start, rec.region_stop, window_size):
            yield rec.region_chrom, start, start + window_size - 1

In [4]:
import re
motif_jiang_a = 'TA[TA]GTTAGT[CG]AAG[TG]TAAGACC'
motif_jiang_b = 'GCA[TC][CA][TG]AG[GT]TGC'
motif_jiang_c = '[TG]GA[TA]GAAG[AG][TG]GA'
motif_jiang_d = '[TA]GT[TA]AGTAGT[CA]A'
motif_jiang_e = '[TG]G[AG]TGA[TA]GA[AT][AG][AC]'


@tabulate
def tabulate_motifs(motif):
    motif = motif.lower()
    yield 'chrom', 'start', 'stop', 'sequence'
    for chrom in sorted(fasta.keys()):
        seq = fasta[chrom][:].lower()
        matches = re.finditer(motif, seq)
        for match in matches:
            start, stop = match.span()
            match_seq = seq[start:stop]
            yield chrom, start, stop, match_seq

In [5]:
tabulate_motifs(motif_jiang_a)


Out[5]:
0|chrom 1|start 2|stop 3|sequence
Pf3D7_01_v3 620248 620269 tatgttagtcaagttaagacc
Pf3D7_01_v3 620710 620731 tatgttagtcaagttaagacc
Pf3D7_01_v3 620731 620752 tatgttagtcaagttaagacc
Pf3D7_01_v3 621717 621738 tatgttagtgaagttaagacc
Pf3D7_01_v3 622221 622242 tatgttagtcaagttaagacc

...


In [6]:
tabulate_motifs(motif_jiang_b)


Out[6]:
0|chrom 1|start 2|stop 3|sequence
Pf3D7_01_v3 617229 617241 gcacctagttgc
Pf3D7_02_v3 937954 937966 gcatataggtgc
Pf3D7_02_v3 945012 945024 gcacctaggtgc
Pf3D7_03_v3 1040520 1040532 gcacctagttgc
Pf3D7_03_v3 1040547 1040559 gcacctagttgc

...


In [7]:
tabulate_motifs(motif_jiang_c)


Out[7]:
0|chrom 1|start 2|stop 3|sequence
Pf3D7_01_v3 242882 242894 tgaagaagagga
Pf3D7_01_v3 338162 338174 tgatgaagatga
Pf3D7_01_v3 338201 338213 tgatgaagatga
Pf3D7_01_v3 338213 338225 tgatgaagatga
Pf3D7_01_v3 338231 338243 tgatgaagatga

...


In [8]:
tabulate_motifs(motif_jiang_d)


Out[8]:
0|chrom 1|start 2|stop 3|sequence
Pf3D7_01_v3 640054 640066 tgttagtagtca
Pf3D7_01_v3 640114 640126 tgttagtagtca
Pf3D7_01_v3 640138 640150 tgtaagtagtca
Pf3D7_01_v3 640174 640186 tgttagtagtca
Pf3D7_01_v3 640186 640198 tgttagtagtca

...


In [9]:
tabulate_motifs(motif_jiang_e)


Out[9]:
0|chrom 1|start 2|stop 3|sequence
Pf3D7_01_v3 33552 33564 ggatgaagataa
Pf3D7_01_v3 95362 95374 tgatgaagaaaa
Pf3D7_01_v3 98910 98922 tgatgatgaaga
Pf3D7_01_v3 101936 101948 tgatgaagaaaa
Pf3D7_01_v3 101981 101993 tgatgaagaaaa

...


In [10]:
tbl_co = (
    etl
    .frompickle(os.path.join(PUBLIC_DIR, 'tbl_co.pickle'))
    .convert('chrom', lambda v: str(v, 'ascii'))
)
display_with_nrows(tbl_co, caption='CO events')


CO events (1194 rows)
0|sample 1|chrom 2|co_pos_mid 3|co_pos_min 4|co_pos_max 5|co_pos_range 6|cross 7|co_from_parent 8|co_to_parent
B1SD/PG0015-C/ERR019044 Pf3D7_01_v3 145052 144877 145227 350 hb3_dd2 hb3 dd2
GC03/PG0021-C/ERR015447 Pf3D7_01_v3 163584 163145 164024 879 hb3_dd2 dd2 hb3
XF12/PG0102-C/ERR029143 Pf3D7_01_v3 206769 205803 207736 1933 7g8_gb4 gb4 7g8
7C159/PG0040-Cx/ERR107475 Pf3D7_01_v3 206905 206074 207736 1662 hb3_dd2 hb3 dd2
CH3_61/PG0033-Cx/ERR175544 Pf3D7_01_v3 206905 206074 207736 1662 hb3_dd2 dd2 hb3

...


In [11]:
def tabulate_tr(chrom):
    fn = '/data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012/%s.tr.pickle' % chrom
    tbl_tr = etl.frompickle(fn)
    return tbl_tr
    

tbl_tr = (
    etl
    .cat(*[tabulate_tr(chrom) for chrom in sorted(fasta.keys())])
    .sort(key=('chrom', 'start'))
)


def select_tr_threshold(row):
    unit_length = row.unit_length
    tract_length = row.tract_length
    return (
        ((unit_length == 1) and (tract_length >= 6)) or
        ((unit_length == 2) and (tract_length >= 9)) or
        ((unit_length == 3) and (tract_length >= 11)) or
        ((unit_length == 4) and (tract_length >= 13)) or
        ((unit_length == 5) and (tract_length >= 14)) or
        ((unit_length == 6) and (tract_length >= 16)) or
        ((unit_length >= 7) and (tract_length >= 18))
    )


tbl_tr_proper = tbl_tr.select(select_tr_threshold).cache()
tbl_tr_proper.display()

is_tr = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_tr[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_tr_proper.eq('chrom', chrom).records():
        start = rec.start - 1
        stop = rec.stop
        is_tr[chrom][start:stop] = True
is_tr


0|chrom 1|start 2|stop 3|repeat_unit 4|n_units 5|unit_length 6|last_unit_length 7|tract_length 8|region_type
Pf3D7_01_v3 13 37 cctaaac 3 7 3 24 SubtelomericRepeat
Pf3D7_01_v3 24 65 aaccctaaaccctg 2 14 13 41 SubtelomericRepeat
Pf3D7_01_v3 38 107 aaccctaaaccctgaacccta 3 21 6 69 SubtelomericRepeat
Pf3D7_01_v3 52 72 aacccta 2 7 6 20 SubtelomericRepeat
Pf3D7_01_v3 73 93 aacccta 2 7 6 20 SubtelomericRepeat

...

2016-03-15 10:23:48.973313 :: Pf3D7_01_v3
2016-03-15 10:29:32.155088 :: Pf3D7_02_v3
2016-03-15 10:29:32.446752 :: Pf3D7_03_v3
2016-03-15 10:29:32.800177 :: Pf3D7_04_v3
2016-03-15 10:29:33.145088 :: Pf3D7_05_v3
2016-03-15 10:29:33.516784 :: Pf3D7_06_v3
2016-03-15 10:29:33.868503 :: Pf3D7_07_v3
2016-03-15 10:29:34.233496 :: Pf3D7_08_v3
2016-03-15 10:29:34.621624 :: Pf3D7_09_v3
2016-03-15 10:29:35.005701 :: Pf3D7_10_v3
2016-03-15 10:29:35.449407 :: Pf3D7_11_v3
2016-03-15 10:29:35.907885 :: Pf3D7_12_v3
2016-03-15 10:29:36.415968 :: Pf3D7_13_v3
2016-03-15 10:29:37.041578 :: Pf3D7_14_v3
Out[11]:
{'Pf3D7_01_v3': array([False, False, False, ...,  True,  True,  True], dtype=bool),
 'Pf3D7_02_v3': array([ True,  True,  True, ..., False, False, False], dtype=bool),
 'Pf3D7_03_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_04_v3': array([False, False, False, ...,  True,  True,  True], dtype=bool),
 'Pf3D7_05_v3': array([ True,  True,  True, ..., False, False, False], dtype=bool),
 'Pf3D7_06_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_07_v3': array([False, False, False, ...,  True,  True, False], dtype=bool),
 'Pf3D7_08_v3': array([False, False,  True, ...,  True,  True,  True], dtype=bool),
 'Pf3D7_09_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_10_v3': array([ True,  True,  True, ..., False, False, False], dtype=bool),
 'Pf3D7_11_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_12_v3': array([False, False, False, ..., False, False, False], dtype=bool),
 'Pf3D7_13_v3': array([ True,  True,  True, ...,  True,  True,  True], dtype=bool),
 'Pf3D7_14_v3': array([False, False,  True, ..., False, False, False], dtype=bool)}

In [12]:
is_exon = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_exon[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_exons.eq('feature_chrom', chrom).records():
        start = rec.feature_start - 1
        stop = rec.feature_stop
        is_exon[chrom][start:stop] = True


2016-03-15 10:29:37.700494 :: Pf3D7_01_v3
2016-03-15 10:29:37.767273 :: Pf3D7_02_v3
2016-03-15 10:29:37.834538 :: Pf3D7_03_v3
2016-03-15 10:29:37.899519 :: Pf3D7_04_v3
2016-03-15 10:29:37.968023 :: Pf3D7_05_v3
2016-03-15 10:29:38.036432 :: Pf3D7_06_v3
2016-03-15 10:29:38.104313 :: Pf3D7_07_v3
2016-03-15 10:29:38.173077 :: Pf3D7_08_v3
2016-03-15 10:29:38.240904 :: Pf3D7_09_v3
2016-03-15 10:29:38.313166 :: Pf3D7_10_v3
2016-03-15 10:29:38.391950 :: Pf3D7_11_v3
2016-03-15 10:29:38.465714 :: Pf3D7_12_v3
2016-03-15 10:29:38.542520 :: Pf3D7_13_v3
2016-03-15 10:29:38.644136 :: Pf3D7_14_v3

In [13]:
%%R
library(AER)


/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: car

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: lmtest

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: zoo

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 
Attaching package: 'zoo'


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: The following objects are masked from 'package:base':

    as.Date, as.Date.numeric


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: sandwich

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: survival

  res = super(Function, self).__call__(*new_args, **new_kwargs)

In [14]:
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, 
                         variant_filter='FILTER_PASS',
                         call_filter=combined_conf_calls, 
                         sample_exclusions=excessive_recomb_samples)


2016-03-15 10:29:40.400196 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/3d7_hb3.combined.final.npz
2016-03-15 10:29:40.683172 :: filter variants: excluding 157 (0.4%) retaining 42087 (99.6%) of 42244 variants
2016-03-15 10:29:40.724450 :: filter calls: excluding 2439 (0.3%) retaining 881388 (99.7%) of 883827 calls
2016-03-15 10:29:40.725696 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/hb3_dd2.combined.final.npz
2016-03-15 10:29:41.087023 :: filter variants: excluding 450 (1.2%) retaining 36461 (98.8%) of 36911 variants
2016-03-15 10:29:41.106950 :: filter samples: excluding ['SC01/PG0025-C/ERR019045'] including ['HB3/PG0004-CW/ERR012788', 'DD2/PG0008-CW/ERR012840', '1BB5/PG0023-C/ERR015449', '3BA6/PG0022-Cx/ERR126027', '3BD5/PG0024-C/ERR019053', '7C101/PG0074-C/ERR019048', '7C111/PG0038-C/ERR015457', '7C12/PG0035-Cx/ERR037704', '7C126/PG0047-C/ERR015452', '7C140/PG0039-C/ERR015454', '7C159/PG0040-Cx/ERR107475', '7C16/PG0036-C/ERR015455', '7C170/PG0041-C/ERR015446', '7C183/PG0042-C/ERR015448', '7C188/PG0030-C/ERR019046', '7C20/PG0037-C/ERR015451', '7C3/PG0034-C/ERR019047', '7C408/PG0031-C/ERR015458', '7C421/PG0043-C/ERR015459', '7C424/PG0044-C/ERR019043', '7C46/PG0046-Cx/ERR107476', '7C7/PG0048-C/ERR019049', 'B1SD/PG0015-C/ERR019044', 'B4R3/PG0018-C/ERR019042', 'CH3_116/PG0032-Cx/ERR037703', 'CH3_61/PG0033-Cx/ERR175544', 'D43/PG0029-Cx/ERR107474', 'GC03/PG0021-C/ERR015447', 'GC06/PG0028-C/ERR015456', 'QC01/PG0017-C/ERR019050', 'QC13/PG0016-C/ERR012895', 'QC23/PG0045-C/ERR012892', 'QC34/PG0026-C/ERR015453', 'SC05/PG0019-C/ERR019051', 'TC05/PG0027-C/ERR015450', 'TC08/PG0020-C/ERR019052']
2016-03-15 10:29:41.186485 :: filter calls: excluding 28934 (2.2%) retaining 1283662 (97.8%) of 1312596 calls
2016-03-15 10:29:41.188606 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/7g8_gb4.combined.final.npz
2016-03-15 10:29:41.631121 :: filter variants: excluding 304 (0.9%) retaining 34471 (99.1%) of 34775 variants
2016-03-15 10:29:41.649818 :: filter samples: excluding ['D2/PG0094-CW/ERR045632'] including ['7G8/PG0083-C/ERR027099', 'GB4/PG0084-C/ERR027100', 'AL2/PG0103-CW/ERR045627', 'AUD/PG0112-C/ERR029406', 'AUD/PG0112-CW/ERR045639', 'DAN/PG0098-C/ERR027110', 'DEV/PG0081-CW/ERR045633', 'JB12/PG0099-C/ERR029146', 'JB8/PG0087-C/ERR029091', 'JC3/PG0077-CW/ERR045636', 'JC9/PG0111-C/ERR029409', 'JC9/PG0111-CW/ERR045634', 'JE11/PG0100-C/ERR029404', 'JE11/PG0100-CW/ERR045630', 'JF6/PG0079-C/ERR027102', 'JF6/PG0079-CW/ERR045637', 'JON/PG0107-C/ERR029408', 'KA6/PG0091-C/ERR027117', 'KB8/PG0104-C/ERR029148', 'KB8/PG0104-CW/ERR045642', 'KH7/PG0088-C/ERR027111', 'LA10/PG0086-C/ERR029090', 'LA10/PG0086-CW/ERR045629', 'NF10/PG0096-C/ERR027108', 'NIC/PG0095-C/ERR027107', 'NIC/PG0095-CW/ERR045631', 'QF5/PG0078-C/ERR029092', 'QF5/PG0078-CW/ERR045638', 'TF1/PG0080-C/ERR027103', 'WC4/PG0082-C/ERR029093', 'WE2/PG0085-C/ERR027101', 'WF12/PG0097-C/ERR027109', 'XB3/PG0093-C/ERR029105', 'XD8/PG0105-C/ERR029144', 'XD8/PG0105-CW/ERR045628', 'XE7/PG0106-C/ERR029407', 'XF12/PG0102-C/ERR029143', 'XF12/PG0102-CW/ERR045635', 'XG10/PG0109-C/ERR029405']
2016-03-15 10:29:41.718755 :: filter calls: excluding 19937 (1.5%) retaining 1324432 (98.5%) of 1344369 calls

In [15]:
pos = dict()
for cross in CROSSES:
    variants = callsets[cross]['variants']
    pos[cross] = allel.SortedMultiIndex(variants['CHROM'], variants['POS'])

In [16]:
is_snp = dict()
for cross in CROSSES:
    variants = callsets[cross]['variants']
    is_snp[cross] = variants['is_snp']

In [27]:
def distance_to_centromere(row):
    cen_id = 'PF3D7_CEN' + row.chrom[6:8]
    try:
        cen = lkp_feature[cen_id]
        cen_pos = (cen['feature_start'] + cen['feature_stop'])/2
        window_pos = (row.start + row.stop) / 2
        return abs(window_pos - cen_pos)
    except KeyError:
        return None
    
    
def distance_to_telomere(row):
    chrom = row.chrom
    chrlen = len(fasta[chrom])
    l = row.start
    r = chrlen - row.stop
    return min(l, r)


def distance_to_core_edge(row):
    chrom = row.chrom
    core_start = tbl_regions_1b.eq('region_chrom', chrom).eq('region_type', 'Core').values('region_start').min()
    core_stop = tbl_regions_1b.eq('region_chrom', chrom).eq('region_type', 'Core').values('region_stop').max()
    l = row.start - core_start
    r = core_stop - row.stop
    return min(l, r)


def n_coding(row):
    return np.count_nonzero(is_exon[row.chrom][row.start-1:row.stop])


def n_tr(row):
    return np.count_nonzero(is_tr[row.chrom][row.start-1:row.stop])


def gc(row):
    seq = fasta[row.chrom][row.start-1:row.stop].lower()
    com = collections.Counter(seq)
    return (com['g'] + com['c']) / len(seq) 


def n_variants(row):
    nv = ni = ns = 0
    start = row.start
    stop = row.stop
    chrom = row.chrom.encode('ascii')
    for cross in CROSSES:
        try:
            loc = pos[cross].locate_range(chrom, start, stop)
        except KeyError:
            pass
        else:
            x = is_snp[cross][loc]
            ns += nnz(x)
            ni += nnz(~x)
    return ns, ni
    
    
@functools.lru_cache(maxsize=None)
def distance_to_motif(motif, vmax=None):
    tbl_motif = tabulate_motifs(motif)
    lkp_motif = tbl_motif.recordlookup('chrom')
    def _addfield(row):
        chrom = row.chrom
        window_mid = (row.start + row.stop) / 2
        try:
            motifs = lkp_motif[chrom]
        except KeyError:
            if vmax is None:
                return 4000000
            else:
                return vmax
        else:
            motif_mids = [(motif.start + motif.stop) / 2 for motif in motifs]
            nearest = min([abs(window_mid - motif_mid) for motif_mid in motif_mids])
            if vmax is None:
                return nearest
            else:
                return min(nearest, vmax)
    return _addfield

In [28]:
@functools.lru_cache(maxsize=None)
def tabulate_co_predictors(window_size, vmax_motif=None):
    fn = os.path.join(CACHE_DIR, 'tbl_co_predictors.%s.%s.pickle' % (window_size, vmax_motif))
    if not os.path.exists(fn):
        log('building')
    
        # tabulate windows
        tbl_windows = tabulate_core_windows(window_size)

        # count COs in windows
        tbl_windows_co = (
            tbl_windows
            .intervalleftjoin(tbl_co, lkey='chrom', lstart='start', lstop='stop',
                              rkey='chrom', rstart='co_pos_min', rstop='co_pos_max',
                              include_stop=True)
            .cutout(4)
            .aggregate(key=('chrom', 'start', 'stop'),
                       aggregation=lambda vals: sum(1 for v in vals if v is not None),
                       value='cross')
            .rename('value', 'co_count')

        )

        # augment
        tbl_aug = (
            tbl_windows_co
            .addfield('cen_dist', distance_to_centromere)
            .addfield('tel_dist', distance_to_telomere)
            .notnone('cen_dist')
            .addfield('n_coding', n_coding)
            .addfield('gc', gc)
            .addfield('n_variants', n_variants)
            .unpack('n_variants', ('n_snps', 'n_indels'))
            .addfield('motif_jiang_a_dist', distance_to_motif(motif_jiang_a, vmax=vmax_motif))
            .addfield('motif_jiang_b_dist', distance_to_motif(motif_jiang_b, vmax=vmax_motif))
            .addfield('motif_jiang_c_dist', distance_to_motif(motif_jiang_c, vmax=vmax_motif))
            .addfield('motif_jiang_d_dist', distance_to_motif(motif_jiang_d, vmax=vmax_motif))
            .addfield('motif_jiang_e_dist', distance_to_motif(motif_jiang_e, vmax=vmax_motif))
            .addfield('motif_aat_dist', distance_to_motif('AA[TC]AA[TC]AA[TC]AA[TC]', vmax=vmax_motif))
            .addfield('motif_tta_dist', distance_to_motif('TT[AG]TT[AG]TT[AG]TT[AG]', vmax=vmax_motif))
            .addfield('motif_at_dist', distance_to_motif('ATATATATATAT', vmax=vmax_motif))
            .addfield('motif_a_dist', distance_to_motif('AAAAAAAAAAAA', vmax=vmax_motif))
            .addfield('motif_t_dist', distance_to_motif('TTTTTTTTTTTT', vmax=vmax_motif))
        )
        tbl_aug.progress(1000).topickle(fn)

    tbl_aug = (etl
        .frompickle(fn)
        # hack in extra field to avoid completely regenerating
        .addfield('n_tr', n_tr)
        .addfield('core_edge_dist', distance_to_core_edge)
    )
    # convert to dataframe
    df = tbl_aug.todataframe()
    
    return df

Exploratory analysis


In [31]:
df = tabulate_co_predictors(5000, vmax_motif=None)
df.head()


Out[31]:
chrom start stop co_count cen_dist tel_dist n_coding gc n_snps n_indels ... motif_jiang_c_dist motif_jiang_d_dist motif_jiang_e_dist motif_aat_dist motif_tta_dist motif_at_dist motif_a_dist motif_t_dist n_tr core_edge_dist
0 Pf3D7_01_v3 92901 97900 0 363720.5 92901 0 0.1644 29 13 ... 147487.5 544659.5 32.5 961.5 3004.5 389.5 651.5 921.5 849 0
1 Pf3D7_01_v3 97901 102900 0 358720.5 97901 3258 0.2336 8 5 ... 142487.5 539659.5 1484.5 2297.5 1851.5 1760.5 1658.5 1195.5 1115 5000
2 Pf3D7_01_v3 102901 107900 0 353720.5 102901 354 0.1518 8 20 ... 137487.5 534659.5 3158.5 3098.5 2453.5 93.5 525.5 495.5 1164 10000
3 Pf3D7_01_v3 107901 112900 0 348720.5 107901 1805 0.2098 7 18 ... 132487.5 529659.5 8158.5 325.5 501.5 122.5 191.5 530.5 959 15000
4 Pf3D7_01_v3 112901 117900 0 343720.5 112901 2899 0.2128 7 14 ... 127487.5 524659.5 13158.5 5325.5 243.5 1539.5 463.5 1462.5 790 20000

5 rows × 22 columns


In [45]:
df.columns


Out[45]:
Index(['chrom', 'start', 'stop', 'co_count', 'cen_dist', 'tel_dist',
       'n_coding', 'gc', 'n_snps', 'n_indels', 'motif_jiang_a_dist',
       'motif_jiang_b_dist', 'motif_jiang_c_dist', 'motif_jiang_d_dist',
       'motif_jiang_e_dist', 'motif_aat_dist', 'motif_tta_dist',
       'motif_at_dist', 'motif_a_dist', 'motif_t_dist', 'n_tr',
       'core_edge_dist'],
      dtype='object')

In [32]:
for v in ('cen_dist',
          'tel_dist',
          'core_edge_dist'):
    sns.regplot(v, 'co_count', data=df, x_bins=20)
    plt.title(v)
    plt.xlim(left=0)
    plt.ylim(0, 0.8)
    plt.show();



In [33]:
for v in ('motif_jiang_a_dist', 
          'motif_jiang_b_dist', 
          'motif_jiang_c_dist', 
          'motif_jiang_d_dist', 
          'motif_jiang_e_dist'):
    sns.regplot(v, 'co_count', data=df, x_bins=np.arange(0, 200000, 10000))
    plt.xlim(0, 200000)
    plt.ylim(0, 0.8)
    plt.title(v)
    plt.show();



In [36]:
for v in ('motif_aat_dist', 
          'motif_tta_dist',
          'motif_at_dist',
          'motif_a_dist',
          'motif_t_dist',
         ):
    sns.regplot(v, 'co_count', data=df, x_bins=20)
    plt.title(v)
    plt.xlim(left=0)
    plt.ylim(0, .8)
    plt.show();



In [37]:
sns.regplot('n_coding', 'co_count', data=df, x_bins=20)
plt.xlim(left=0)
plt.ylim(0, .8);



In [38]:
sns.regplot('n_tr', 'co_count', data=df, x_bins=20)
plt.xlim(left=0)
plt.ylim(0, .8);



In [39]:
sns.regplot('gc', 'co_count', data=df, x_bins=10)
plt.ylim(0, .8);



In [40]:
sns.regplot('n_snps', 'co_count', data=df, x_bins=20)
plt.xlim(left=0)
plt.ylim(0, .8);



In [41]:
sns.regplot('n_indels', 'co_count', data=df, x_bins=20)
plt.ylim(0, .8);


Modelling


In [81]:
@functools.lru_cache(maxsize=None)
def glm_poisson(window_size, vmax_motif=None, cen_threshold=100000):
    df = tabulate_co_predictors(window_size, vmax_motif=vmax_motif)
    df_cen = df[df.cen_dist <= cen_threshold].copy()
    df_tel = df[df.cen_dist > cen_threshold].copy()

#     # standardise predictors
#     for v in df.columns:
#         if v not in ('chrom', 'start', 'stop', 'co_count'):
#             a = df_cen[v].values
#             df_cen[v] = (a - a.mean()) / a.std(ddof=0)
#             a = df_tel[v].values
#             df_tel[v] = (a - a.mean()) / a.std(ddof=0)

    rd_cen = %R -i df_cen -o rd rd <- glm(co_count ~ cen_dist + tel_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df_cen, family = poisson)
    summ_cen = %R summary(rd)
    t_cen = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")

    rd_tel = %R -i df_tel -o rd rd <- glm(co_count ~ cen_dist + tel_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df_tel, family = poisson)
    summ_tel = %R summary(rd)
    t_tel = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
    
    return summ_cen, t_cen, summ_tel, t_tel

In [94]:
@functools.lru_cache(maxsize=None)
def glm_poisson_v2(window_size, vmax_motif=None, cen_threshold=100000):
    df = tabulate_co_predictors(window_size, vmax_motif=vmax_motif).copy()
    
    # clip distances
    df['cen_dist'] = df.cen_dist.clip(upper=cen_threshold)
    df['core_edge_dist'] = df.core_edge_dist.clip(upper=cen_threshold)

    # standardise predictors
    for v in df.columns:
        if v not in ('chrom', 'start', 'stop', 'co_count'):
            a = df[v].values
            df[v] = (a - a.mean()) / a.std(ddof=0)
            
    rd = %R -i df -o rd rd <- glm(co_count ~ cen_dist + core_edge_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df, family = poisson)
    summ = %R summary(rd)
    t = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")

    return summ, t

In [95]:
@functools.lru_cache(maxsize=None)
def glm_poisson_v3(window_size, vmax_motif=None, cen_threshold=100000):
    df = tabulate_co_predictors(window_size, vmax_motif=vmax_motif)
    df_edge = df[(df.cen_dist <= cen_threshold) | (df.core_edge_dist <= cen_threshold)].copy()
    df_main = df[(df.cen_dist > cen_threshold) & (df.core_edge_dist > cen_threshold)].copy()
    
    # standardise predictors
    for v in df.columns:
        if v not in ('chrom', 'start', 'stop', 'co_count'):
            a = df_edge[v].values
            df_edge[v] = (a - a.mean()) / a.std(ddof=0)
            a = df_main[v].values
            df_main[v] = (a - a.mean()) / a.std(ddof=0)


    rd_edge = %R -i df_edge -o rd rd <- glm(co_count ~ cen_dist + core_edge_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df_edge, family = poisson)
    summ_edge = %R summary(rd)
    t_edge = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")

    rd_main = %R -i df_main -o rd rd <- glm(co_count ~ cen_dist + core_edge_dist + n_coding + n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + motif_t_dist, data = df_main, family = poisson)
    summ_main = %R summary(rd)
    t_main = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
    
    return summ_edge, t_edge, summ_main, t_main

In [96]:
glm_results = dict()

vmax_motif = 100000
for window_size in 50000, 20000, 10000, 5000, 2000, 1000:
    summ_cen, t_cen, summ_tel, t_tel = glm_poisson(window_size, vmax_motif=vmax_motif)
    glm_results[window_size] = summ_cen, t_cen, summ_tel, t_tel
    print('========================================')
    print(window_size)
    print(summ_cen)
    print(t_cen)
    print(summ_tel)
    print(t_tel)


========================================
50000

Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_cen)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.15545  -0.99226  -0.02544   0.57703   1.89893  

Coefficients: (2 not defined because of singularities)
                     Estimate Std. Error z value Pr(>|z|)   
(Intercept)        -1.233e+01  6.827e+00  -1.807  0.07084 . 
cen_dist           -1.169e-06  4.285e-06  -0.273  0.78490   
tel_dist            2.748e-08  3.072e-07   0.089  0.92873   
n_coding            2.711e-05  3.110e-05   0.872  0.38329   
n_tr                4.551e-05  1.447e-04   0.314  0.75319   
gc                  7.636e+00  1.283e+01   0.595  0.55183   
n_snps             -4.634e-04  4.103e-04  -1.129  0.25871   
n_indels            1.335e-02  4.580e-03   2.914  0.00357 **
motif_jiang_a_dist  8.730e-05  6.500e-05   1.343  0.17926   
motif_jiang_b_dist         NA         NA      NA       NA   
motif_jiang_c_dist -3.155e-06  3.539e-06  -0.891  0.37267   
motif_jiang_d_dist         NA         NA      NA       NA   
motif_jiang_e_dist -1.352e-05  7.499e-06  -1.803  0.07136 . 
motif_aat_dist      3.898e-05  2.938e-05   1.327  0.18460   
motif_tta_dist     -4.104e-05  1.835e-05  -2.236  0.02533 * 
motif_at_dist       2.774e-05  1.669e-04   0.166  0.86800   
motif_a_dist        8.517e-05  1.069e-04   0.797  0.42570   
motif_t_dist       -5.701e-05  1.371e-04  -0.416  0.67764   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 107.664  on 58  degrees of freedom
Residual deviance:  56.442  on 43  degrees of freedom
AIC: 238.89

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = -1.1338, p-value = 0.8716
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
-0.1512691 



Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7021  -0.8178  -0.1550   0.5811   2.8176  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -4.035e+00  1.506e+00  -2.679  0.00738 ** 
cen_dist            3.286e-08  9.081e-08   0.362  0.71748    
tel_dist            6.140e-08  9.771e-08   0.628  0.52975    
n_coding            5.514e-05  8.775e-06   6.284 3.30e-10 ***
n_tr                1.913e-04  4.703e-05   4.068 4.75e-05 ***
gc                 -2.279e+00  4.454e+00  -0.512  0.60880    
n_snps              5.302e-04  5.358e-04   0.990  0.32241    
n_indels            2.828e-03  1.617e-03   1.749  0.08034 .  
motif_jiang_a_dist  4.314e-06  2.055e-05   0.210  0.83372    
motif_jiang_b_dist  1.277e-05  1.646e-05   0.776  0.43791    
motif_jiang_c_dist -1.787e-06  1.119e-06  -1.597  0.11034    
motif_jiang_d_dist -2.811e-06  1.828e-06  -1.537  0.12421    
motif_jiang_e_dist -1.938e-06  3.068e-06  -0.632  0.52757    
motif_aat_dist      1.482e-05  9.748e-06   1.521  0.12837    
motif_tta_dist     -1.591e-06  9.736e-06  -0.163  0.87017    
motif_at_dist      -2.776e-05  4.253e-05  -0.653  0.51390    
motif_a_dist        3.992e-06  3.257e-05   0.123  0.90245    
motif_t_dist        2.733e-05  3.512e-05   0.778  0.43646    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 523.03  on 345  degrees of freedom
Residual deviance: 374.21  on 328  degrees of freedom
AIC: 1309.5

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = -0.11731, p-value = 0.5467
alternative hypothesis: true alpha is greater than 0
sample estimates:
       alpha 
-0.008093398 


========================================
20000

Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3594  -1.0870  -0.2651   0.7026   3.1278  

Coefficients: (1 not defined because of singularities)
                     Estimate Std. Error z value Pr(>|z|)   
(Intercept)         8.506e+01  7.474e+03   0.011  0.99092   
cen_dist            1.027e-05  3.431e-06   2.993  0.00276 **
tel_dist            8.743e-08  2.674e-07   0.327  0.74371   
n_coding            1.083e-04  5.444e-05   1.989  0.04672 * 
n_tr                4.241e-04  1.968e-04   2.156  0.03111 * 
gc                 -5.375e+00  1.005e+01  -0.535  0.59272   
n_snps              3.377e-04  5.433e-04   0.622  0.53419   
n_indels            1.751e-03  6.311e-03   0.277  0.78140   
motif_jiang_a_dist -9.250e-03  8.176e-01  -0.011  0.99097   
motif_jiang_b_dist  8.373e-03  7.429e-01   0.011  0.99101   
motif_jiang_c_dist  1.027e-06  2.789e-06   0.368  0.71266   
motif_jiang_d_dist         NA         NA      NA       NA   
motif_jiang_e_dist -2.111e-06  6.939e-06  -0.304  0.76092   
motif_aat_dist      1.219e-06  2.394e-05   0.051  0.95941   
motif_tta_dist     -1.381e-05  1.779e-05  -0.777  0.43732   
motif_at_dist      -1.142e-04  1.391e-04  -0.821  0.41149   
motif_a_dist        4.640e-05  8.008e-05   0.579  0.56227   
motif_t_dist        7.787e-05  1.025e-04   0.760  0.44728   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 223.71  on 135  degrees of freedom
Residual deviance: 171.01  on 119  degrees of freedom
AIC: 419.32

Number of Fisher Scoring iterations: 13



	Overdispersion test

data:  rd
z = 1.1694, p-value = 0.1211
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1608989 



Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0295  -1.3544  -0.2083   0.5801   3.6192  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -2.685e+00  1.220e+00  -2.202   0.0277 *  
cen_dist           -1.487e-07  8.664e-08  -1.716   0.0862 .  
tel_dist            7.348e-08  9.136e-08   0.804   0.4212    
n_coding            1.078e-04  1.601e-05   6.732 1.67e-11 ***
n_tr                3.429e-04  7.645e-05   4.486 7.27e-06 ***
gc                 -3.516e+00  3.310e+00  -1.062   0.2881    
n_snps             -3.248e-06  9.503e-04  -0.003   0.9973    
n_indels            1.996e-03  2.544e-03   0.785   0.4326    
motif_jiang_a_dist -1.528e-05  2.075e-05  -0.736   0.4615    
motif_jiang_b_dist  2.562e-05  1.714e-05   1.495   0.1350    
motif_jiang_c_dist -6.899e-07  1.082e-06  -0.638   0.5236    
motif_jiang_d_dist -2.063e-06  1.851e-06  -1.115   0.2650    
motif_jiang_e_dist -2.166e-06  3.095e-06  -0.700   0.4839    
motif_aat_dist      1.393e-05  8.088e-06   1.722   0.0850 .  
motif_tta_dist      6.023e-07  8.292e-06   0.073   0.9421    
motif_at_dist      -9.469e-05  4.859e-05  -1.949   0.0513 .  
motif_a_dist       -3.050e-05  3.599e-05  -0.847   0.3968    
motif_t_dist        6.633e-05  3.488e-05   1.902   0.0572 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1192.0  on 847  degrees of freedom
Residual deviance: 1079.3  on 830  degrees of freedom
AIC: 2463.7

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = 2.553, p-value = 0.00534
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1372251 


========================================
10000

Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9436  -1.0547  -0.5922   0.5160   2.5825  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         3.892e+00  1.196e+01   0.325  0.74487    
cen_dist            1.198e-05  3.072e-06   3.900 9.63e-05 ***
tel_dist           -2.784e-08  2.522e-07  -0.110  0.91210    
n_coding            2.630e-04  8.136e-05   3.232  0.00123 ** 
n_tr                3.623e-04  3.107e-04   1.166  0.24352    
gc                 -1.201e+01  7.095e+00  -1.693  0.09049 .  
n_snps             -8.782e-04  1.626e-03  -0.540  0.58905    
n_indels            8.180e-03  9.080e-03   0.901  0.36767    
motif_jiang_a_dist -2.260e-02  1.071e+00  -0.021  0.98317    
motif_jiang_b_dist  1.957e-02  9.293e-01   0.021  0.98320    
motif_jiang_c_dist  1.560e-06  2.582e-06   0.604  0.54558    
motif_jiang_d_dist  2.981e-03  1.419e-01   0.021  0.98324    
motif_jiang_e_dist  9.462e-07  6.573e-06   0.144  0.88554    
motif_aat_dist      3.342e-06  2.194e-05   0.152  0.87891    
motif_tta_dist     -1.403e-05  1.753e-05  -0.801  0.42335    
motif_at_dist       7.850e-05  1.208e-04   0.650  0.51597    
motif_a_dist       -9.295e-05  9.102e-05  -1.021  0.30721    
motif_t_dist       -1.457e-04  1.024e-04  -1.423  0.15460    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 346.96  on 259  degrees of freedom
Residual deviance: 286.87  on 242  degrees of freedom
AIC: 595.4

Number of Fisher Scoring iterations: 14



	Overdispersion test

data:  rd
z = 1.4518, p-value = 0.07328
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1038759 



Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1439  -1.1618  -0.8437   0.4521   4.0794  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -3.274e+00  1.206e+00  -2.714  0.00665 ** 
cen_dist           -1.790e-07  7.958e-08  -2.250  0.02445 *  
tel_dist            6.901e-08  8.333e-08   0.828  0.40753    
n_coding            1.517e-04  2.571e-05   5.900 3.64e-09 ***
n_tr                4.903e-04  1.096e-04   4.475 7.64e-06 ***
gc                 -5.264e-01  2.384e+00  -0.221  0.82521    
n_snps             -1.372e-03  1.652e-03  -0.831  0.40617    
n_indels           -5.375e-03  3.594e-03  -1.496  0.13478    
motif_jiang_a_dist -1.739e-05  2.198e-05  -0.791  0.42885    
motif_jiang_b_dist  3.568e-05  1.811e-05   1.970  0.04887 *  
motif_jiang_c_dist -1.040e-06  1.010e-06  -1.030  0.30323    
motif_jiang_d_dist -2.795e-06  1.677e-06  -1.666  0.09562 .  
motif_jiang_e_dist -2.571e-06  2.832e-06  -0.908  0.36400    
motif_aat_dist      6.733e-06  8.050e-06   0.836  0.40294    
motif_tta_dist     -1.275e-05  8.145e-06  -1.566  0.11735    
motif_at_dist       5.075e-05  4.025e-05   1.261  0.20737    
motif_a_dist       -9.023e-05  3.151e-05  -2.863  0.00419 ** 
motif_t_dist       -1.437e-05  3.114e-05  -0.462  0.64440    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 2098.6  on 1685  degrees of freedom
Residual deviance: 1980.0  on 1668  degrees of freedom
AIC: 3806.7

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 3.4687, p-value = 0.0002615
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1441716 


========================================
5000

Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4379  -0.9076  -0.6826   0.3888   2.6451  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         5.995e+00  1.089e+01   0.550 0.582075    
cen_dist            1.287e-05  2.784e-06   4.625 3.75e-06 ***
tel_dist            2.141e-07  2.269e-07   0.944 0.345316    
n_coding            3.992e-04  1.199e-04   3.329 0.000873 ***
n_tr                9.992e-04  4.246e-04   2.353 0.018606 *  
gc                 -5.904e+00  4.862e+00  -1.215 0.224555    
n_snps             -8.449e-03  1.042e-02  -0.811 0.417464    
n_indels            4.682e-03  1.249e-02   0.375 0.707748    
motif_jiang_a_dist -4.588e-02  2.696e+00  -0.017 0.986420    
motif_jiang_b_dist  3.979e-02  2.339e+00   0.017 0.986424    
motif_jiang_c_dist  1.822e-06  2.478e-06   0.735 0.462317    
motif_jiang_d_dist  5.997e-03  3.571e-01   0.017 0.986600    
motif_jiang_e_dist  1.420e-06  6.145e-06   0.231 0.817227    
motif_aat_dist      1.010e-05  1.977e-05   0.511 0.609263    
motif_tta_dist     -6.328e-06  1.573e-05  -0.402 0.687394    
motif_at_dist      -1.342e-04  1.026e-04  -1.309 0.190576    
motif_a_dist        7.424e-05  7.217e-05   1.029 0.303634    
motif_t_dist        7.697e-05  7.765e-05   0.991 0.321554    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 546.68  on 509  degrees of freedom
Residual deviance: 475.00  on 492  degrees of freedom
AIC: 852.75

Number of Fisher Scoring iterations: 16



	Overdispersion test

data:  rd
z = 1.7742, p-value = 0.03801
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.09381233 



Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5729  -0.9484  -0.8250   0.6113   4.2226  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -3.159e+00  1.099e+00  -2.875  0.00404 ** 
cen_dist           -2.864e-07  7.235e-08  -3.959 7.52e-05 ***
tel_dist            1.152e-07  7.378e-08   1.561  0.11854    
n_coding            2.285e-04  3.988e-05   5.730 1.00e-08 ***
n_tr                7.199e-04  1.576e-04   4.566 4.96e-06 ***
gc                 -1.892e+00  1.656e+00  -1.143  0.25316    
n_snps             -5.544e-03  3.260e-03  -1.700  0.08907 .  
n_indels           -1.602e-02  5.025e-03  -3.187  0.00144 ** 
motif_jiang_a_dist -1.997e-05  2.096e-05  -0.953  0.34066    
motif_jiang_b_dist  4.085e-05  1.733e-05   2.357  0.01842 *  
motif_jiang_c_dist -1.668e-06  9.289e-07  -1.796  0.07246 .  
motif_jiang_d_dist -3.053e-06  1.524e-06  -2.003  0.04513 *  
motif_jiang_e_dist -1.826e-06  2.551e-06  -0.716  0.47417    
motif_aat_dist      1.007e-05  7.131e-06   1.412  0.15793    
motif_tta_dist     -1.226e-05  7.281e-06  -1.684  0.09221 .  
motif_at_dist      -8.354e-06  3.742e-05  -0.223  0.82334    
motif_a_dist       -5.811e-05  2.878e-05  -2.019  0.04347 *  
motif_t_dist        3.208e-05  2.879e-05   1.114  0.26514    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 3639.6  on 3363  degrees of freedom
Residual deviance: 3484.3  on 3346  degrees of freedom
AIC: 5846.7

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 5.7684, p-value = 4.002e-09
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1976622 


========================================
2000

Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1967  -0.7081  -0.5618  -0.3948   3.3324  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         4.642e+00  8.627e+00   0.538  0.59056    
cen_dist            1.349e-05  2.362e-06   5.710 1.13e-08 ***
tel_dist            5.147e-08  1.985e-07   0.259  0.79546    
n_coding            8.249e-04  1.923e-04   4.291 1.78e-05 ***
n_tr                9.985e-04  6.108e-04   1.635  0.10212    
gc                 -7.723e+00  2.939e+00  -2.628  0.00859 ** 
n_snps             -7.185e-03  1.144e-02  -0.628  0.52991    
n_indels            1.599e-02  1.843e-02   0.868  0.38558    
motif_jiang_a_dist -1.568e-01  6.409e+00  -0.024  0.98048    
motif_jiang_b_dist  1.360e-01  5.560e+00   0.024  0.98049    
motif_jiang_c_dist -3.230e-07  2.190e-06  -0.148  0.88273    
motif_jiang_d_dist  2.073e-02  8.490e-01   0.024  0.98052    
motif_jiang_e_dist  2.763e-06  5.395e-06   0.512  0.60851    
motif_aat_dist      1.084e-05  1.810e-05   0.599  0.54901    
motif_tta_dist     -5.082e-06  1.368e-05  -0.372  0.71017    
motif_at_dist      -2.432e-05  8.172e-05  -0.298  0.76601    
motif_a_dist        5.310e-05  5.849e-05   0.908  0.36399    
motif_t_dist        1.966e-05  6.987e-05   0.281  0.77841    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 983.12  on 1250  degrees of freedom
Residual deviance: 894.95  on 1233  degrees of freedom
AIC: 1422.5

Number of Fisher Scoring iterations: 18



	Overdispersion test

data:  rd
z = 2.4498, p-value = 0.007146
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1130616 



Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3376  -0.7603  -0.6647  -0.5045   4.8691  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -2.692e+00  8.357e-01  -3.221  0.00128 ** 
cen_dist           -4.304e-07  5.988e-08  -7.187 6.61e-13 ***
tel_dist            2.262e-07  5.802e-08   3.898 9.68e-05 ***
n_coding            3.288e-04  6.204e-05   5.300 1.16e-07 ***
n_tr                6.824e-04  2.269e-04   3.008  0.00263 ** 
gc                 -2.635e+00  9.643e-01  -2.733  0.00628 ** 
n_snps             -1.435e-02  5.420e-03  -2.648  0.00809 ** 
n_indels           -3.473e-02  7.005e-03  -4.958 7.12e-07 ***
motif_jiang_a_dist -2.157e-05  1.662e-05  -1.298  0.19444    
motif_jiang_b_dist  3.971e-05  1.387e-05   2.863  0.00419 ** 
motif_jiang_c_dist -2.503e-06  7.663e-07  -3.267  0.00109 ** 
motif_jiang_d_dist -2.112e-06  1.299e-06  -1.626  0.10405    
motif_jiang_e_dist -7.014e-07  2.055e-06  -0.341  0.73283    
motif_aat_dist      1.215e-05  5.724e-06   2.122  0.03385 *  
motif_tta_dist     -5.616e-06  5.768e-06  -0.974  0.33020    
motif_at_dist       1.847e-05  2.915e-05   0.634  0.52626    
motif_a_dist       -4.453e-05  2.267e-05  -1.965  0.04947 *  
motif_t_dist        5.176e-05  2.208e-05   2.344  0.01907 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 7434.2  on 8405  degrees of freedom
Residual deviance: 7149.7  on 8388  degrees of freedom
AIC: 10806

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 8.7819, p-value < 2.2e-16
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.2469231 


========================================
1000

Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3726  -0.6181  -0.4960  -0.3754   3.5725  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.076e+00  6.108e+00  -0.176  0.86011    
cen_dist            1.413e-05  1.924e-06   7.343 2.09e-13 ***
tel_dist            6.142e-08  1.637e-07   0.375  0.70758    
n_coding            1.058e-03  2.604e-04   4.063 4.85e-05 ***
n_tr                8.856e-04  7.596e-04   1.166  0.24367    
gc                 -5.144e+00  1.903e+00  -2.704  0.00686 ** 
n_snps             -2.366e-02  1.952e-02  -1.212  0.22546    
n_indels            2.339e-02  2.216e-02   1.055  0.29130    
motif_jiang_a_dist -3.437e-01  9.185e+00  -0.037  0.97015    
motif_jiang_b_dist  2.982e-01  7.968e+00   0.037  0.97015    
motif_jiang_c_dist -9.773e-07  1.847e-06  -0.529  0.59674    
motif_jiang_d_dist  4.556e-02  1.217e+00   0.037  0.97013    
motif_jiang_e_dist  3.309e-06  4.498e-06   0.736  0.46192    
motif_aat_dist      1.201e-05  1.505e-05   0.798  0.42503    
motif_tta_dist     -1.139e-05  1.173e-05  -0.971  0.33164    
motif_at_dist      -1.537e-05  6.332e-05  -0.243  0.80818    
motif_a_dist        8.245e-05  4.629e-05   1.781  0.07488 .  
motif_t_dist        4.256e-05  5.542e-05   0.768  0.44255    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1642.3  on 2507  degrees of freedom
Residual deviance: 1520.8  on 2490  degrees of freedom
AIC: 2294

Number of Fisher Scoring iterations: 19



	Overdispersion test

data:  rd
z = 2.6988, p-value = 0.00348
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.09264667 



Call:
glm(formula = co_count ~ cen_dist + tel_dist + n_coding + n_tr + 
    gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4239  -0.6847  -0.5884  -0.4686   5.4210  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -2.776e+00  6.476e-01  -4.287 1.81e-05 ***
cen_dist           -4.994e-07  4.842e-08 -10.315  < 2e-16 ***
tel_dist            3.136e-07  4.535e-08   6.916 4.63e-12 ***
n_coding            4.961e-04  8.278e-05   5.993 2.07e-09 ***
n_tr                1.119e-03  2.662e-04   4.205 2.61e-05 ***
gc                 -1.642e+00  6.097e-01  -2.693 0.007089 ** 
n_snps             -2.799e-02  7.323e-03  -3.821 0.000133 ***
n_indels           -5.812e-02  8.307e-03  -6.997 2.62e-12 ***
motif_jiang_a_dist -3.132e-05  1.445e-05  -2.168 0.030191 *  
motif_jiang_b_dist  4.656e-05  1.242e-05   3.749 0.000177 ***
motif_jiang_c_dist -3.499e-06  6.207e-07  -5.637 1.73e-08 ***
motif_jiang_d_dist -1.950e-06  1.052e-06  -1.854 0.063796 .  
motif_jiang_e_dist  7.633e-07  1.624e-06   0.470 0.638399    
motif_aat_dist      1.287e-05  4.545e-06   2.832 0.004623 ** 
motif_tta_dist     -6.349e-06  4.594e-06  -1.382 0.167022    
motif_at_dist       2.174e-05  2.264e-05   0.960 0.337052    
motif_a_dist       -3.521e-05  1.779e-05  -1.980 0.047727 *  
motif_t_dist        7.250e-05  1.717e-05   4.221 2.43e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 13276  on 16790  degrees of freedom
Residual deviance: 12715  on 16773  degrees of freedom
AIC: 18452

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 12.046, p-value < 2.2e-16
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.2706418 



In [97]:
glm_results_v2 = dict()

vmax_motif = 100000
for window_size in 50000, 20000, 10000, 5000, 2000, 1000:
    summ, t = glm_poisson_v2(window_size, vmax_motif=vmax_motif)
    glm_results_v2[window_size] = summ, t
    print('========================================')
    print(window_size)
    print(summ)
    print(t)


========================================
50000

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6448  -0.8773  -0.1451   0.5901   2.7139  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         0.9537963  0.0337909  28.226  < 2e-16 ***
cen_dist            0.0481926  0.0347353   1.387  0.16531    
core_edge_dist      0.0005682  0.0535129   0.011  0.99153    
n_coding            0.2935413  0.0468073   6.271 3.58e-10 ***
n_tr                0.2318742  0.0580502   3.994 6.49e-05 ***
gc                 -0.0267640  0.0443534  -0.603  0.54622    
n_snps              0.0157451  0.0325115   0.484  0.62818    
n_indels            0.1722385  0.0617032   2.791  0.00525 ** 
motif_jiang_a_dist  0.0512677  0.1633442   0.314  0.75363    
motif_jiang_b_dist  0.0906028  0.1495637   0.606  0.54466    
motif_jiang_c_dist -0.0527997  0.0315332  -1.674  0.09405 .  
motif_jiang_d_dist -0.0473472  0.0296410  -1.597  0.11019    
motif_jiang_e_dist -0.0407283  0.0341438  -1.193  0.23293    
motif_aat_dist      0.0557053  0.0303719   1.834  0.06664 .  
motif_tta_dist     -0.0369530  0.0303290  -1.218  0.22307    
motif_at_dist      -0.0314104  0.0544076  -0.577  0.56373    
motif_a_dist        0.0137204  0.0470266   0.292  0.77047    
motif_t_dist        0.0238585  0.0421201   0.566  0.57109    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 630.93  on 404  degrees of freedom
Residual deviance: 444.04  on 387  degrees of freedom
AIC: 1529.8

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = -0.014771, p-value = 0.5059
alternative hypothesis: true alpha is greater than 0
sample estimates:
        alpha 
-0.0009153641 


========================================
20000

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1901  -1.3214  -0.2319   0.6021   3.3668  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         0.143459   0.031016   4.625 3.74e-06 ***
cen_dist            0.085107   0.034762   2.448   0.0144 *  
core_edge_dist      0.069661   0.048228   1.444   0.1486    
n_coding            0.324358   0.046244   7.014 2.31e-12 ***
n_tr                0.284970   0.047429   6.008 1.87e-09 ***
gc                 -0.025698   0.042938  -0.598   0.5495    
n_snps              0.010103   0.031390   0.322   0.7476    
n_indels            0.041142   0.046515   0.884   0.3764    
motif_jiang_a_dist -0.125075   0.110501  -1.132   0.2577    
motif_jiang_b_dist  0.146155   0.120960   1.208   0.2269    
motif_jiang_c_dist -0.020285   0.030557  -0.664   0.5068    
motif_jiang_d_dist -0.023116   0.028908  -0.800   0.4239    
motif_jiang_e_dist -0.019890   0.033643  -0.591   0.5544    
motif_aat_dist      0.050610   0.028400   1.782   0.0747 .  
motif_tta_dist     -0.006402   0.029990  -0.213   0.8310    
motif_at_dist      -0.110845   0.051254  -2.163   0.0306 *  
motif_a_dist       -0.030737   0.041595  -0.739   0.4599    
motif_t_dist        0.091129   0.039819   2.289   0.0221 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1416.4  on 983  degrees of freedom
Residual deviance: 1269.2  on 966  degrees of freedom
AIC: 2867.9

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = 3.146, p-value = 0.0008275
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1571471 


========================================
10000

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1691  -1.1746  -0.7726   0.4405   3.7965  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -0.404395   0.029059 -13.916  < 2e-16 ***
cen_dist            0.090858   0.032274   2.815  0.00487 ** 
core_edge_dist      0.102666   0.045426   2.260  0.02382 *  
n_coding            0.316461   0.047733   6.630 3.36e-11 ***
n_tr                0.243449   0.043118   5.646 1.64e-08 ***
gc                  0.002058   0.040678   0.051  0.95964    
n_snps             -0.058458   0.063013  -0.928  0.35355    
n_indels           -0.038449   0.041167  -0.934  0.35031    
motif_jiang_a_dist -0.140275   0.097132  -1.444  0.14869    
motif_jiang_b_dist  0.169892   0.113915   1.491  0.13586    
motif_jiang_c_dist -0.023875   0.028718  -0.831  0.40576    
motif_jiang_d_dist -0.031487   0.026095  -1.207  0.22758    
motif_jiang_e_dist -0.017548   0.030921  -0.568  0.57036    
motif_aat_dist      0.022803   0.027427   0.831  0.40574    
motif_tta_dist     -0.048141   0.029359  -1.640  0.10107    
motif_at_dist       0.054541   0.047126   1.157  0.24713    
motif_a_dist       -0.112827   0.039761  -2.838  0.00455 ** 
motif_t_dist       -0.038586   0.037988  -1.016  0.30976    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 2445.8  on 1945  degrees of freedom
Residual deviance: 2300.7  on 1928  degrees of freedom
AIC: 4400

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = 4.41, p-value = 5.17e-06
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1580157 


========================================
5000

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6419  -0.9551  -0.8298   0.6071   4.0492  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -0.91684    0.02694 -34.032  < 2e-16 ***
cen_dist            0.11610    0.03000   3.870 0.000109 ***
core_edge_dist      0.14802    0.04194   3.529 0.000417 ***
n_coding            0.32251    0.04780   6.747 1.50e-11 ***
n_tr                0.25092    0.03983   6.300 2.97e-10 ***
gc                 -0.02885    0.03907  -0.738 0.460229    
n_snps             -0.14276    0.08298  -1.720 0.085354 .  
n_indels           -0.09593    0.03651  -2.627 0.008602 ** 
motif_jiang_a_dist -0.14386    0.08817  -1.632 0.102734    
motif_jiang_b_dist  0.17325    0.10609   1.633 0.102452    
motif_jiang_c_dist -0.04132    0.02653  -1.557 0.119407    
motif_jiang_d_dist -0.03136    0.02359  -1.330 0.183659    
motif_jiang_e_dist -0.00638    0.02805  -0.227 0.820082    
motif_aat_dist      0.03767    0.02457   1.534 0.125151    
motif_tta_dist     -0.04005    0.02651  -1.511 0.130819    
motif_at_dist      -0.03892    0.04289  -0.907 0.364190    
motif_a_dist       -0.05609    0.03554  -1.578 0.114528    
motif_t_dist        0.04513    0.03275   1.378 0.168185    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 4186.5  on 3873  degrees of freedom
Residual deviance: 4006.2  on 3856  degrees of freedom
AIC: 6710.3

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 6.8011, p-value = 5.192e-12
alternative hypothesis: true alpha is greater than 0
sample estimates:
   alpha 
0.198634 


========================================
2000

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2979  -0.7583  -0.6736  -0.4791   4.7057  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.442611   0.022168 -65.075  < 2e-16 ***
cen_dist            0.147582   0.025346   5.823 5.79e-09 ***
core_edge_dist      0.220620   0.036094   6.112 9.82e-10 ***
n_coding            0.259458   0.040037   6.480 9.14e-11 ***
n_tr                0.146468   0.032925   4.449 8.64e-06 ***
gc                 -0.085575   0.033385  -2.563 0.010369 *  
n_snps             -0.180362   0.076095  -2.370 0.017777 *  
n_indels           -0.129661   0.028971  -4.475 7.62e-06 ***
motif_jiang_a_dist -0.144232   0.067260  -2.144 0.032000 *  
motif_jiang_b_dist  0.123770   0.083104   1.489 0.136402    
motif_jiang_c_dist -0.077143   0.022200  -3.475 0.000511 ***
motif_jiang_d_dist -0.008378   0.020091  -0.417 0.676683    
motif_jiang_e_dist  0.004351   0.022963   0.189 0.849720    
motif_aat_dist      0.042096   0.019962   2.109 0.034959 *  
motif_tta_dist     -0.022017   0.021283  -1.035 0.300901    
motif_at_dist       0.008203   0.033022   0.248 0.803821    
motif_a_dist       -0.048065   0.027825  -1.727 0.084095 .  
motif_t_dist        0.054082   0.025748   2.100 0.035690 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 8422.6  on 9656  degrees of freedom
Residual deviance: 8132.0  on 9639  degrees of freedom
AIC: 12280

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 10.1, p-value < 2.2e-16
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.2470581 


========================================
1000

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2738  -0.6799  -0.5974  -0.4606   5.2144  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.708745   0.018142 -94.185  < 2e-16 ***
cen_dist            0.174764   0.021121   8.274  < 2e-16 ***
core_edge_dist      0.249349   0.029692   8.398  < 2e-16 ***
n_coding            0.220913   0.031262   7.066 1.59e-12 ***
n_tr                0.131888   0.024867   5.304 1.13e-07 ***
gc                 -0.069574   0.026260  -2.649 0.008062 ** 
n_snps             -0.222301   0.058123  -3.825 0.000131 ***
n_indels           -0.144568   0.022517  -6.420 1.36e-10 ***
motif_jiang_a_dist -0.179807   0.059675  -3.013 0.002586 ** 
motif_jiang_b_dist  0.154784   0.075239   2.057 0.039665 *  
motif_jiang_c_dist -0.109046   0.018065  -6.036 1.58e-09 ***
motif_jiang_d_dist -0.001787   0.016259  -0.110 0.912507    
motif_jiang_e_dist  0.021231   0.018305   1.160 0.246098    
motif_aat_dist      0.044707   0.015903   2.811 0.004935 ** 
motif_tta_dist     -0.030727   0.017124  -1.794 0.072752 .  
motif_at_dist       0.012561   0.025605   0.491 0.623725    
motif_a_dist       -0.036343   0.021760  -1.670 0.094876 .  
motif_t_dist        0.078743   0.019984   3.940 8.14e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 14938  on 19298  degrees of freedom
Residual deviance: 14398  on 19281  degrees of freedom
AIC: 20871

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 13.902, p-value < 2.2e-16
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.2729437 



In [86]:
glm_results_v3 = dict()

vmax_motif = 100000
for window_size in 50000, 20000, 10000, 5000, 2000, 1000:
    summ_edge, t_edge, summ_main, t_main = glm_poisson_v3(window_size, vmax_motif=vmax_motif)
    glm_results_v3[window_size] = summ_edge, t_edge, summ_main, t_main
    print('========================================')
    print(window_size)
    print('==================== edge ====================')
    print(summ_edge)
    print(t_edge)
    print('==================== main ====================')
    print(summ_main)
    print(t_main)


/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL,  : 
  object 'fit' not found

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: In addition: 
  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: Warning messages:

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 1: glm.fit: fitted rates numerically 0 occurred 

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 2: glm.fit: fitted rates numerically 0 occurred 

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 3: glm.fit: fitted rates numerically 0 occurred 

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 4: glm.fit: fitted rates numerically 0 occurred 

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 5: 
  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: In glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL,  :
  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 
 
  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning:  no observations informative at iteration 1

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 6: glm.fit: algorithm did not converge 

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 1: 
  res = super(Function, self).__call__(*new_args, **new_kwargs)
Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL,  : 
  object 'fit' not found
========================================
50000
==================== edge ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1282  -0.8290  -0.1567   0.4204   2.3169  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         0.699145   0.073620   9.497  < 2e-16 ***
cen_dist            0.123472   0.075948   1.626  0.10400    
core_edge_dist      0.047892   0.066182   0.724  0.46929    
n_coding            0.303519   0.097856   3.102  0.00192 ** 
n_tr                0.258031   0.125048   2.063  0.03907 *  
gc                  0.030931   0.105549   0.293  0.76949    
n_snps             -0.033437   0.067698  -0.494  0.62137    
n_indels            0.444417   0.138383   3.212  0.00132 ** 
motif_jiang_a_dist -0.001399   0.297662  -0.005  0.99625    
motif_jiang_b_dist  0.111600   0.256953   0.434  0.66406    
motif_jiang_c_dist -0.062777   0.063721  -0.985  0.32454    
motif_jiang_d_dist  0.023316   0.090361   0.258  0.79638    
motif_jiang_e_dist -0.054049   0.069420  -0.779  0.43623    
motif_aat_dist      0.015013   0.054535   0.275  0.78309    
motif_tta_dist     -0.157664   0.058861  -2.679  0.00739 ** 
motif_at_dist      -0.146242   0.116996  -1.250  0.21131    
motif_a_dist        0.108273   0.106389   1.018  0.30881    
motif_t_dist        0.081539   0.082587   0.987  0.32349    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 264.91  on 134  degrees of freedom
Residual deviance: 130.33  on 117  degrees of freedom
AIC: 486.35

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = -0.64469, p-value = 0.7404
alternative hypothesis: true alpha is greater than 0
sample estimates:
      alpha 
-0.06343747 


==================== main ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1282  -0.8290  -0.1567   0.4204   2.3169  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         0.699145   0.073620   9.497  < 2e-16 ***
cen_dist            0.123472   0.075948   1.626  0.10400    
core_edge_dist      0.047892   0.066182   0.724  0.46929    
n_coding            0.303519   0.097856   3.102  0.00192 ** 
n_tr                0.258031   0.125048   2.063  0.03907 *  
gc                  0.030931   0.105549   0.293  0.76949    
n_snps             -0.033437   0.067698  -0.494  0.62137    
n_indels            0.444417   0.138383   3.212  0.00132 ** 
motif_jiang_a_dist -0.001399   0.297662  -0.005  0.99625    
motif_jiang_b_dist  0.111600   0.256953   0.434  0.66406    
motif_jiang_c_dist -0.062777   0.063721  -0.985  0.32454    
motif_jiang_d_dist  0.023316   0.090361   0.258  0.79638    
motif_jiang_e_dist -0.054049   0.069420  -0.779  0.43623    
motif_aat_dist      0.015013   0.054535   0.275  0.78309    
motif_tta_dist     -0.157664   0.058861  -2.679  0.00739 ** 
motif_at_dist      -0.146242   0.116996  -1.250  0.21131    
motif_a_dist        0.108273   0.106389   1.018  0.30881    
motif_t_dist        0.081539   0.082587   0.987  0.32349    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 264.91  on 134  degrees of freedom
Residual deviance: 130.33  on 117  degrees of freedom
AIC: 486.35

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = -0.64469, p-value = 0.7404
alternative hypothesis: true alpha is greater than 0
sample estimates:
      alpha 
-0.06343747 



Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL,  : 
  object 'fit' not found
========================================
20000
==================== edge ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2588  -1.0335  -0.3335   0.5659   3.2551  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -0.051177   0.067436  -0.759 0.447912    
cen_dist            0.106821   0.081946   1.304 0.192386    
core_edge_dist      0.030967   0.066718   0.464 0.642542    
n_coding            0.529510   0.101634   5.210 1.89e-07 ***
n_tr                0.311768   0.094100   3.313 0.000923 ***
gc                 -0.017144   0.098791  -0.174 0.862227    
n_snps             -0.060575   0.069121  -0.876 0.380827    
n_indels            0.241784   0.098245   2.461 0.013854 *  
motif_jiang_a_dist -0.355062   0.217841  -1.630 0.103119    
motif_jiang_b_dist  0.323534   0.224278   1.443 0.149145    
motif_jiang_c_dist  0.023451   0.060886   0.385 0.700118    
motif_jiang_d_dist  0.039322   0.093140   0.422 0.672891    
motif_jiang_e_dist  0.000621   0.071815   0.009 0.993101    
motif_aat_dist      0.029710   0.055943   0.531 0.595363    
motif_tta_dist     -0.034661   0.057040  -0.608 0.543412    
motif_at_dist      -0.216697   0.115937  -1.869 0.061609 .  
motif_a_dist        0.056625   0.056933   0.995 0.319931    
motif_t_dist        0.123535   0.099903   1.237 0.216254    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 437.18  on 286  degrees of freedom
Residual deviance: 332.73  on 269  degrees of freedom
AIC: 791.75

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = 0.93582, p-value = 0.1747
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.08778128 


==================== main ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2588  -1.0335  -0.3335   0.5659   3.2551  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -0.051177   0.067436  -0.759 0.447912    
cen_dist            0.106821   0.081946   1.304 0.192386    
core_edge_dist      0.030967   0.066718   0.464 0.642542    
n_coding            0.529510   0.101634   5.210 1.89e-07 ***
n_tr                0.311768   0.094100   3.313 0.000923 ***
gc                 -0.017144   0.098791  -0.174 0.862227    
n_snps             -0.060575   0.069121  -0.876 0.380827    
n_indels            0.241784   0.098245   2.461 0.013854 *  
motif_jiang_a_dist -0.355062   0.217841  -1.630 0.103119    
motif_jiang_b_dist  0.323534   0.224278   1.443 0.149145    
motif_jiang_c_dist  0.023451   0.060886   0.385 0.700118    
motif_jiang_d_dist  0.039322   0.093140   0.422 0.672891    
motif_jiang_e_dist  0.000621   0.071815   0.009 0.993101    
motif_aat_dist      0.029710   0.055943   0.531 0.595363    
motif_tta_dist     -0.034661   0.057040  -0.608 0.543412    
motif_at_dist      -0.216697   0.115937  -1.869 0.061609 .  
motif_a_dist        0.056625   0.056933   0.995 0.319931    
motif_t_dist        0.123535   0.099903   1.237 0.216254    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 437.18  on 286  degrees of freedom
Residual deviance: 332.73  on 269  degrees of freedom
AIC: 791.75

Number of Fisher Scoring iterations: 5



	Overdispersion test

data:  rd
z = 0.93582, p-value = 0.1747
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.08778128 



Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL,  : 
  object 'fit' not found
========================================
10000
==================== edge ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8840  -1.0606  -0.5957   0.4939   2.6803  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -0.64860    0.06763  -9.590  < 2e-16 ***
cen_dist            0.14298    0.07776   1.839  0.06594 .  
core_edge_dist      0.05913    0.06543   0.904  0.36621    
n_coding            0.55581    0.10646   5.221 1.78e-07 ***
n_tr                0.22927    0.09217   2.487  0.01287 *  
gc                 -0.01348    0.10282  -0.131  0.89572    
n_snps             -0.23908    0.16182  -1.477  0.13956    
n_indels            0.25893    0.08934   2.898  0.00375 ** 
motif_jiang_a_dist -0.35376    0.19102  -1.852  0.06404 .  
motif_jiang_b_dist  0.36396    0.21212   1.716  0.08620 .  
motif_jiang_c_dist  0.03868    0.05989   0.646  0.51839    
motif_jiang_d_dist  0.02236    0.07963   0.281  0.77888    
motif_jiang_e_dist -0.01475    0.06967  -0.212  0.83236    
motif_aat_dist      0.02114    0.05382   0.393  0.69449    
motif_tta_dist     -0.07227    0.05850  -1.235  0.21670    
motif_at_dist       0.06344    0.10408   0.610  0.54218    
motif_a_dist       -0.15201    0.07505  -2.025  0.04283 *  
motif_t_dist       -0.15453    0.09819  -1.574  0.11554    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 661.89  on 536  degrees of freedom
Residual deviance: 559.00  on 519  degrees of freedom
AIC: 1112.1

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 1.0125, p-value = 0.1557
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.05246205 


==================== main ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8840  -1.0606  -0.5957   0.4939   2.6803  

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -0.64860    0.06763  -9.590  < 2e-16 ***
cen_dist            0.14298    0.07776   1.839  0.06594 .  
core_edge_dist      0.05913    0.06543   0.904  0.36621    
n_coding            0.55581    0.10646   5.221 1.78e-07 ***
n_tr                0.22927    0.09217   2.487  0.01287 *  
gc                 -0.01348    0.10282  -0.131  0.89572    
n_snps             -0.23908    0.16182  -1.477  0.13956    
n_indels            0.25893    0.08934   2.898  0.00375 ** 
motif_jiang_a_dist -0.35376    0.19102  -1.852  0.06404 .  
motif_jiang_b_dist  0.36396    0.21212   1.716  0.08620 .  
motif_jiang_c_dist  0.03868    0.05989   0.646  0.51839    
motif_jiang_d_dist  0.02236    0.07963   0.281  0.77888    
motif_jiang_e_dist -0.01475    0.06967  -0.212  0.83236    
motif_aat_dist      0.02114    0.05382   0.393  0.69449    
motif_tta_dist     -0.07227    0.05850  -1.235  0.21670    
motif_at_dist       0.06344    0.10408   0.610  0.54218    
motif_a_dist       -0.15201    0.07505  -2.025  0.04283 *  
motif_t_dist       -0.15453    0.09819  -1.574  0.11554    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 661.89  on 536  degrees of freedom
Residual deviance: 559.00  on 519  degrees of freedom
AIC: 1112.1

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 1.0125, p-value = 0.1557
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.05246205 



Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL,  : 
  object 'fit' not found
========================================
5000
==================== edge ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4067  -0.8806  -0.6930   0.4116   2.9489  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.231994   0.071369 -17.262  < 2e-16 ***
cen_dist            0.134162   0.072506   1.850 0.064262 .  
core_edge_dist      0.094563   0.060671   1.559 0.119084    
n_coding            0.606427   0.107460   5.643 1.67e-08 ***
n_tr                0.321544   0.086655   3.711 0.000207 ***
gc                 -0.055219   0.096080  -0.575 0.565484    
n_snps             -0.656653   0.311629  -2.107 0.035104 *  
n_indels            0.211908   0.079869   2.653 0.007974 ** 
motif_jiang_a_dist -0.346729   0.179600  -1.931 0.053537 .  
motif_jiang_b_dist  0.397119   0.205359   1.934 0.053140 .  
motif_jiang_c_dist  0.040292   0.057297   0.703 0.481926    
motif_jiang_d_dist -0.006961   0.068933  -0.101 0.919559    
motif_jiang_e_dist -0.028055   0.066595  -0.421 0.673551    
motif_aat_dist      0.003797   0.052484   0.072 0.942331    
motif_tta_dist     -0.049446   0.053984  -0.916 0.359700    
motif_at_dist      -0.115588   0.094365  -1.225 0.220612    
motif_a_dist        0.056419   0.060187   0.937 0.348553    
motif_t_dist        0.039715   0.083556   0.475 0.634566    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1031.95  on 1038  degrees of freedom
Residual deviance:  927.65  on 1021  degrees of freedom
AIC: 1584.9

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 2.028, p-value = 0.02128
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.08024102 


==================== main ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4067  -0.8806  -0.6930   0.4116   2.9489  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.231994   0.071369 -17.262  < 2e-16 ***
cen_dist            0.134162   0.072506   1.850 0.064262 .  
core_edge_dist      0.094563   0.060671   1.559 0.119084    
n_coding            0.606427   0.107460   5.643 1.67e-08 ***
n_tr                0.321544   0.086655   3.711 0.000207 ***
gc                 -0.055219   0.096080  -0.575 0.565484    
n_snps             -0.656653   0.311629  -2.107 0.035104 *  
n_indels            0.211908   0.079869   2.653 0.007974 ** 
motif_jiang_a_dist -0.346729   0.179600  -1.931 0.053537 .  
motif_jiang_b_dist  0.397119   0.205359   1.934 0.053140 .  
motif_jiang_c_dist  0.040292   0.057297   0.703 0.481926    
motif_jiang_d_dist -0.006961   0.068933  -0.101 0.919559    
motif_jiang_e_dist -0.028055   0.066595  -0.421 0.673551    
motif_aat_dist      0.003797   0.052484   0.072 0.942331    
motif_tta_dist     -0.049446   0.053984  -0.916 0.359700    
motif_at_dist      -0.115588   0.094365  -1.225 0.220612    
motif_a_dist        0.056419   0.060187   0.937 0.348553    
motif_t_dist        0.039715   0.083556   0.475 0.634566    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1031.95  on 1038  degrees of freedom
Residual deviance:  927.65  on 1021  degrees of freedom
AIC: 1584.9

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 2.028, p-value = 0.02128
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.08024102 



Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL,  : 
  object 'fit' not found
========================================
2000
==================== edge ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2692  -0.6680  -0.5613  -0.4079   3.1867  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.796278   0.055375 -32.438  < 2e-16 ***
cen_dist            0.052228   0.063870   0.818  0.41352    
core_edge_dist      0.063475   0.053707   1.182  0.23726    
n_coding            0.664982   0.091740   7.249 4.21e-13 ***
n_tr                0.234166   0.071847   3.259  0.00112 ** 
gc                 -0.222730   0.082417  -2.702  0.00688 ** 
n_snps             -0.408974   0.238575  -1.714  0.08648 .  
n_indels            0.153030   0.064115   2.387  0.01699 *  
motif_jiang_a_dist -0.303529   0.137462  -2.208  0.02724 *  
motif_jiang_b_dist  0.334510   0.159583   2.096  0.03607 *  
motif_jiang_c_dist  0.003071   0.051059   0.060  0.95205    
motif_jiang_d_dist  0.015832   0.063349   0.250  0.80265    
motif_jiang_e_dist -0.037303   0.058615  -0.636  0.52451    
motif_aat_dist      0.029926   0.046119   0.649  0.51641    
motif_tta_dist     -0.037403   0.047426  -0.789  0.43031    
motif_at_dist       0.023325   0.076422   0.305  0.76020    
motif_a_dist        0.006732   0.051112   0.132  0.89521    
motif_t_dist       -0.028018   0.073828  -0.380  0.70431    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1813.6  on 2534  degrees of freedom
Residual deviance: 1703.5  on 2517  degrees of freedom
AIC: 2617.5

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 2.1779, p-value = 0.01471
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.06106801 


==================== main ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2692  -0.6680  -0.5613  -0.4079   3.1867  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.796278   0.055375 -32.438  < 2e-16 ***
cen_dist            0.052228   0.063870   0.818  0.41352    
core_edge_dist      0.063475   0.053707   1.182  0.23726    
n_coding            0.664982   0.091740   7.249 4.21e-13 ***
n_tr                0.234166   0.071847   3.259  0.00112 ** 
gc                 -0.222730   0.082417  -2.702  0.00688 ** 
n_snps             -0.408974   0.238575  -1.714  0.08648 .  
n_indels            0.153030   0.064115   2.387  0.01699 *  
motif_jiang_a_dist -0.303529   0.137462  -2.208  0.02724 *  
motif_jiang_b_dist  0.334510   0.159583   2.096  0.03607 *  
motif_jiang_c_dist  0.003071   0.051059   0.060  0.95205    
motif_jiang_d_dist  0.015832   0.063349   0.250  0.80265    
motif_jiang_e_dist -0.037303   0.058615  -0.636  0.52451    
motif_aat_dist      0.029926   0.046119   0.649  0.51641    
motif_tta_dist     -0.037403   0.047426  -0.789  0.43031    
motif_at_dist       0.023325   0.076422   0.305  0.76020    
motif_a_dist        0.006732   0.051112   0.132  0.89521    
motif_t_dist       -0.028018   0.073828  -0.380  0.70431    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1813.6  on 2534  degrees of freedom
Residual deviance: 1703.5  on 2517  degrees of freedom
AIC: 2617.5

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 2.1779, p-value = 0.01471
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.06106801 



Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start = NULL,  : 
  object 'fit' not found
========================================
1000
==================== edge ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1465  -0.5720  -0.4942  -0.4009   3.4100  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -2.095272   0.045535 -46.014  < 2e-16 ***
cen_dist            0.015920   0.052806   0.301 0.763054    
core_edge_dist      0.056642   0.044453   1.274 0.202591    
n_coding            0.554574   0.072006   7.702 1.34e-14 ***
n_tr                0.188078   0.054984   3.421 0.000625 ***
gc                 -0.225541   0.065638  -3.436 0.000590 ***
n_snps             -0.443761   0.192540  -2.305 0.021179 *  
n_indels            0.101448   0.049891   2.033 0.042014 *  
motif_jiang_a_dist -0.330114   0.118874  -2.777 0.005486 ** 
motif_jiang_b_dist  0.394121   0.141812   2.779 0.005449 ** 
motif_jiang_c_dist -0.010516   0.042554  -0.247 0.804813    
motif_jiang_d_dist  0.008346   0.051664   0.162 0.871667    
motif_jiang_e_dist -0.063734   0.049319  -1.292 0.196262    
motif_aat_dist      0.030494   0.038353   0.795 0.426561    
motif_tta_dist     -0.063583   0.040301  -1.578 0.114632    
motif_at_dist       0.064657   0.060403   1.070 0.284427    
motif_a_dist        0.017642   0.040977   0.431 0.666798    
motif_t_dist       -0.041100   0.059805  -0.687 0.491941    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 3007.8  on 5049  degrees of freedom
Residual deviance: 2864.3  on 5032  degrees of freedom
AIC: 4222.9

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 2.055, p-value = 0.01994
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.04186068 


==================== main ====================

Call:
glm(formula = co_count ~ cen_dist + core_edge_dist + n_coding + 
    n_tr + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + 
    motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + 
    motif_aat_dist + motif_tta_dist + motif_at_dist + motif_a_dist + 
    motif_t_dist, family = poisson, data = df_edge)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1465  -0.5720  -0.4942  -0.4009   3.4100  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -2.095272   0.045535 -46.014  < 2e-16 ***
cen_dist            0.015920   0.052806   0.301 0.763054    
core_edge_dist      0.056642   0.044453   1.274 0.202591    
n_coding            0.554574   0.072006   7.702 1.34e-14 ***
n_tr                0.188078   0.054984   3.421 0.000625 ***
gc                 -0.225541   0.065638  -3.436 0.000590 ***
n_snps             -0.443761   0.192540  -2.305 0.021179 *  
n_indels            0.101448   0.049891   2.033 0.042014 *  
motif_jiang_a_dist -0.330114   0.118874  -2.777 0.005486 ** 
motif_jiang_b_dist  0.394121   0.141812   2.779 0.005449 ** 
motif_jiang_c_dist -0.010516   0.042554  -0.247 0.804813    
motif_jiang_d_dist  0.008346   0.051664   0.162 0.871667    
motif_jiang_e_dist -0.063734   0.049319  -1.292 0.196262    
motif_aat_dist      0.030494   0.038353   0.795 0.426561    
motif_tta_dist     -0.063583   0.040301  -1.578 0.114632    
motif_at_dist       0.064657   0.060403   1.070 0.284427    
motif_a_dist        0.017642   0.040977   0.431 0.666798    
motif_t_dist       -0.041100   0.059805  -0.687 0.491941    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 3007.8  on 5049  degrees of freedom
Residual deviance: 2864.3  on 5032  degrees of freedom
AIC: 4222.9

Number of Fisher Scoring iterations: 6



	Overdispersion test

data:  rd
z = 2.055, p-value = 0.01994
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.04186068 


/usr/local/lib/python3.5/dist-packages/rpy2/robjects/functions.py:106: UserWarning: 2: glm.fit: algorithm did not converge 

  res = super(Function, self).__call__(*new_args, **new_kwargs)

Extra plots


In [98]:
tbl_co


Out[98]:
0|sample 1|chrom 2|co_pos_mid 3|co_pos_min 4|co_pos_max 5|co_pos_range 6|cross 7|co_from_parent 8|co_to_parent
B1SD/PG0015-C/ERR019044 Pf3D7_01_v3 145052 144877 145227 350 hb3_dd2 hb3 dd2
GC03/PG0021-C/ERR015447 Pf3D7_01_v3 163584 163145 164024 879 hb3_dd2 dd2 hb3
XF12/PG0102-C/ERR029143 Pf3D7_01_v3 206769 205803 207736 1933 7g8_gb4 gb4 7g8
7C159/PG0040-Cx/ERR107475 Pf3D7_01_v3 206905 206074 207736 1662 hb3_dd2 hb3 dd2
CH3_61/PG0033-Cx/ERR175544 Pf3D7_01_v3 206905 206074 207736 1662 hb3_dd2 dd2 hb3

...


In [100]:
n_co = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    n_co[chrom] = np.zeros(len(fasta[chrom]), dtype='i4')
    for rec in tbl_co.eq('chrom', chrom).records():
        start = rec.co_pos_min - 1
        stop = rec.co_pos_max
        n_co[chrom][start:stop] = n_co[chrom][start:stop] + 1

n_co


2016-03-15 11:44:35.006675 :: Pf3D7_01_v3
2016-03-15 11:44:35.028216 :: Pf3D7_02_v3
2016-03-15 11:44:35.051196 :: Pf3D7_03_v3
2016-03-15 11:44:35.073735 :: Pf3D7_04_v3
2016-03-15 11:44:35.095973 :: Pf3D7_05_v3
2016-03-15 11:44:35.120908 :: Pf3D7_06_v3
2016-03-15 11:44:35.143683 :: Pf3D7_07_v3
2016-03-15 11:44:35.167533 :: Pf3D7_08_v3
2016-03-15 11:44:35.186177 :: Pf3D7_09_v3
2016-03-15 11:44:35.200679 :: Pf3D7_10_v3
2016-03-15 11:44:35.214952 :: Pf3D7_11_v3
2016-03-15 11:44:35.235677 :: Pf3D7_12_v3
2016-03-15 11:44:35.252268 :: Pf3D7_13_v3
2016-03-15 11:44:35.278449 :: Pf3D7_14_v3
Out[100]:
{'Pf3D7_01_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_02_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_03_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_04_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_05_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_06_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_07_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_08_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_09_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_10_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_11_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_12_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_13_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32),
 'Pf3D7_14_v3': array([0, 0, 0, ..., 0, 0, 0], dtype=int32)}

In [101]:
is_core = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_core[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_regions_1b.eq('region_chrom', chrom).eq('region_type', 'Core').records():
        start = rec.region_start - 1
        stop = rec.region_stop
        is_core[chrom][start:stop] = True


2016-03-15 11:44:52.459005 :: Pf3D7_01_v3
2016-03-15 11:44:52.461700 :: Pf3D7_02_v3
2016-03-15 11:44:52.463765 :: Pf3D7_03_v3
2016-03-15 11:44:52.465871 :: Pf3D7_04_v3
2016-03-15 11:44:52.467982 :: Pf3D7_05_v3
2016-03-15 11:44:52.470132 :: Pf3D7_06_v3
2016-03-15 11:44:52.472345 :: Pf3D7_07_v3
2016-03-15 11:44:52.474382 :: Pf3D7_08_v3
2016-03-15 11:44:52.476216 :: Pf3D7_09_v3
2016-03-15 11:44:52.478277 :: Pf3D7_10_v3
2016-03-15 11:44:52.480251 :: Pf3D7_11_v3
2016-03-15 11:44:52.481930 :: Pf3D7_12_v3
2016-03-15 11:44:52.483976 :: Pf3D7_13_v3
2016-03-15 11:44:52.486012 :: Pf3D7_14_v3

In [102]:
is_exon = dict()
for chrom in sorted(fasta.keys()):
    log(chrom)
    is_exon[chrom] = np.zeros(len(fasta[chrom]), dtype='b1')
    for rec in tbl_exons.eq('feature_chrom', chrom).records():
        start = rec.feature_start - 1
        stop = rec.feature_stop
        is_exon[chrom][start:stop] = True


2016-03-15 11:45:09.174195 :: Pf3D7_01_v3
2016-03-15 11:45:09.254450 :: Pf3D7_02_v3
2016-03-15 11:45:09.328169 :: Pf3D7_03_v3
2016-03-15 11:45:09.406301 :: Pf3D7_04_v3
2016-03-15 11:45:09.487373 :: Pf3D7_05_v3
2016-03-15 11:45:09.573311 :: Pf3D7_06_v3
2016-03-15 11:45:09.644841 :: Pf3D7_07_v3
2016-03-15 11:45:09.712922 :: Pf3D7_08_v3
2016-03-15 11:45:09.783390 :: Pf3D7_09_v3
2016-03-15 11:45:09.855530 :: Pf3D7_10_v3
2016-03-15 11:45:09.929839 :: Pf3D7_11_v3
2016-03-15 11:45:10.006652 :: Pf3D7_12_v3
2016-03-15 11:45:10.087393 :: Pf3D7_13_v3
2016-03-15 11:45:10.170985 :: Pf3D7_14_v3

In [104]:
coding_tr = list()
coding_nontr = list()
noncoding_tr = list()
noncoding_nontr = list()
for chrom in sorted(fasta.keys()):
    # coding tr
    c = is_core[chrom] & is_exon[chrom] & is_tr[chrom]
    x = n_co[chrom][c]
    coding_tr.append(x)
    # coding non-tr
    c = is_core[chrom] & is_exon[chrom] & ~is_tr[chrom]
    x = n_co[chrom][c]
    coding_nontr.append(x)
    # non-coding tr
    c = is_core[chrom] & ~is_exon[chrom] & is_tr[chrom]
    x = n_co[chrom][c]
    noncoding_tr.append(x)
    # non-coding non-tr
    c = is_core[chrom] & ~is_exon[chrom] & ~is_tr[chrom]
    x = n_co[chrom][c]
    noncoding_nontr.append(x)
    
coding_tr = np.concatenate(coding_tr)
coding_nontr = np.concatenate(coding_nontr)
noncoding_tr = np.concatenate(noncoding_tr)
noncoding_nontr = np.concatenate(noncoding_nontr)

In [107]:
for x in coding_tr, coding_nontr, noncoding_tr, noncoding_nontr:
    print(x.mean(), x.std())


0.166780142546 0.477043084051
0.168413758092 0.486159724882
0.106075666555 0.393877725344
0.100056326805 0.383056799382

In [106]:
fig, ax = plt.subplots()
ax.boxplot([coding_tr, coding_nontr, noncoding_tr, noncoding_nontr]);


Legacy


In [49]:
analyse_dispersion(1000)


2016-03-14 22:16:39.324392 :: =================================
2016-03-14 22:16:39.324850 :: 1000
window size: 1000bp
0|co_count 1|count 2|frequency
0 17573 0.844774540909528
1 2584 0.1242188251129699
2 438 0.021055667724257283
3 192 0.009229881742140178
4 12 0.0005768676088837611
5 3 0.00014421690222094028
augmented windows
0|chrom 1|start 2|stop 3|co_count 4|cen_dist 5|n_coding 6|gc 7|n_snps 8|n_indels 9|motif_jiang_a_dist 10|motif_jiang_b_dist 11|motif_jiang_c_dist 12|motif_jiang_d_dist 13|motif_jiang_e_dist 14|motif_aat_dist 15|motif_tta_dist
Pf3D7_01_v3 92901 93900 0 365720.5 0 0.212 1 0 526858.0 523834.5 149487.5 546659.5 1967.5 1015.5 1004.5
Pf3D7_01_v3 93901 94900 0 364720.5 0 0.124 3 6 525858.0 522834.5 148487.5 545659.5 967.5 38.5 2004.5
Pf3D7_01_v3 94901 95900 0 363720.5 0 0.284 25 3 524858.0 521834.5 147487.5 544659.5 32.5 961.5 3004.5
Pf3D7_01_v3 95901 96900 0 362720.5 0 0.108 0 2 523858.0 520834.5 146487.5 543659.5 1032.5 662.5 2148.5
Pf3D7_01_v3 96901 97900 0 361720.5 0 0.094 0 2 522858.0 519834.5 145487.5 542659.5 1515.5 337.5 1148.5

...

2016-03-14 22:20:40.077024 :: ************************
2016-03-14 22:20:40.077556 :: model centromeric
2016-03-14 22:20:40.078163 :: 0.162679425837
2016-03-14 22:20:40.078484 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:20:40.218620 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1991  -0.6112  -0.4950  -0.3734   3.5918  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -2.380e+00  3.338e-01  -7.129 1.01e-12 ***
cen_dist            1.486e-05  1.866e-06   7.963 1.68e-15 ***
n_coding            1.078e-03  2.234e-04   4.824 1.41e-06 ***
gc                 -5.733e+00  1.824e+00  -3.143  0.00167 ** 
n_snps             -2.491e-02  1.821e-02  -1.368  0.17133    
n_indels            1.879e-02  2.045e-02   0.919  0.35821    
motif_jiang_a_dist -6.709e-09  5.404e-08  -0.124  0.90119    
motif_jiang_b_dist -1.275e-07  6.699e-08  -1.904  0.05695 .  
motif_jiang_c_dist -4.495e-07  1.143e-06  -0.393  0.69420    
motif_jiang_d_dist  3.560e-07  1.285e-07   2.770  0.00560 ** 
motif_jiang_e_dist  2.849e-06  4.481e-06   0.636  0.52493    
motif_aat_dist      1.676e-05  1.195e-05   1.403  0.16062    
motif_tta_dist     -1.286e-05  1.036e-05  -1.241  0.21450    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1642.3  on 2507  degrees of freedom
Residual deviance: 1528.0  on 2495  degrees of freedom
AIC: 2291.2

Number of Fisher Scoring iterations: 6


2016-03-14 22:20:40.249699 :: 
	Overdispersion test

data:  rd
z = 2.6454, p-value = 0.00408
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.09036992 


2016-03-14 22:20:40.260034 :: ************************
2016-03-14 22:20:40.262031 :: model telomeric
2016-03-14 22:20:40.264285 :: 0.203442320291
2016-03-14 22:20:40.267022 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:20:41.559157 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.0964  -0.6795  -0.5929  -0.4849   5.3062  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -8.202e-01  1.053e-01  -7.791 6.67e-15 ***
cen_dist           -5.240e-07  4.994e-08 -10.492  < 2e-16 ***
n_coding            5.369e-04  7.244e-05   7.411 1.25e-13 ***
gc                 -2.833e+00  5.814e-01  -4.873 1.10e-06 ***
n_snps             -3.031e-02  7.260e-03  -4.174 2.99e-05 ***
n_indels           -4.745e-02  7.896e-03  -6.009 1.86e-09 ***
motif_jiang_a_dist  9.884e-08  1.784e-08   5.542 2.99e-08 ***
motif_jiang_b_dist -1.021e-07  2.048e-08  -4.983 6.27e-07 ***
motif_jiang_c_dist -3.737e-06  4.920e-07  -7.596 3.05e-14 ***
motif_jiang_d_dist  3.689e-08  2.942e-08   1.254   0.2099    
motif_jiang_e_dist -9.672e-07  1.609e-06  -0.601   0.5477    
motif_aat_dist      7.101e-06  4.182e-06   1.698   0.0895 .  
motif_tta_dist     -3.147e-06  3.927e-06  -0.801   0.4229    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 13276  on 16790  degrees of freedom
Residual deviance: 12821  on 16778  degrees of freedom
AIC: 18548

Number of Fisher Scoring iterations: 6


2016-03-14 22:20:41.596904 :: 
	Overdispersion test

data:  rd
z = 12.993, p-value < 2.2e-16
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.2973965 



In [50]:
analyse_dispersion(2000)


2016-03-14 22:20:41.624153 :: =================================
2016-03-14 22:20:41.627089 :: 2000
window size: 2000bp
0|co_count 1|count 2|frequency
0 8351 0.8022864828513786
1 1618 0.15544240561052936
2 311 0.02987799020078778
3 117 0.011240272840810837
4 10 0.0009607070804111826
5 2 0.00019214141608223654
augmented windows
0|chrom 1|start 2|stop 3|co_count 4|cen_dist 5|n_coding 6|gc 7|n_snps 8|n_indels 9|motif_jiang_a_dist 10|motif_jiang_b_dist 11|motif_jiang_c_dist 12|motif_jiang_d_dist 13|motif_jiang_e_dist 14|motif_aat_dist 15|motif_tta_dist
Pf3D7_01_v3 92901 94900 0 365220.5 0 0.168 4 6 526358.0 523334.5 148987.5 546159.5 1467.5 538.5 1504.5
Pf3D7_01_v3 94901 96900 0 363220.5 0 0.196 25 5 524358.0 521334.5 146987.5 544159.5 532.5 1162.5 2648.5
Pf3D7_01_v3 96901 98900 0 361220.5 82 0.0995 2 4 522358.0 519334.5 144987.5 542159.5 1015.5 202.5 648.5
Pf3D7_01_v3 98901 100900 0 359220.5 1794 0.301 4 1 520358.0 517334.5 142987.5 540159.5 984.5 1797.5 1351.5
Pf3D7_01_v3 100901 102900 0 357220.5 1382 0.2305 2 2 518358.0 515334.5 140987.5 538159.5 41.5 3797.5 3351.5

...

2016-03-14 22:22:47.271834 :: ************************
2016-03-14 22:22:47.272334 :: model centromeric
2016-03-14 22:22:47.273014 :: 0.224620303757
2016-03-14 22:22:47.273436 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:22:47.364960 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3165  -0.6980  -0.5525  -0.3939   3.3306  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.814e+00  4.769e-01  -3.804 0.000143 ***
cen_dist            1.509e-05  2.277e-06   6.626 3.46e-11 ***
n_coding            6.938e-04  1.617e-04   4.291 1.78e-05 ***
gc                 -8.534e+00  2.755e+00  -3.098 0.001947 ** 
n_snps             -8.158e-03  1.027e-02  -0.794 0.427152    
n_indels            1.294e-02  1.666e-02   0.777 0.437375    
motif_jiang_a_dist -8.802e-08  7.096e-08  -1.240 0.214804    
motif_jiang_b_dist -9.281e-08  8.584e-08  -1.081 0.279584    
motif_jiang_c_dist  1.363e-07  1.321e-06   0.103 0.917802    
motif_jiang_d_dist  4.839e-07  1.580e-07   3.062 0.002201 ** 
motif_jiang_e_dist  3.353e-06  5.391e-06   0.622 0.533937    
motif_aat_dist      1.099e-05  1.465e-05   0.750 0.453146    
motif_tta_dist     -7.795e-06  1.215e-05  -0.641 0.521303    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 983.12  on 1250  degrees of freedom
Residual deviance: 898.04  on 1238  degrees of freedom
AIC: 1415.5

Number of Fisher Scoring iterations: 6


2016-03-14 22:22:47.378637 :: 
	Overdispersion test

data:  rd
z = 2.3414, p-value = 0.009606
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1137319 


2016-03-14 22:22:47.381772 :: ************************
2016-03-14 22:22:47.382206 :: model telomeric
2016-03-14 22:22:47.383085 :: 0.258981679753
2016-03-14 22:22:47.383563 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:22:47.891085 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.1316  -0.7574  -0.6662  -0.5307   4.8899  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -4.463e-01  1.624e-01  -2.747 0.006010 ** 
cen_dist           -4.644e-07  6.157e-08  -7.542 4.61e-14 ***
n_coding            3.317e-04  5.351e-05   6.199 5.69e-10 ***
gc                 -3.998e+00  9.072e-01  -4.407 1.05e-05 ***
n_snps             -1.673e-02  5.380e-03  -3.110 0.001871 ** 
n_indels           -2.634e-02  6.544e-03  -4.025 5.70e-05 ***
motif_jiang_a_dist  8.195e-08  2.286e-08   3.585 0.000337 ***
motif_jiang_b_dist -7.648e-08  2.592e-08  -2.951 0.003172 ** 
motif_jiang_c_dist -2.567e-06  5.978e-07  -4.295 1.74e-05 ***
motif_jiang_d_dist  1.457e-08  3.670e-08   0.397 0.691261    
motif_jiang_e_dist -2.843e-06  2.028e-06  -1.401 0.161116    
motif_aat_dist      7.459e-06  5.218e-06   1.429 0.152900    
motif_tta_dist     -3.514e-06  4.937e-06  -0.712 0.476594    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 7434.2  on 8405  degrees of freedom
Residual deviance: 7201.1  on 8393  degrees of freedom
AIC: 10848

Number of Fisher Scoring iterations: 6


2016-03-14 22:22:47.912065 :: 
	Overdispersion test

data:  rd
z = 9.3373, p-value < 2.2e-16
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.2655056 



In [51]:
analyse_dispersion(5000)


2016-03-14 22:22:47.925200 :: =================================
2016-03-14 22:22:47.926155 :: 5000
window size: 5000bp
0|co_count 1|count 2|frequency
0 2869 0.687185628742515
1 935 0.22395209580838324
2 279 0.06682634730538922
3 77 0.01844311377245509
4 13 0.0031137724550898203
5 2 0.00047904191616766467
augmented windows
0|chrom 1|start 2|stop 3|co_count 4|cen_dist 5|n_coding 6|gc 7|n_snps 8|n_indels 9|motif_jiang_a_dist 10|motif_jiang_b_dist 11|motif_jiang_c_dist 12|motif_jiang_d_dist 13|motif_jiang_e_dist 14|motif_aat_dist 15|motif_tta_dist
Pf3D7_01_v3 92901 97900 0 363720.5 0 0.1644 29 13 524858.0 521834.5 147487.5 544659.5 32.5 961.5 3004.5
Pf3D7_01_v3 97901 102900 0 358720.5 3258 0.2336 8 5 519858.0 516834.5 142487.5 539659.5 1484.5 2297.5 1851.5
Pf3D7_01_v3 102901 107900 0 353720.5 354 0.1518 8 20 514858.0 511834.5 137487.5 534659.5 3158.5 3098.5 2453.5
Pf3D7_01_v3 107901 112900 0 348720.5 1805 0.2098 7 18 509858.0 506834.5 132487.5 529659.5 8158.5 325.5 501.5
Pf3D7_01_v3 112901 117900 0 343720.5 2899 0.2128 7 14 504858.0 501834.5 127487.5 524659.5 13158.5 5325.5 243.5

...

2016-03-14 22:23:42.795560 :: ************************
2016-03-14 22:23:42.796046 :: model centromeric
2016-03-14 22:23:42.796721 :: 0.417647058824
2016-03-14 22:23:42.797116 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:23:42.926625 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6177  -0.9010  -0.6698   0.4132   2.6366  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.174e+00  7.104e-01  -1.652  0.09845 .  
cen_dist            1.556e-05  2.667e-06   5.833 5.43e-09 ***
n_coding            3.179e-04  9.762e-05   3.257  0.00113 ** 
gc                 -9.728e+00  4.451e+00  -2.186  0.02885 *  
n_snps             -7.997e-03  7.547e-03  -1.060  0.28929    
n_indels            1.280e-02  1.005e-02   1.274  0.20259    
motif_jiang_a_dist -1.416e-07  8.932e-08  -1.585  0.11297    
motif_jiang_b_dist -2.381e-09  1.029e-07  -0.023  0.98154    
motif_jiang_c_dist  1.314e-06  1.415e-06   0.928  0.35332    
motif_jiang_d_dist  3.098e-07  1.804e-07   1.717  0.08600 .  
motif_jiang_e_dist  2.228e-06  6.139e-06   0.363  0.71666    
motif_aat_dist      1.496e-05  1.645e-05   0.909  0.36322    
motif_tta_dist     -2.180e-06  1.365e-05  -0.160  0.87309    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 546.68  on 509  degrees of freedom
Residual deviance: 480.89  on 497  degrees of freedom
AIC: 848.64

Number of Fisher Scoring iterations: 6


2016-03-14 22:23:42.938654 :: 
	Overdispersion test

data:  rd
z = 2.0735, p-value = 0.01906
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1221318 


2016-03-14 22:23:42.941389 :: ************************
2016-03-14 22:23:42.941751 :: model telomeric
2016-03-14 22:23:42.942525 :: 0.430142687277
2016-03-14 22:23:42.942938 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:23:43.182630 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4198  -0.9508  -0.8392   0.6326   4.1470  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -3.473e-02  2.796e-01  -0.124  0.90115    
cen_dist           -3.570e-07  7.394e-08  -4.828 1.38e-06 ***
n_coding            1.838e-04  3.348e-05   5.489 4.03e-08 ***
gc                 -4.915e+00  1.547e+00  -3.177  0.00149 ** 
n_snps             -7.580e-03  3.193e-03  -2.374  0.01761 *  
n_indels           -3.765e-03  4.487e-03  -0.839  0.40138    
motif_jiang_a_dist  5.966e-08  2.886e-08   2.067  0.03874 *  
motif_jiang_b_dist -4.805e-08  3.228e-08  -1.488  0.13663    
motif_jiang_c_dist -1.560e-06  7.124e-07  -2.190  0.02854 *  
motif_jiang_d_dist -3.190e-09  4.470e-08  -0.071  0.94310    
motif_jiang_e_dist -4.643e-06  2.516e-06  -1.846  0.06493 .  
motif_aat_dist      5.891e-06  6.415e-06   0.918  0.35845    
motif_tta_dist     -8.805e-06  6.210e-06  -1.418  0.15623    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 3639.6  on 3363  degrees of freedom
Residual deviance: 3534.7  on 3351  degrees of freedom
AIC: 5887.1

Number of Fisher Scoring iterations: 6


2016-03-14 22:23:43.196385 :: 
	Overdispersion test

data:  rd
z = 6.3307, p-value = 1.22e-10
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.2159193 



In [52]:
analyse_dispersion(10000)


2016-03-14 22:23:43.208154 :: =================================
2016-03-14 22:23:43.209143 :: 10000
window size: 10000bp
0|co_count 1|count 2|frequency
0 1122 0.5350500715307582
1 599 0.28564616118264186
2 260 0.12398664759179781
3 90 0.04291845493562232
4 24 0.011444921316165951
5 1 0.0004768717215069146
6 1 0.0004768717215069146
augmented windows
0|chrom 1|start 2|stop 3|co_count 4|cen_dist 5|n_coding 6|gc 7|n_snps 8|n_indels 9|motif_jiang_a_dist 10|motif_jiang_b_dist 11|motif_jiang_c_dist 12|motif_jiang_d_dist 13|motif_jiang_e_dist 14|motif_aat_dist 15|motif_tta_dist
Pf3D7_01_v3 92901 102900 0 361220.5 3258 0.199 37 18 522358.0 519334.5 144987.5 542159.5 1015.5 202.5 648.5
Pf3D7_01_v3 102901 112900 0 351220.5 2159 0.1808 15 38 512358.0 509334.5 134987.5 532159.5 5658.5 598.5 22.5
Pf3D7_01_v3 112901 122900 0 341220.5 4792 0.1984 12 27 502358.0 499334.5 124987.5 522159.5 15658.5 7825.5 72.5
Pf3D7_01_v3 122901 132900 0 331220.5 5004 0.1735 17 39 492358.0 489334.5 114987.5 512159.5 25658.5 721.5 190.5
Pf3D7_01_v3 132901 142900 0 321220.5 6522 0.192 22 41 482358.0 479334.5 104987.5 502159.5 26483.5 4061.5 382.5

...

2016-03-14 22:24:13.229841 :: ************************
2016-03-14 22:24:13.230290 :: model centromeric
2016-03-14 22:24:13.230792 :: 0.730769230769
2016-03-14 22:24:13.231066 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:24:13.262359 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1677  -1.0599  -0.7272   0.5481   2.6080  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         7.956e-02  9.702e-01   0.082  0.93465    
cen_dist            1.485e-05  2.930e-06   5.068 4.02e-07 ***
n_coding            1.784e-04  6.646e-05   2.685  0.00726 ** 
gc                 -1.363e+01  6.253e+00  -2.180  0.02924 *  
n_snps             -2.418e-03  2.513e-03  -0.962  0.33593    
n_indels            8.514e-03  7.081e-03   1.202  0.22920    
motif_jiang_a_dist -1.809e-07  9.929e-08  -1.822  0.06840 .  
motif_jiang_b_dist -2.146e-08  1.150e-07  -0.187  0.85193    
motif_jiang_c_dist  1.030e-06  1.525e-06   0.676  0.49934    
motif_jiang_d_dist  3.337e-07  1.991e-07   1.676  0.09379 .  
motif_jiang_e_dist -7.127e-07  6.609e-06  -0.108  0.91412    
motif_aat_dist      1.766e-05  1.747e-05   1.011  0.31215    
motif_tta_dist     -7.940e-06  1.478e-05  -0.537  0.59109    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 346.96  on 259  degrees of freedom
Residual deviance: 289.70  on 247  degrees of freedom
AIC: 588.22

Number of Fisher Scoring iterations: 6


2016-03-14 22:24:13.274465 :: 
	Overdispersion test

data:  rd
z = 1.6378, p-value = 0.05073
alternative hypothesis: true alpha is greater than 0
sample estimates:
   alpha 
0.121414 


2016-03-14 22:24:13.277480 :: ************************
2016-03-14 22:24:13.277841 :: model telomeric
2016-03-14 22:24:13.278603 :: 0.708185053381
2016-03-14 22:24:13.278984 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:24:13.513567 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.5461  -1.1810  -0.9160   0.4622   4.1441  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.459e-01  4.093e-01  -0.357 0.721450    
cen_dist           -2.887e-07  8.079e-08  -3.573 0.000353 ***
n_coding            1.280e-04  2.205e-05   5.804 6.49e-09 ***
gc                 -4.425e+00  2.227e+00  -1.987 0.046912 *  
n_snps             -2.631e-03  1.699e-03  -1.549 0.121453    
n_indels            5.052e-03  3.026e-03   1.669 0.095053 .  
motif_jiang_a_dist  5.165e-08  3.219e-08   1.604 0.108641    
motif_jiang_b_dist -2.726e-08  3.575e-08  -0.763 0.445725    
motif_jiang_c_dist -7.872e-07  7.617e-07  -1.033 0.301398    
motif_jiang_d_dist -8.811e-09  4.890e-08  -0.180 0.857010    
motif_jiang_e_dist -5.728e-06  2.789e-06  -2.054 0.039981 *  
motif_aat_dist      5.927e-06  7.173e-06   0.826 0.408641    
motif_tta_dist     -7.003e-06  6.856e-06  -1.021 0.307069    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 2098.6  on 1685  degrees of freedom
Residual deviance: 2023.8  on 1673  degrees of freedom
AIC: 3840.5

Number of Fisher Scoring iterations: 5


2016-03-14 22:24:13.526689 :: 
	Overdispersion test

data:  rd
z = 4.0199, p-value = 2.911e-05
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1724322 



In [53]:
analyse_dispersion(20000)


2016-03-14 22:24:13.538662 :: =================================
2016-03-14 22:24:13.539758 :: 20000
window size: 20000bp
0|co_count 1|count 2|frequency
0 376 0.35471698113207545
1 301 0.2839622641509434
2 221 0.20849056603773586
3 102 0.09622641509433963
4 39 0.03679245283018868
5 13 0.012264150943396227
6 6 0.005660377358490566
7 2 0.0018867924528301887
augmented windows
0|chrom 1|start 2|stop 3|co_count 4|cen_dist 5|n_coding 6|gc 7|n_snps 8|n_indels 9|motif_jiang_a_dist 10|motif_jiang_b_dist 11|motif_jiang_c_dist 12|motif_jiang_d_dist 13|motif_jiang_e_dist 14|motif_aat_dist 15|motif_tta_dist
Pf3D7_01_v3 92901 112900 0 356220.5 5417 0.1899 52 56 517358.0 514334.5 139987.5 537159.5 658.5 4797.5 4351.5
Pf3D7_01_v3 112901 132900 0 336220.5 9796 0.18595 29 66 497358.0 494334.5 119987.5 517159.5 20658.5 5721.5 4362.5
Pf3D7_01_v3 132901 152900 1 316220.5 14142 0.20315 45 56 477358.0 474334.5 99987.5 497159.5 21483.5 938.5 1174.5
Pf3D7_01_v3 152901 172900 1 296220.5 12865 0.1888 62 91 457358.0 454334.5 79987.5 477159.5 1483.5 1443.5 8425.5
Pf3D7_01_v3 172901 192900 0 276220.5 8592 0.17105 128 116 437358.0 434334.5 59987.5 457159.5 8368.5 2046.5 6873.5

...

2016-03-14 22:24:30.988493 :: ************************
2016-03-14 22:24:30.988943 :: model centromeric
2016-03-14 22:24:30.989569 :: 1.31617647059
2016-03-14 22:24:30.989966 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:24:31.031628 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2946  -1.2042  -0.2185   0.6029   3.0070  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         2.368e-01  1.415e+00   0.167   0.8671    
cen_dist            1.408e-05  3.288e-06   4.284 1.84e-05 ***
n_coding            7.762e-05  4.507e-05   1.722   0.0851 .  
gc                 -1.065e+01  8.953e+00  -1.190   0.2340    
n_snps              4.401e-05  5.706e-04   0.077   0.9385    
n_indels            4.030e-03  4.665e-03   0.864   0.3876    
motif_jiang_a_dist -1.847e-07  1.057e-07  -1.747   0.0806 .  
motif_jiang_b_dist -2.896e-08  1.201e-07  -0.241   0.8095    
motif_jiang_c_dist  1.064e-06  1.535e-06   0.693   0.4883    
motif_jiang_d_dist  4.317e-07  2.075e-07   2.081   0.0374 *  
motif_jiang_e_dist -2.419e-07  7.013e-06  -0.034   0.9725    
motif_aat_dist      5.579e-06  1.992e-05   0.280   0.7794    
motif_tta_dist     -6.428e-06  1.510e-05  -0.426   0.6704    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 223.71  on 135  degrees of freedom
Residual deviance: 174.39  on 123  degrees of freedom
AIC: 414.7

Number of Fisher Scoring iterations: 5


2016-03-14 22:24:31.044448 :: 
	Overdispersion test

data:  rd
z = 1.1712, p-value = 0.1208
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1493528 


2016-03-14 22:24:31.047229 :: ************************
2016-03-14 22:24:31.047642 :: model telomeric
2016-03-14 22:24:31.048512 :: 1.22641509434
2016-03-14 22:24:31.048917 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:24:31.164280 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9599  -1.3903  -0.2114   0.5894   3.5481  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         3.003e-01  5.826e-01   0.516  0.60618    
cen_dist           -2.580e-07  8.691e-08  -2.968  0.00299 ** 
n_coding            9.108e-05  1.470e-05   6.196 5.80e-10 ***
gc                 -8.531e+00  3.148e+00  -2.710  0.00672 ** 
n_snps             -1.247e-03  9.831e-04  -1.268  0.20470    
n_indels            9.899e-03  2.019e-03   4.902 9.48e-07 ***
motif_jiang_a_dist  3.623e-08  3.504e-08   1.034  0.30121    
motif_jiang_b_dist -1.379e-08  3.887e-08  -0.355  0.72282    
motif_jiang_c_dist -6.183e-07  8.175e-07  -0.756  0.44945    
motif_jiang_d_dist -8.267e-09  5.241e-08  -0.158  0.87466    
motif_jiang_e_dist -5.348e-06  3.006e-06  -1.779  0.07522 .  
motif_aat_dist      1.379e-05  7.244e-06   1.903  0.05701 .  
motif_tta_dist      2.318e-06  7.340e-06   0.316  0.75218    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 1192.0  on 847  degrees of freedom
Residual deviance: 1112.1  on 835  degrees of freedom
AIC: 2486.5

Number of Fisher Scoring iterations: 5


2016-03-14 22:24:31.200524 :: 
	Overdispersion test

data:  rd
z = 3.0893, p-value = 0.001003
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1747332 



In [54]:
analyse_dispersion(50000)


2016-03-14 22:24:31.225260 :: =================================
2016-03-14 22:24:31.227166 :: 50000
window size: 50000bp
0|co_count 1|count 2|frequency
2 88 0.2018348623853211
3 73 0.16743119266055045
1 66 0.15137614678899083
4 66 0.15137614678899083
0 52 0.11926605504587157
5 51 0.11697247706422019
6 18 0.04128440366972477
8 10 0.022935779816513763
7 9 0.020642201834862386
9 2 0.0045871559633027525
10 1 0.0022935779816513763
augmented windows
0|chrom 1|start 2|stop 3|co_count 4|cen_dist 5|n_coding 6|gc 7|n_snps 8|n_indels 9|motif_jiang_a_dist 10|motif_jiang_b_dist 11|motif_jiang_c_dist 12|motif_jiang_d_dist 13|motif_jiang_e_dist 14|motif_aat_dist 15|motif_tta_dist
Pf3D7_01_v3 92901 142900 0 341220.5 21735 0.18874 103 163 502358.0 499334.5 124987.5 522159.5 15658.5 7825.5 72.5
Pf3D7_01_v3 142901 192900 2 291220.5 29077 0.1868 213 222 452358.0 449334.5 74987.5 472159.5 3516.5 679.5 5627.5
Pf3D7_01_v3 192901 242900 4 241220.5 30768 0.197 70 123 402358.0 399334.5 24987.5 422159.5 1031.5 1126.5 7902.5
Pf3D7_01_v3 242901 292900 1 191220.5 29821 0.19262 68 157 352358.0 349334.5 25012.5 372159.5 6721.5 6704.5 115.5
Pf3D7_01_v3 292901 342900 6 141220.5 28248 0.18416 64 201 302358.0 299334.5 20267.5 322159.5 4501.5 97.5 9210.5

...

2016-03-14 22:24:42.521326 :: ************************
2016-03-14 22:24:42.521749 :: model centromeric
2016-03-14 22:24:42.522194 :: 2.98305084746
2016-03-14 22:24:42.522492 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:24:42.551344 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.6214  -0.7477  -0.1112   0.4943   1.7598  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)   
(Intercept)        -4.522e-01  2.077e+00  -0.218  0.82762   
cen_dist            3.460e-06  4.656e-06   0.743  0.45733   
n_coding            5.122e-05  2.631e-05   1.947  0.05154 . 
gc                 -8.482e+00  1.192e+01  -0.711  0.47682   
n_snps             -4.603e-04  3.781e-04  -1.217  0.22351   
n_indels            9.964e-03  3.353e-03   2.972  0.00296 **
motif_jiang_a_dist -7.355e-08  9.711e-08  -0.757  0.44882   
motif_jiang_b_dist -1.418e-07  1.218e-07  -1.164  0.24436   
motif_jiang_c_dist -7.077e-08  2.194e-06  -0.032  0.97427   
motif_jiang_d_dist  3.644e-07  2.350e-07   1.551  0.12101   
motif_jiang_e_dist -8.954e-06  7.361e-06  -1.216  0.22384   
motif_aat_dist     -7.594e-07  2.684e-05  -0.028  0.97743   
motif_tta_dist     -2.693e-05  1.529e-05  -1.762  0.07811 . 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 107.664  on 58  degrees of freedom
Residual deviance:  59.807  on 46  degrees of freedom
AIC: 236.26

Number of Fisher Scoring iterations: 5


2016-03-14 22:24:42.563903 :: 
	Overdispersion test

data:  rd
z = -0.81829, p-value = 0.7934
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
-0.1134324 


2016-03-14 22:24:42.566835 :: ************************
2016-03-14 22:24:42.567299 :: model telomeric
2016-03-14 22:24:42.568128 :: 2.86705202312
2016-03-14 22:24:42.568565 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:24:42.623098 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_tel)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7740  -0.8902  -0.1339   0.5839   2.4972  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -6.349e-01  8.396e-01  -0.756    0.450    
cen_dist           -1.014e-07  8.900e-08  -1.139    0.255    
n_coding            5.140e-05  7.889e-06   6.516 7.21e-11 ***
gc                 -5.443e+00  4.324e+00  -1.259    0.208    
n_snps             -8.807e-05  5.111e-04  -0.172    0.863    
n_indels            8.176e-03  1.043e-03   7.839 4.55e-15 ***
motif_jiang_a_dist  3.228e-08  3.680e-08   0.877    0.380    
motif_jiang_b_dist  1.340e-08  4.030e-08   0.333    0.739    
motif_jiang_c_dist -9.563e-07  8.254e-07  -1.159    0.247    
motif_jiang_d_dist -5.104e-08  5.381e-08  -0.949    0.343    
motif_jiang_e_dist -2.828e-06  3.023e-06  -0.935    0.350    
motif_aat_dist      9.343e-06  8.644e-06   1.081    0.280    
motif_tta_dist      1.266e-08  7.768e-06   0.002    0.999    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 523.03  on 345  degrees of freedom
Residual deviance: 397.57  on 333  degrees of freedom
AIC: 1322.9

Number of Fisher Scoring iterations: 5


2016-03-14 22:24:42.635692 :: 
	Overdispersion test

data:  rd
z = 0.79114, p-value = 0.2144
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
0.05666315 



In [55]:
analyse_dispersion(100000)


2016-03-14 22:24:42.647925 :: =================================
2016-03-14 22:24:42.648807 :: 100000
window size: 100000bp
0|co_count 1|count 2|frequency
4 38 0.16593886462882096
6 30 0.13100436681222707
5 28 0.1222707423580786
7 25 0.1091703056768559
8 20 0.08733624454148471
3 19 0.08296943231441048
0 13 0.056768558951965066
2 13 0.056768558951965066
9 12 0.05240174672489083
10 11 0.048034934497816595
1 8 0.034934497816593885
13 5 0.021834061135371178
11 4 0.017467248908296942
12 2 0.008733624454148471
14 1 0.004366812227074236
augmented windows
0|chrom 1|start 2|stop 3|co_count 4|cen_dist 5|n_coding 6|gc 7|n_snps 8|n_indels 9|motif_jiang_a_dist 10|motif_jiang_b_dist 11|motif_jiang_c_dist 12|motif_jiang_d_dist 13|motif_jiang_e_dist 14|motif_aat_dist 15|motif_tta_dist
Pf3D7_01_v3 92901 192900 2 316220.5 50812 0.18777 316 385 477358.0 474334.5 99987.5 497159.5 21483.5 938.5 1174.5
Pf3D7_01_v3 192901 292900 5 216220.5 60589 0.19481 138 280 377358.0 374334.5 12.5 397159.5 1877.5 1863.5 3602.5
Pf3D7_01_v3 292901 392900 8 116220.5 52005 0.1847 128 421 277358.0 274334.5 1636.5 297159.5 644.5 2.5 12024.5
Pf3D7_01_v3 392901 492900 6 16220.5 49583 0.18866 134 363 177358.0 174334.5 101636.5 197159.5 34139.5 1591.5 17129.5
Pf3D7_01_v3 460312 560311 3 51190.5 42915 0.20299 1403 486 109947.0 106923.5 35410.5 129748.5 11850.5 12980.5 8152.5

...

2016-03-14 22:24:50.827223 :: ************************
2016-03-14 22:24:50.827682 :: model centromeric
2016-03-14 22:24:50.828344 :: 6.2
2016-03-14 22:24:50.828748 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:24:50.849252 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_cen)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.0129  -0.8436  -0.0854   0.7576   1.4049  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)   
(Intercept)         2.261e+00  2.912e+00   0.776  0.43745   
cen_dist            6.810e-06  6.002e-06   1.135  0.25655   
n_coding            7.483e-06  2.304e-05   0.325  0.74532   
gc                 -1.089e+01  1.567e+01  -0.695  0.48712   
n_snps             -6.747e-05  2.922e-04  -0.231  0.81740   
n_indels            1.820e-03  2.298e-03   0.792  0.42838   
motif_jiang_a_dist -1.773e-07  1.019e-07  -1.740  0.08190 . 
motif_jiang_b_dist -7.212e-08  1.155e-07  -0.624  0.53231   
motif_jiang_c_dist -1.066e-06  1.632e-06  -0.653  0.51360   
motif_jiang_d_dist  5.896e-07  2.119e-07   2.783  0.00539 **
motif_jiang_e_dist  1.502e-06  6.098e-06   0.246  0.80539   
motif_aat_dist      1.108e-05  1.976e-05   0.561  0.57491   
motif_tta_dist      1.150e-05  2.253e-05   0.510  0.60971   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 53.378  on 34  degrees of freedom
Residual deviance: 29.293  on 22  degrees of freedom
AIC: 178.19

Number of Fisher Scoring iterations: 5


2016-03-14 22:24:50.861605 :: 
	Overdispersion test

data:  rd
z = -1.6951, p-value = 0.955
alternative hypothesis: true alpha is greater than 0
sample estimates:
     alpha 
-0.2229645 


2016-03-14 22:24:50.864442 :: ************************
2016-03-14 22:24:50.864816 :: model telomeric
2016-03-14 22:24:50.865575 :: 5.37640449438
2016-03-14 22:24:50.865972 :: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2016-03-14 22:24:50.902949 :: 
Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df_tel)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.96544  -0.81238   0.00341   0.58311   2.55421  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         1.642e+00  1.043e+00   1.574  0.11548    
cen_dist           -1.512e-07  9.397e-08  -1.609  0.10769    
n_coding            2.791e-05  4.856e-06   5.747 9.08e-09 ***
gc                 -1.452e+01  5.264e+00  -2.758  0.00582 ** 
n_snps              3.834e-04  3.025e-04   1.267  0.20502    
n_indels            3.897e-03  6.500e-04   5.994 2.04e-09 ***
motif_jiang_a_dist  3.556e-08  3.784e-08   0.940  0.34728    
motif_jiang_b_dist  5.770e-09  4.165e-08   0.139  0.88982    
motif_jiang_c_dist -9.397e-07  8.935e-07  -1.052  0.29293    
motif_jiang_d_dist -5.221e-08  5.509e-08  -0.948  0.34332    
motif_jiang_e_dist -1.723e-07  3.466e-06  -0.050  0.96036    
motif_aat_dist     -2.793e-06  7.457e-06  -0.375  0.70802    
motif_tta_dist      9.883e-07  6.472e-06   0.153  0.87864    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 325.98  on 177  degrees of freedom
Residual deviance: 185.56  on 165  degrees of freedom
AIC: 797.43

Number of Fisher Scoring iterations: 5


2016-03-14 22:24:50.915929 :: 
	Overdispersion test

data:  rd
z = -0.12053, p-value = 0.548
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
-0.011571 


Sandbox


In [17]:
df = tabulate_predictors(10000)
df.head()


window size: 10000bp
0|co_count 1|count 2|frequency
0 1122 0.5350500715307582
1 599 0.28564616118264186
2 260 0.12398664759179781
3 90 0.04291845493562232
4 24 0.011444921316165951
5 1 0.0004768717215069146
6 1 0.0004768717215069146
augmented windows
0|chrom 1|start 2|stop 3|co_count 4|cen_dist 5|n_coding 6|gc 7|n_snps 8|n_indels 9|motif_jiang_a_dist 10|motif_jiang_b_dist 11|motif_jiang_c_dist 12|motif_jiang_d_dist 13|motif_jiang_e_dist 14|motif_aat_dist 15|motif_tta_dist
Pf3D7_01_v3 92901 102900 0 361220.5 3258 0.199 37 18 522358.0 519334.5 144987.5 542159.5 1015.5 202.5 648.5
Pf3D7_01_v3 102901 112900 0 351220.5 2159 0.1808 15 38 512358.0 509334.5 134987.5 532159.5 5658.5 598.5 22.5
Pf3D7_01_v3 112901 122900 0 341220.5 4792 0.1984 12 27 502358.0 499334.5 124987.5 522159.5 15658.5 7825.5 72.5
Pf3D7_01_v3 122901 132900 0 331220.5 5004 0.1735 17 39 492358.0 489334.5 114987.5 512159.5 25658.5 721.5 154.5
Pf3D7_01_v3 132901 142900 0 321220.5 6522 0.192 22 41 482358.0 479334.5 104987.5 502159.5 26483.5 4061.5 385.5

...

Out[17]:
chrom start stop co_count cen_dist n_coding gc n_snps n_indels motif_jiang_a_dist motif_jiang_b_dist motif_jiang_c_dist motif_jiang_d_dist motif_jiang_e_dist motif_aat_dist motif_tta_dist
0 Pf3D7_01_v3 92901 102900 0 361220.5 3258 0.1990 37 18 522358 519334.5 144987.5 542159.5 1015.5 202.5 648.5
1 Pf3D7_01_v3 102901 112900 0 351220.5 2159 0.1808 15 38 512358 509334.5 134987.5 532159.5 5658.5 598.5 22.5
2 Pf3D7_01_v3 112901 122900 0 341220.5 4792 0.1984 12 27 502358 499334.5 124987.5 522159.5 15658.5 7825.5 72.5
3 Pf3D7_01_v3 122901 132900 0 331220.5 5004 0.1735 17 39 492358 489334.5 114987.5 512159.5 25658.5 721.5 154.5
4 Pf3D7_01_v3 132901 142900 0 321220.5 6522 0.1920 22 41 482358 479334.5 104987.5 502159.5 26483.5 4061.5 385.5

In [18]:
rd = %R -i df -o rd rd <- glm(co_count ~ cen_dist + n_coding + gc + n_snps + n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + motif_tta_dist, data = df, family = poisson)
rd


Out[18]:
<ListVector - Python:0x7fef54ab5548 / R:0x3781450>
[Float..., Float..., Float..., ..., StrVe..., RNULL..., ListV...]
  coefficients: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7fef54ad4b88 / R:0xadcd068>
[-0.445172, -0.000000, 0.000133, ..., -0.000005, 0.000006, -0.000012]
  residuals: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7fef54a51c08 / R:0xd061b40>
[-1.000000, -1.000000, -1.000000, ..., -1.000000, -1.000000, -1.000000]
  fitted.values: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7fef54a516c8 / R:0xd00e740>
[0.434281, 0.503580, 0.608735, ..., 0.323299, 0.546473, 0.296638]
  ...
  coefficients: <class 'rpy2.robjects.vectors.StrVector'>
  <StrVector - Python:0x7fef6222fd88 / R:0xc62ff18>
['glm.fit']
  residuals: <class 'rpy2.rinterface.RNULLType'>
  rpy2.rinterface.NULL
<ListVector - Python:0x7fef54ab5548 / R:0x3781450>
[Float..., Float..., Float..., ..., StrVe..., RNULL..., ListV...]

In [19]:
print(rd)


Call:  glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df)

Coefficients:
       (Intercept)            cen_dist            n_coding                  gc  
        -4.452e-01          -2.014e-07           1.327e-04          -3.763e+00  
            n_snps            n_indels  motif_jiang_a_dist  motif_jiang_b_dist  
        -2.843e-03           8.608e-03           2.069e-08          -1.517e-08  
motif_jiang_c_dist  motif_jiang_d_dist  motif_jiang_e_dist      motif_aat_dist  
        -3.033e-07           7.041e-09          -5.190e-06           5.576e-06  
    motif_tta_dist  
        -1.221e-05  

Degrees of Freedom: 1945 Total (i.e. Null);  1933 Residual
Null Deviance:	    2446 
Residual Deviance: 2363 	AIC: 4452


In [20]:
summ = %R summary(rd)
summ


Out[20]:
<ListVector - Python:0x7fefcdf30708 / R:0x9e034f0>
[Vector, Formula, ListV..., ..., IntVe..., Matrix, Matrix]
  call: <class 'rpy2.robjects.vectors.Vector'>
  <Vector - Python:0x7fefcdf30048 / R:0xbd1e378>
[RNULLType, Vector, Vector, Vector]
  terms: <class 'rpy2.robjects.Formula'>
  <Formula - Python:0x7fefcdaca708 / R:0xb0385f8>
<ListVector - Python:0x7fefcdf30708 / R:0x9e034f0>
[Vector, Formula, ListV..., ..., IntVe..., Matrix, Matrix]
  ...
  call: <class 'rpy2.robjects.vectors.IntVector'>
  <IntVector - Python:0x7fef54f68308 / R:0xc9f9f68>
[      13,     1933,       13]
  terms: <class 'rpy2.robjects.vectors.Matrix'>
  <Matrix - Python:0x7fef60047d48 / R:0xc6e5920>
[0.132926, -0.000000, 0.000002, ..., 0.000000, 0.000000, 0.000000]
  family: <class 'rpy2.robjects.vectors.Matrix'>
  <Matrix - Python:0x7fef6004e4c8 / R:0xc90e8b0>
[0.132926, -0.000000, 0.000002, ..., 0.000000, 0.000000, 0.000000]

In [21]:
print(summ)


Call:
glm(formula = co_count ~ cen_dist + n_coding + gc + n_snps + 
    n_indels + motif_jiang_a_dist + motif_jiang_b_dist + motif_jiang_c_dist + 
    motif_jiang_d_dist + motif_jiang_e_dist + motif_aat_dist + 
    motif_tta_dist, family = poisson, data = df)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6206  -1.1827  -0.9273   0.4467   4.0896  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -4.452e-01  3.646e-01  -1.221  0.22208    
cen_dist           -2.014e-07  7.135e-08  -2.823  0.00476 ** 
n_coding            1.327e-04  2.076e-05   6.392 1.64e-10 ***
gc                 -3.763e+00  2.057e+00  -1.830  0.06728 .  
n_snps             -2.843e-03  1.389e-03  -2.046  0.04076 *  
n_indels            8.608e-03  2.674e-03   3.219  0.00128 ** 
motif_jiang_a_dist  2.069e-08  3.008e-08   0.688  0.49150    
motif_jiang_b_dist -1.517e-08  3.348e-08  -0.453  0.65041    
motif_jiang_c_dist -3.033e-07  6.728e-07  -0.451  0.65209    
motif_jiang_d_dist  7.041e-09  4.706e-08   0.150  0.88107    
motif_jiang_e_dist -5.190e-06  2.531e-06  -2.051  0.04030 *  
motif_aat_dist      5.576e-06  7.478e-06   0.746  0.45587    
motif_tta_dist     -1.221e-05  7.247e-06  -1.685  0.09201 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 2445.8  on 1945  degrees of freedom
Residual deviance: 2363.2  on 1933  degrees of freedom
AIC: 4452.5

Number of Fisher Scoring iterations: 5



In [39]:
'%.2e' % .2


Out[39]:
'2.00e-01'

In [24]:
print(summ.names)


 [1] "call"           "terms"          "family"         "deviance"      
 [5] "aic"            "contrasts"      "df.residual"    "null.deviance" 
 [9] "df.null"        "iter"           "deviance.resid" "coefficients"  
[13] "aliased"        "dispersion"     "df"             "cov.unscaled"  
[17] "cov.scaled"    


In [32]:
summ[11]


Out[32]:
<Matrix - Python:0x7fef56903088 / R:0xa21a460>
[-0.445172, -0.000000, 0.000133, ..., 0.040299, 0.455869, 0.092008]

In [33]:
coefficients = np.array(summ[11])
coefficients


Out[33]:
array([[ -4.45171660e-01,   3.64590289e-01,  -1.22101897e+00,
          2.22078837e-01],
       [ -2.01410741e-07,   7.13517168e-08,  -2.82278759e+00,
          4.76081019e-03],
       [  1.32670618e-04,   2.07570309e-05,   6.39159899e+00,
          1.64159948e-10],
       [ -3.76305116e+00,   2.05656092e+00,  -1.82977860e+00,
          6.72830521e-02],
       [ -2.84265750e-03,   1.38941791e-03,  -2.04593411e+00,
          4.07628536e-02],
       [  8.60779846e-03,   2.67376152e-03,   3.21935909e+00,
          1.28477485e-03],
       [  2.06931439e-08,   3.00802901e-08,   6.87930330e-01,
          4.91496655e-01],
       [ -1.51742544e-08,   3.34833169e-08,  -4.53188506e-01,
          6.50413008e-01],
       [ -3.03336612e-07,   6.72787409e-07,  -4.50865471e-01,
          6.52086511e-01],
       [  7.04123530e-09,   4.70625075e-08,   1.49614538e-01,
          8.81068738e-01],
       [ -5.19044121e-06,   2.53109058e-06,  -2.05067383e+00,
          4.02987212e-02],
       [  5.57608480e-06,   7.47798350e-06,   7.45666903e-01,
          4.55868657e-01],
       [ -1.22109429e-05,   7.24728862e-06,  -1.68489811e+00,
          9.20082317e-02]])

In [54]:
from prettypandas import PrettyPandas

In [58]:
col_index = pandas.MultiIndex.from_product([[10000], ['Estimate', 'Std. Error', 'z value', 'Pr(>|z|)']], 
                                           names=['window size', 'coefficient'])
row_index = ['intercept', 'cen_dist', 'n_coding', 'gc', 'n_snps', 'n_indels', 
             'motif_jiang_a_dist', 'motif_jiang_b_dist', 'motif_jiang_c_dist', 'motif_jiang_d_dist', 'motif_jiang_e_dist',
             'motif_aat_dist', 'motif_tta_dist']
df = pandas.DataFrame(coefficients, columns=col_index, index=row_index)
df


Out[58]:
window size 10000
coefficient Estimate Std. Error z value Pr(>|z|)
intercept -4.451717e-01 3.645903e-01 -1.221019 2.220788e-01
cen_dist -2.014107e-07 7.135172e-08 -2.822788 4.760810e-03
n_coding 1.326706e-04 2.075703e-05 6.391599 1.641599e-10
gc -3.763051e+00 2.056561e+00 -1.829779 6.728305e-02
n_snps -2.842657e-03 1.389418e-03 -2.045934 4.076285e-02
n_indels 8.607798e-03 2.673762e-03 3.219359 1.284775e-03
motif_jiang_a_dist 2.069314e-08 3.008029e-08 0.687930 4.914967e-01
motif_jiang_b_dist -1.517425e-08 3.348332e-08 -0.453189 6.504130e-01
motif_jiang_c_dist -3.033366e-07 6.727874e-07 -0.450865 6.520865e-01
motif_jiang_d_dist 7.041235e-09 4.706251e-08 0.149615 8.810687e-01
motif_jiang_e_dist -5.190441e-06 2.531091e-06 -2.050674 4.029872e-02
motif_aat_dist 5.576085e-06 7.477984e-06 0.745667 4.558687e-01
motif_tta_dist -1.221094e-05 7.247289e-06 -1.684898 9.200823e-02

In [60]:
pandas.__version__


Out[60]:
'0.17.1'

In [71]:
dfs = df.style
dfs.apply(lambda v: '%.1f' % v, axis=0)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/IPython/core/formatters.py in __call__(self, obj)
    339             method = _safe_get_formatter_method(obj, self.print_method)
    340             if method is not None:
--> 341                 return method()
    342             return None
    343         else:

/usr/local/lib/python3.5/dist-packages/pandas/core/style.py in _repr_html_(self)
    158         Hooks into Jupyter notebook rich display system.
    159         '''
--> 160         return self.render()
    161 
    162     def _translate(self):

/usr/local/lib/python3.5/dist-packages/pandas/core/style.py in render(self)
    258         the rendered HTML in the notebook.
    259         """
--> 260         self._compute()
    261         d = self._translate()
    262         # filter out empty styles, every cell will have a class

/usr/local/lib/python3.5/dist-packages/pandas/core/style.py in _compute(self)
    325         r = self
    326         for func, args, kwargs in self._todo:
--> 327             r = func(self)(*args, **kwargs)
    328         return r
    329 

/usr/local/lib/python3.5/dist-packages/pandas/core/style.py in _apply(self, func, axis, subset, **kwargs)
    332         subset = _non_reducing_slice(subset)
    333         if axis is not None:
--> 334             result = self.data.loc[subset].apply(func, axis=axis, **kwargs)
    335         else:
    336             # like tee

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   3970                     if reduce is None:
   3971                         reduce = True
-> 3972                     return self._apply_standard(f, axis, reduce=reduce)
   3973             else:
   3974                 return self._apply_broadcast(f, axis)

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
   4062             try:
   4063                 for i, v in enumerate(series_gen):
-> 4064                     results[i] = func(v)
   4065                     keys.append(v.name)
   4066             except Exception as e:

<ipython-input-71-39c0d6405b9d> in <lambda>(v)
      1 dfs = df.style
----> 2 dfs.apply(lambda v: '%.1f' % v, axis=0)

/usr/local/lib/python3.5/dist-packages/pandas/core/series.py in wrapper(self)
     79             return converter(self.iloc[0])
     80         raise TypeError(
---> 81             "cannot convert the series to {0}".format(str(converter)))
     82     return wrapper
     83 

TypeError: ("cannot convert the series to <class 'float'>", 'occurred at index (10000, Estimate)')
Out[71]:
<pandas.core.style.Styler at 0x7fef54ee0320>

In [56]:
PrettyPandas(df, precision=2)


Out[56]:
10000 10000 10000 10000
Estimate Std. Error z value Pr(>|z|)
0 -0.45 0.36 -1.22 0.22
1 -0.0 0.0 -2.82 0.0
2 0.0 0.0 6.39 0.0
3 -3.76 2.06 -1.83 0.07
4 -0.0 0.0 -2.05 0.04
5 0.01 0.0 3.22 0.0
6 0.0 0.0 0.69 0.49
7 -0.0 0.0 -0.45 0.65
8 -0.0 0.0 -0.45 0.65
9 0.0 0.0 0.15 0.88
10 -0.0 0.0 -2.05 0.04
11 0.0 0.0 0.75 0.46
12 -0.0 0.0 -1.68 0.09

In [40]:
t = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")
t


Out[40]:
<ListVector - Python:0x7fef61fa1588 / R:0xbdd4aa8>
[Float..., Float..., Float..., ..., StrVe..., StrVe..., StrVe...]
  statistic: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7fef56893c48 / R:0xc0e74a8>
[4.871770]
  p.value: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7fef568931c8 / R:0xca11b48>
[0.000001]
  estimate: <class 'rpy2.robjects.vectors.FloatVector'>
  <FloatVector - Python:0x7fef56893908 / R:0xc0e7418>
[0.192078]
  ...
  statistic: <class 'rpy2.robjects.vectors.StrVector'>
  <StrVector - Python:0x7fef54a90e88 / R:0xc4bc038>
['greater']
  p.value: <class 'rpy2.robjects.vectors.StrVector'>
  <StrVector - Python:0x7fef54a90288 / R:0xc4bc8b8>
['Overdispersion test']
  estimate: <class 'rpy2.robjects.vectors.StrVector'>
  <StrVector - Python:0x7fef568992c8 / R:0xca11e48>
['rd']

In [48]:
t[1][0]


Out[48]:
5.530132768132743e-07

In [41]:
print(t)


	Overdispersion test

data:  rd
z = 4.8718, p-value = 5.53e-07
alternative hypothesis: true alpha is greater than 0
sample estimates:
    alpha 
0.1920779 



In [ ]:
summ = %R summary(rd)
        log(summ)
        t = %R -i rd -o t t <- dispersiontest(rd, trafo=1, alternative="greater")

Sandbox


In [ ]:


In [36]:
x = scipy.stats.poisson.rvs(np.mean(co_count), size=co_count.size)
x


Out[36]:
array([0, 0, 1, ..., 0, 1, 1])

In [37]:
plt.hist(x, histtype='step', bins=np.arange(10), lw=2)
plt.hist(co_count, histtype='step', bins=np.arange(10), lw=2);



In [33]:
np.mean(co_count), np.std(co_count)


Out[33]:
(0.42778443113772457, 0.73214916283762699)

In [35]:
np.mean(x), np.std(x)


Out[35]:
(0.86347305389221551, 0.92110057064902273)

Sandbox


In [14]:
%%R
library(AER)
data(RecreationDemand)
head(RecreationDemand)


  trips quality ski income userfee  costC   costS   costH
1     0       0 yes      4      no  67.59  68.620  76.800
2     0       0  no      9      no  68.86  70.936  84.780
3     0       0 yes      5      no  58.12  59.465  72.110
4     0       0  no      2      no  15.79  13.750  23.680
5     0       0 yes      3      no  24.02  34.033  34.547
6     0       0 yes      5      no 129.46 137.377 137.850

In [15]:
%%R
rd <- glm(RecreationDemand$trips ~ 1, family = poisson)
summary(rd)


Call:
glm(formula = RecreationDemand$trips ~ 1, family = poisson)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1186  -2.1186  -2.1186  -0.1662  21.7766  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)   0.8084     0.0260   31.09   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 4849.7  on 658  degrees of freedom
Residual deviance: 4849.7  on 658  degrees of freedom
AIC: 5604.8

Number of Fisher Scoring iterations: 6


In [17]:
%%R
dispersiontest(rd, trafo=1, alternative="greater")


	Overdispersion test

data:  rd
z = 3.0234, p-value = 0.00125
alternative hypothesis: true alpha is greater than 0
sample estimates:
   alpha 
16.61573 


In [ ]:
combined = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, 
                         variant_filter='FILTER_PASS',
                         call_filter=combined_conf_calls, 
                         sample_exclusions=excessive_recomb_samples)

In [ ]:
variants = combined['3d7_hb3']['variants']

In [ ]:
pos = variants['POS'][variants['CHROM'] == b'Pf3D7_01_v3']
pos.shape

In [ ]:
tbl_intervals = (
    etl
    .fromcolumns((pos[:-1], pos[1:], np.diff(pos)), header=('start', 'stop', 'length'))
    .addfield('chrom', b'Pf3D7_01_v3', index=0)
    .intervalleftjoin(tbl_recom.prefixheader('recom_'), 
                      lkey='chrom', lstart='start', lstop='stop',
                      rkey='recom_chrom', rstart='recom_start', rstop='recom_stop',
                      include_stop=True)
    .aggregate(key=('chrom', 'start', 'stop', 'length'),
               aggregation=list, value='recom_type')
    .rename('value', 'event_types')
    .convert('event_types', lambda v: [] if v == [None] else v)
    .addfield('n_events', lambda row: len(row.event_types))
)
tbl_intervals.gt('n_events', 0).display(50)

In [ ]: