Notebook Analyzing Duplication in Motif Definitions


In [207]:
import venusar
import motif
import thresholds
import motifs
import activity
import tf_expression
import gene_expression

In [208]:
# reload to pick up code changes
import imp  
imp.reload(motif)


Out[208]:
<module 'motif' from '/data640g1/data/documents/docs/projects/payton_venusar/VENUSAR_DEV/venusar/motif.py'>
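
Side note: the imp module used above has been deprecated since Python 3.4 (the environment here is Python 3.5, per the tracebacks below); the same reload can be done with importlib:

    import importlib
    importlib.reload(motif)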

Import Motif Definitions


In [209]:
motif_f_base='../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.txt'
pc = 0.1
th = 0
bp = [0.25, 0.25, 0.25, 0.25]
motif_set_base = motif.get_motifs(motif_f_base, pc, th, bp)
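
The arguments are presumably a pseudocount (pc), a score threshold (th) and background base probabilities (bp). As a reference point only, the block below sketches the usual pseudocount / log-odds PWM construction on a made-up count matrix; it illustrates the standard technique and is not the actual motif.get_motifs implementation.

    import math

    def log_odds_pwm(counts, pc=0.1, bp=(0.25, 0.25, 0.25, 0.25)):
        # counts: 4 x L matrix of per-base counts, rows in A, C, G, T order
        length = len(counts[0])
        pwm = []
        for base in range(4):
            row = []
            for pos in range(length):
                col_total = sum(counts[b][pos] for b in range(4))
                freq = (counts[base][pos] + pc) / (col_total + 4 * pc)  # smooth with pseudocount
                row.append(math.log2(freq / bp[base]))                  # log-odds vs background
            pwm.append(row)
        return pwm

    # toy 4 x 3 count matrix (A, C, G, T rows), for illustration only
    toy_counts = [[8, 0, 2], [1, 9, 2], [1, 0, 4], [0, 1, 2]]
    toy_pwm = log_odds_pwm(toy_counts)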

In [210]:
motif_set_base.length()
motif_dict = motif_set_base.motif_count(False)
len(motif_dict)


Out[210]:
601

In [211]:
# count valid motifs only; run twice to check whether any are invalid
motif_dict = motif_set_base.motif_count(True)
len(motif_dict)


Out[211]:
601
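
Assuming motif_count(...) tallies motifs per transcription-factor name (601 distinct names here, versus 641 motifs in total as the next cells show), the same tally can be reproduced standalone with a Counter. This assumes each motif object exposes a .name attribute, which is an inference from the rest of the notebook, not a documented fact.

    from collections import Counter

    # tally motifs per TF name (assumes each motif has a .name attribute)
    name_counts = Counter(m.name for m in motif_set_base.motifs)
    duplicated = {name: n for name, n in name_counts.items() if n > 1}
    print(len(name_counts), "distinct TF names;", len(duplicated), "of them have more than one motif")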

In [212]:
range(len(motif_set_base.motifs))


Out[212]:
range(0, 641)

In [213]:
range(0, (len(motif_set_base.motifs) - 1))


Out[213]:
range(0, 640)
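
Note that range(len(motif_set_base.motifs)) already covers indices 0 through 640, so the "len - 1" variant silently drops the last motif. Iterating the list directly avoids the index bookkeeping, e.g.:

    # direct iteration over the motif objects; the slice only keeps the output short
    for m in motif_set_base.motifs[:3]:
        print(m)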

Convert to DataFrame for Analysis of Duplication


In [214]:
import pandas

# -- building data frame from dictionary
#df = pandas.DataFrame(motif_dict) # errors about 'you must pass an index'
# ref: http://stackoverflow.com/questions/17839973/construct-pandas-dataframe-from-values-in-variables#17840195
dfCounts = pandas.DataFrame(motif_dict, index=[0])
dfCounts = pandas.melt(dfCounts) # rotate
dfCounts.rename(columns = {'variable': 'TF', 'value': 'tfCount'}, inplace=True)

# -- handling types
# ref: http://stackoverflow.com/questions/15891038/pandas-change-data-type-of-columns
dfCounts.dtypes

#pandas.to_numeric(s, errors='ignore')


Out[214]:
TF         object
tfCount     int64
dtype: object
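
An equivalent, more direct construction builds the counts frame straight from the dictionary items, with no melt/rename step (sketch only; same content as dfCounts above):

    # build (TF, tfCount) rows directly from the dictionary items
    dfCounts_alt = pandas.DataFrame(sorted(motif_dict.items()), columns=['TF', 'tfCount'])
    print(dfCounts_alt.dtypes)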

In [215]:
dfCounts.size
dfCounts[dfCounts.tfCount > 1]


Out[215]:
TF tfCount
6 ARID3A 2
24 BATF 2
84 EGR1 2
88 EHF 2
105 ESR1 2
106 ESR2 2
144 FOXJ3 2
158 GATA1 2
163 GATA6 2
286 MAFG 2
287 MAFK 2
323 NANOG 2
328 NFATC1 2
335 NFIA 2
359 NR1I2 2
360 NR1I3 2
365 NR2F1 2
366 NR2F2 2
368 NR3C1 2
388 PAX2 2
391 PAX5 2
399 PGR 2
407 PLAG1 2
424 PPARA 2
426 PPARG 2
436 RARA 2
438 RARG 2
487 SP1 2
500 STAT1 2
508 TAL1 2
536 TFDP1 2
539 TGIF1 2
543 THRA 2
544 THRB 2
545 TLX1 2
547 TP63 2
548 TP73 2
556 VDR 2
567 ZBTB33 2
568 ZBTB4 2

In [216]:
# add additional information to the dataframe
msb_names,msb_lengths = motif_set_base.motif_lengths(False)
# define data frame by columns
dfc = pandas.DataFrame(
    {
        "TF" : msb_names,
        "TFLength" : msb_lengths
    })
print(dfc)

if False:
    # define data frame by row (this is wrong: names and lengths end up interleaved, and rename(..., inplace=True) returns None, so dfr would be None)
    dfr = pandas.melt(pandas.DataFrame(
        [ msb_names,
        msb_lengths])).rename(columns = {'variable': 'TF', 'value': 'tfCount'}, inplace=True)
    dfr


          TF  TFLength
0        AHR         9
1       AIRE        18
2       ALX1        12
3       ALX3        11
4       ALX4        12
5         AR        17
6     TFAP2A        15
7     TFAP2B        10
8     TFAP2C        14
9     TFAP2D        14
10    ARID3A        22
11    ARID3A         7
12    ARID5B        13
13     ARNT2        12
14      ARNT         8
15       ARX        13
16     ASCL2        14
17      ATF1        11
18      ATF2         9
19      ATF3         9
20      ATF6        12
21      ATF7        11
22     ATOH1        11
23     BACH1        13
24    BARHL1        13
25    BARHL2        18
26     BARX1        17
27     BARX2        10
28     BATF3        12
29      BATF        18
..       ...       ...
611     ZEB1        11
612   HIVEP1        12
613   HIVEP2        14
614    ZFHX3        11
615      ZFX        19
616     ZIC1         9
617     ZIC2         9
618     ZIC3         9
619     ZIC4        15
620  ZKSCAN1        22
621  ZKSCAN3        11
622   ZNF143        22
623   ZNF148        15
624   ZNF219        12
625   ZNF232        19
626   ZNF282        14
627   ZNF333         9
628   ZNF350        24
629   ZNF384        12
630   ZNF410        13
631   ZNF423        15
632   ZNF524        18
633   ZNF589        16
634   ZNF639        20
635   ZNF652        11
636   ZNF713        19
637   ZNF740        14
638   ZNF784        11
639  ZSCAN16        20
640   ZSCAN4        12

[641 rows x 2 columns]
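
For completeness, a row-wise construction that does work simply zips the two parallel lists into (name, length) records; the if False: attempt above interleaves names and lengths and, because rename(..., inplace=True) returns None, would leave dfr as None anyway.

    # working row-wise construction: zip the parallel name/length lists into records
    dfr = pandas.DataFrame(list(zip(msb_names, msb_lengths)), columns=['TF', 'TFLength'])
    print(dfr.head())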

In [217]:
dfCounts[dfCounts.tfCount > 1].TF


Out[217]:
6      ARID3A
24       BATF
84       EGR1
88        EHF
105      ESR1
106      ESR2
144     FOXJ3
158     GATA1
163     GATA6
286      MAFG
287      MAFK
323     NANOG
328    NFATC1
335      NFIA
359     NR1I2
360     NR1I3
365     NR2F1
366     NR2F2
368     NR3C1
388      PAX2
391      PAX5
399       PGR
407     PLAG1
424     PPARA
426     PPARG
436      RARA
438      RARG
487       SP1
500     STAT1
508      TAL1
536     TFDP1
539     TGIF1
543      THRA
544      THRB
545      TLX1
547      TP63
548      TP73
556       VDR
567    ZBTB33
568     ZBTB4
Name: TF, dtype: object

In [218]:
dfc


Out[218]:
TF TFLength
0 AHR 9
1 AIRE 18
2 ALX1 12
3 ALX3 11
4 ALX4 12
5 AR 17
6 TFAP2A 15
7 TFAP2B 10
8 TFAP2C 14
9 TFAP2D 14
10 ARID3A 22
11 ARID3A 7
12 ARID5B 13
13 ARNT2 12
14 ARNT 8
15 ARX 13
16 ASCL2 14
17 ATF1 11
18 ATF2 9
19 ATF3 9
20 ATF6 12
21 ATF7 11
22 ATOH1 11
23 BACH1 13
24 BARHL1 13
25 BARHL2 18
26 BARX1 17
27 BARX2 10
28 BATF3 12
29 BATF 18
... ... ...
611 ZEB1 11
612 HIVEP1 12
613 HIVEP2 14
614 ZFHX3 11
615 ZFX 19
616 ZIC1 9
617 ZIC2 9
618 ZIC3 9
619 ZIC4 15
620 ZKSCAN1 22
621 ZKSCAN3 11
622 ZNF143 22
623 ZNF148 15
624 ZNF219 12
625 ZNF232 19
626 ZNF282 14
627 ZNF333 9
628 ZNF350 24
629 ZNF384 12
630 ZNF410 13
631 ZNF423 15
632 ZNF524 18
633 ZNF589 16
634 ZNF639 20
635 ZNF652 11
636 ZNF713 19
637 ZNF740 14
638 ZNF784 11
639 ZSCAN16 20
640 ZSCAN4 12

641 rows × 2 columns


In [219]:
duplication_set = pandas.merge( dfCounts[dfCounts.tfCount > 1], dfc, how='inner', on='TF' ).sort_values('TF')
duplication_set
#duplication_set.select(['TF','TFLength']).groupby(by='TF').rank(method='min')
duplication_set.groupby(by='TF').rank(method='min')
# ref: http://stackoverflow.com/questions/23976176/ranks-within-groupby-in-pandas

dfRank = lambda x: pandas.Series(pandas.qcut(x,2,labels=False),index=x.index)
dfRank2 = lambda x: pandas.qcut(x,2,labels=False)
# qcut works when applied directly to duplication_set['TFLength'], but fails when combined with groupby or apply here; unclear why
# duplication_set['ranks'] = duplication_set.groupby('TF')['TFLength'].apply(dfRank)
# duplication_set['ranks'] = duplication_set['TFLength'].apply(dfRank)
# duplication_set['ranks'] = pandas.qcut((duplication_set['TFLength']),2,labels=False)
duplication_set['ranks'] = dfRank2(duplication_set['TFLength'])

# adding rank to try to pivot multiple rows to columns but no dice
# df.pivot(columns='var', values='val')
#duplication_set[['TF','TFLength','ranks']].pivot(columns='ranks',values='TFLength') # drops the TF column (only the 'values' column survives) and keeps one NaN-padded row per input row instead of collapsing, so it is not really pivoting
# duplication_set.pivot_table(df,index=["TF","Ranks"]) # fails: grouper for 'TF' not 1-dimensional
# tired of fighting this odd outcome; dplyr handles this kind of reshape more gracefully than pandas
duplication_set


Out[219]:
TF tfCount TFLength ranks
0 ARID3A 2 22 1
1 ARID3A 2 7 0
2 BATF 2 18 1
3 BATF 2 10 0
4 EGR1 2 18 1
5 EGR1 2 11 0
6 EHF 2 17 1
7 EHF 2 13 0
9 ESR1 2 18 1
8 ESR1 2 17 1
10 ESR2 2 18 1
11 ESR2 2 7 0
12 FOXJ3 2 13 0
13 FOXJ3 2 13 0
14 GATA1 2 19 1
15 GATA1 2 10 0
17 GATA6 2 7 0
16 GATA6 2 14 1
19 MAFG 2 7 0
18 MAFG 2 21 1
20 MAFK 2 22 1
21 MAFK 2 10 0
22 NANOG 2 17 1
23 NANOG 2 8 0
24 NFATC1 2 12 0
25 NFATC1 2 13 0
26 NFIA 2 15 1
27 NFIA 2 7 0
29 NR1I2 2 8 0
28 NR1I2 2 19 1
... ... ... ... ...
50 RARA 2 17 1
51 RARA 2 7 0
52 RARG 2 20 1
53 RARG 2 7 0
54 SP1 2 19 1
55 SP1 2 11 0
56 STAT1 2 22 1
57 STAT1 2 15 1
59 TAL1 2 12 0
58 TAL1 2 19 1
60 TFDP1 2 22 1
61 TFDP1 2 14 1
62 TGIF1 2 16 1
63 TGIF1 2 7 0
64 THRA 2 17 1
65 THRA 2 10 0
66 THRB 2 17 1
67 THRB 2 10 0
69 TLX1 2 11 0
68 TLX1 2 17 1
70 TP63 2 19 1
71 TP63 2 11 0
72 TP73 2 20 1
73 TP73 2 12 0
74 VDR 2 16 1
75 VDR 2 7 0
76 ZBTB33 2 19 1
77 ZBTB33 2 12 0
78 ZBTB4 2 17 1
79 ZBTB4 2 15 1

80 rows × 4 columns
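
Note that the 'ranks' column above is a global median split from qcut, not a within-TF rank, which is why both ESR1 rows get 1 and both FOXJ3 rows get 0. A per-TF ordinal via groupby().cumcount() gives each duplicate a distinct 0/1 within its TF; the sketch below was not part of the original run.

    # per-TF ordinal: after sorting by length within each TF, the longest motif gets 0, the next 1
    dup = duplication_set.sort_values(['TF', 'TFLength'], ascending=[True, False]).copy()
    dup['dup_rank'] = dup.groupby('TF').cumcount()
    print(dup[['TF', 'TFLength', 'dup_rank']].head(8))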


In [220]:
pandas.qcut(duplication_set['TFLength'],2,labels=False)  # does not error; returns the quantile-bin labels used above as ranks


Out[220]:
0     1
1     0
2     1
3     0
4     1
5     0
6     1
7     0
9     1
8     1
10    1
11    0
12    0
13    0
14    1
15    0
17    0
16    1
19    0
18    1
20    1
21    0
22    1
23    0
24    0
25    0
26    1
27    0
29    0
28    1
     ..
50    1
51    0
52    1
53    0
54    1
55    0
56    1
57    1
59    0
58    1
60    1
61    1
62    1
63    0
64    1
65    0
66    1
67    0
69    0
68    1
70    1
71    0
72    1
73    0
74    1
75    0
76    1
77    0
78    1
79    1
Name: TFLength, dtype: int64

In [221]:
# this is wrong but not clear why? (because values='Lengths' names a column that does not exist; the column is 'TFLength', hence the KeyError below)
duplication_set[['TF','TFLength','ranks']].pivot(index='TF', columns='ranks',values='Lengths') # this is wrong too


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2133             try:
-> 2134                 return self._engine.get_loc(key)
   2135             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'Lengths'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-221-64c46a687b0d> in <module>()
      1 # this is wrong but not clear why?
----> 2 duplication_set[['TF','TFLength','ranks']].pivot(index='TF', columns='ranks',values='Lengths') # this is wrong too

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in pivot(self, index, columns, values)
   3933         """
   3934         from pandas.core.reshape import pivot
-> 3935         return pivot(self, index=index, columns=columns, values=values)
   3936 
   3937     def stack(self, level=-1, dropna=True):

/usr/local/lib/python3.5/dist-packages/pandas/core/reshape.py in pivot(self, index, columns, values)
    334         else:
    335             index = self[index]
--> 336         indexed = Series(self[values].values,
    337                          index=MultiIndex.from_arrays([index, self[columns]]))
    338         return indexed.unstack(columns)

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'Lengths'

In [ ]:
duplication_set

In [ ]:
duplication_set.dtypes

In [ ]:
# duplication_set.pivot(index='TF', columns='ranks',values='TFLength') # also fails: ValueError: Index contains duplicate entries, cannot reshape (the global qcut ranks give some TFs the same rank twice, e.g. ESR1)
# led to
# ref: http://stackoverflow.com/questions/28651079/pandas-unstack-problems-valueerror-index-contains-duplicate-entries-cannot-re#28652153
# e.set_index(['id', 'date', 'location'], append=True)
# note: this doesn't work either and just creates problems
#duplication_set.set_index(['TF', 'ranks', 'TFLength','tfCount'], append=True)#.pivot( columns='ranks',values='TFLength')
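
Once each duplicate carries a distinct within-TF ordinal (the dup_rank sketched after Out[219]), every (TF, column) pair is unique and the pivot no longer hits the duplicate-index error. Sketch only, not part of the original run; it relies on every duplicated TF having exactly two motifs (all tfCount values above are 2).

    # pivot works once the column key is unique within each TF
    wide = dup.pivot(index='TF', columns='dup_rank', values='TFLength')
    wide.columns = ['TFLength_longest', 'TFLength_shortest']  # two motifs per duplicated TF
    print(wide.head())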

In [222]:
duplication_set[duplication_set.TF == 'RFX5']


Out[222]:
TF tfCount TFLength ranks

Move On: Just Focus on Direct Runs

RFX5 does not appear in duplication_set above, so the duplicated RFX5 lines in the motif output are unlikely to stem from duplicate motif definitions; the runs below chase the issue directly in the matching step.

  • Debugging the motifs.py code;
  • grep 'RFX5' output.motif.20170114.vcf | grep '5.4636' > temp_investigate.txt
  • duplication of RFX5 occurs at chr1:145039922, chr22:25005718, and chr5:46363751

Build up Reduced Data Set

Built a reduced motif file:

  • only include RFX5
  • note: the motif file must end with a blank line
  • name: HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.RFX5.txt

Building Example VCF File for RFX5 [Code]

  • grep '^#' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf > ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf
  • grep 'chr1' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf | grep '145039922' >> ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf
  • grep 'chr22' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf | grep '25005718' >> ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf
  • grep 'chr5' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf | grep '46363751' >> ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf

Building Example VCF File for ZNF143 [Code]

  • grep '^#' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf > ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset2.vcf
  • grep 'chr1' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf | grep '762601' >> ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset2.vcf

Running the Replacement Code

  • python3 tf_expression.py -i ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf -o 1 -m ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.RFX5.txt -e ../../data/ALL_ARRAYS_NORMALIZED_MAXPROBE_LOG2_COORDS.sorted.txt -mo ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.RFX5.txt
    • Creating gene dictionary for expression data.
    • start read exp_file:../../data/ALL_ARRAYS_NORMALIZED_MAXPROBE_LOG2_COORDS.sorted.txt
    • Filtering motif info for TFs that don't meet the expression threshold of 5. Found 17225 genes. Start filtering motifs.
    • COMPLETE.
  • python3 motifs.py -i ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf -r ../../data/genome_reference/reference_genome_hg19.fa -m ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.RFX5.txt -o ../../data/output.motif.20170507.vcf -fm -fp -ci ../../data/GM12878.ENCODE.ALL_TFS.bed -co ../../data/output.chip_peaks_output.20170507.bed &> ../../data/0_run_logs/20170507_motifs_run_stdout.txt

Examining Logs

  • Shows 2 matches: 1 positive, 1 negative (i.e., the reverse complement)


In [232]:
# repeating the motifs.py code as individual sub-steps

import vcf
import motifs
import sequence
from pyfaidx import Fasta

force_ref_match = False

file_motif='../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.RFX5.txt'
pc = 0.1
th = 0
bp = [0.25, 0.25, 0.25, 0.25]
ws = 50
motif_set = motif.get_motifs(file_motif, pc, th, bp)
wing_l = max(motif_set.max_positions, ws)

file_reference_genome='../../data/genome_reference/reference_genome_hg19.fa'
fa_ind = Fasta(file_reference_genome)    # XXX: need to check, if present skip


file_input = '../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf'
with open(file_input) as vcf_handle:
    variant_set = vcf.read_vcf_variant_lines(vcf_handle, False)
    
for index in range(variant_set.length()):
        var_element = variant_set.seq[index]    # XXX: WARNING: changes made to element not saved
        
        # 1. get reference sequence
        var_element.get_surround_seq(wing_l, fa_ind, force_ref_match)
        # 2. compute reverse complement
        var_element.assign_rev_complement()
        # 3. compute int version (faster to process as int)
        var_element.assign_int_versions()
        
        ref_seq = var_element.return_full_ref_seq_int(wing_l)
        var_seq = var_element.return_full_var_seq_int(wing_l)
        print("\tref int: " + format(ref_seq) +
              "\n\tvar int: " + format(var_seq))
        print("start motif_match_int")
        plusmatch = motif_set.motif_match_int(bp, ref_seq, var_seq, wing_l)
        
        print('## Positive Matches ##')
        for match in plusmatch:
            print( match.name + " vs=" + str(round(match.var_score, 4)) + 
                  " rs = " + str(round(match.ref_score, 4)) )
        
        
        # 6. Calculate motif matches to reverse complement
        ref_seq_rc = var_element.return_full_ref_seq_reverse_complement_int(wing_l)
        var_seq_rc = var_element.return_full_var_seq_reverse_complement_int(wing_l)
        print("\tref rc int: " + format(ref_seq_rc) +
              "\n\tvar rc int: " + format(var_seq_rc))
        print("start motif_match_int reverse complement")
        minusmatch = motif_set.motif_match_int(bp, ref_seq_rc, var_seq_rc, wing_l)
        
        print('## Reverse Complement Matches ##')
        for match in minusmatch:
            print( match.name + " vs=" + str(round(match.var_score, 4)) + 
                  " rs = " + str(round(match.ref_score, 4)) )


pulling reference sequence for (chr1, 145039922, 1, 50, Fasta("../../data/genome_reference/reference_genome_hg19.fa"))
	145039872:145039973
	ref int: [3, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 2, 3, 4, 3, 1, 3, 2, 4, 3, 3, 1, 3, 2, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 1, 2, 4, 2, 4, 2, 2, 1, 3, 3, 2, 4, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 4, 3, 3, 2, 4, 3, 3, 2, 1, 2, 3, 1, 3, 4, 4, 3, 2]
	var int: [3, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 2, 3, 4, 3, 1, 3, 2, 4, 3, 3, 1, 3, 2, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 1, 2, 4, 2, 4, 2, 2, 1, 3, 3, 2, 4, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 4, 3, 3, 2, 4, 3, 3, 2, 1, 2, 3, 1, 3, 4, 4, 3, 2]
start motif_match_int
## Positive Matches ##
RFX5 vs=5.4636 rs = 8.4653
	ref rc int: [3, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 2, 3, 4, 3, 1, 3, 2, 4, 3, 3, 1, 3, 2, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 1, 2, 4, 2, 4, 2, 2, 1, 3, 3, 2, 4, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 4, 3, 3, 2, 4, 3, 3, 2, 1, 2, 3, 1, 3, 4, 4, 3, 2]
	var rc int: [3, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 2, 3, 4, 3, 1, 3, 2, 4, 3, 3, 1, 3, 2, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 1, 2, 4, 2, 4, 2, 2, 1, 3, 3, 2, 4, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 4, 3, 3, 2, 4, 3, 3, 2, 1, 2, 3, 1, 3, 4, 4, 3, 2]
start motif_match_int reverse complement
## Reverse Complement Matches ##
RFX5 vs=8.4653 rs = 5.4636
pulling reference sequence for (chr22, 25005718, 1, 50, Fasta("../../data/genome_reference/reference_genome_hg19.fa"))
	25005668:25005769
	ref int: [1, 2, 2, 2, 2, 1, 1, 2, 2, 4, 2, 4, 3, 4, 2, 1, 2, 2, 1, 2, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 3, 4, 4, 3, 4, 2, 1, 2, 1, 2, 2, 4, 3, 4, 2, 1, 2, 2, 4, 3, 2, 4, 3, 2, 2, 1, 4, 1, 3, 2, 2, 1, 4, 3, 1, 3, 1, 2, 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 1, 2, 4, 3, 2, 4, 3, 2, 2, 1, 2, 4, 2, 1, 2, 4, 3, 2]
	var int: [1, 2, 2, 2, 2, 1, 1, 2, 2, 4, 2, 4, 3, 4, 2, 1, 2, 2, 1, 2, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 3, 4, 4, 3, 4, 2, 1, 2, 1, 2, 2, 4, 3, 4, 2, 1, 2, 4, 4, 3, 2, 4, 3, 2, 2, 1, 4, 1, 3, 2, 2, 1, 4, 3, 1, 3, 1, 2, 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 1, 2, 4, 3, 2, 4, 3, 2, 2, 1, 2, 4, 2, 1, 2, 4, 3, 2]
start motif_match_int
## Positive Matches ##
RFX5 vs=6.8183 rs = 8.0363
	ref rc int: [1, 2, 2, 2, 2, 1, 1, 2, 2, 4, 2, 4, 3, 4, 2, 1, 2, 2, 1, 2, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 3, 4, 4, 3, 4, 2, 1, 2, 1, 2, 2, 4, 3, 4, 2, 1, 2, 3, 4, 3, 2, 4, 3, 2, 2, 1, 4, 1, 3, 2, 2, 1, 4, 3, 1, 3, 1, 2, 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 1, 2, 4, 3, 2, 4, 3, 2, 2, 1, 2, 4, 2, 1, 2, 4, 3, 2]
	var rc int: [1, 2, 2, 2, 2, 1, 1, 2, 2, 4, 2, 4, 3, 4, 2, 1, 2, 2, 1, 2, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 3, 4, 4, 3, 4, 2, 1, 2, 1, 2, 2, 4, 3, 4, 2, 1, 2, 1, 4, 3, 2, 4, 3, 2, 2, 1, 4, 1, 3, 2, 2, 1, 4, 3, 1, 3, 1, 2, 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 1, 2, 4, 3, 2, 4, 3, 2, 2, 1, 2, 4, 2, 1, 2, 4, 3, 2]
start motif_match_int reverse complement
## Reverse Complement Matches ##
RFX5 vs=4.7028 rs = 6.5375
pulling reference sequence for (chr5, 46363751, 1, 50, Fasta("../../data/genome_reference/reference_genome_hg19.fa"))
	46363701:46363802
	ref int: [3, 4, 1, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 4, 4, 2, 1, 1, 2, 4, 2, 4, 1, 2, 1, 1, 3, 4, 4, 3, 1, 1, 4, 3, 2, 1, 2, 1, 2, 1, 4, 2, 1, 2, 1, 1, 1, 3, 1, 1, 3, 4, 4, 4, 2, 4, 3, 1, 3, 1, 1, 1, 3, 2, 4, 4, 2, 4, 3, 4, 2, 4, 1, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 1, 4, 1, 4, 1, 4, 4, 4, 2, 2, 4, 4, 4, 4]
	var int: [3, 4, 1, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 4, 4, 2, 1, 1, 2, 4, 2, 4, 1, 2, 1, 1, 3, 4, 4, 3, 1, 1, 4, 3, 2, 1, 2, 1, 2, 1, 4, 2, 1, 2, 1, 1, 1, 3, 2, 0, 4, 1, 3, 4, 4, 4, 2, 4, 3, 1, 3, 1, 1, 1, 3, 2, 4, 4, 2, 4, 3, 4, 2, 4, 1, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 1, 4, 1, 4, 1, 4, 4, 4, 2, 2, 4, 4, 4, 4]
start motif_match_int
## Positive Matches ##
RFX5 vs=7.7285 rs = 6.6905
	ref rc int: [3, 4, 1, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 4, 4, 2, 1, 1, 2, 4, 2, 4, 1, 2, 1, 1, 3, 4, 4, 3, 1, 1, 4, 3, 2, 1, 2, 1, 2, 1, 4, 2, 1, 2, 1, 1, 1, 3, 4, 1, 3, 4, 4, 4, 2, 4, 3, 1, 3, 1, 1, 1, 3, 2, 4, 4, 2, 4, 3, 4, 2, 4, 1, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 1, 4, 1, 4, 1, 4, 4, 4, 2, 2, 4, 4, 4, 4]
	var rc int: [3, 4, 1, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 4, 4, 2, 1, 1, 2, 4, 2, 4, 1, 2, 1, 1, 3, 4, 4, 3, 1, 1, 4, 3, 2, 1, 2, 1, 2, 1, 4, 2, 1, 2, 1, 1, 1, 3, 1, 0, 3, 1, 3, 4, 4, 4, 2, 4, 3, 1, 3, 1, 1, 1, 3, 2, 4, 4, 2, 4, 3, 4, 2, 4, 1, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 1, 4, 1, 4, 1, 4, 4, 4, 2, 2, 4, 4, 4, 4]
start motif_match_int reverse complement
## Reverse Complement Matches ##
RFX5 vs=8.2578 rs = 8.8059
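
For reference, scores like the vs/rs values above are what a standard sliding-window log-odds scan over the int-encoded sequences produces. The sketch below illustrates that generic technique and is not the VENUSAR motif_match_int implementation; in particular, mapping codes 1-4 to the A, C, G, T rows of the matrix (with 0 as a placeholder) is an assumption.

    def score_window(pwm, seq_int, start):
        # sum per-position log-odds for one window; codes 1..4 index pwm rows 0..3 (assumed)
        total = 0.0
        for offset in range(len(pwm[0])):
            code = seq_int[start + offset]
            if code == 0:                 # placeholder / unknown base contributes nothing
                continue
            total += pwm[code - 1][offset]
        return total

    def best_window_score(pwm, seq_int):
        # best score over all window positions: a plain sliding-window scan
        width = len(pwm[0])
        return max(score_window(pwm, seq_int, s) for s in range(len(seq_int) - width + 1))

    # toy usage with the toy_pwm defined in the PWM sketch near the top of the notebook
    print(best_window_score(toy_pwm, [1, 3, 1, 2, 4, 3, 1]))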


In [ ]: