Notebook Analyzing Duplication in Motif Definitions


In [207]:
import venusar
import motif
import thresholds
import motifs
import activity
import tf_expression
import gene_expression

In [208]:
# reload to pick up code changes
import imp  
imp.reload(motif)


Out[208]:
<module 'motif' from '/data640g1/data/documents/docs/projects/payton_venusar/VENUSAR_DEV/venusar/motif.py'>
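
Side note: the imp module used above has been deprecated since Python 3.4 (the environment here is Python 3.5, per the tracebacks below); the same reload can be done with importlib:

    import importlib
    importlib.reload(motif)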

Import Motif Definitions


In [209]:
motif_f_base='../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.txt'
pc = 0.1
th = 0
bp = [0.25, 0.25, 0.25, 0.25]
motif_set_base = motif.get_motifs(motif_f_base, pc, th, bp)
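
The arguments are presumably a pseudocount (pc), a score threshold (th) and background base probabilities (bp). As a reference point only, the block below sketches the usual pseudocount / log-odds PWM construction on a made-up count matrix; it illustrates the standard technique and is not the actual motif.get_motifs implementation.

    import math

    def log_odds_pwm(counts, pc=0.1, bp=(0.25, 0.25, 0.25, 0.25)):
        # counts: 4 x L matrix of per-base counts, rows in A, C, G, T order
        length = len(counts[0])
        pwm = []
        for base in range(4):
            row = []
            for pos in range(length):
                col_total = sum(counts[b][pos] for b in range(4))
                freq = (counts[base][pos] + pc) / (col_total + 4 * pc)  # smooth with pseudocount
                row.append(math.log2(freq / bp[base]))                  # log-odds vs background
            pwm.append(row)
        return pwm

    # toy 4 x 3 count matrix (A, C, G, T rows), for illustration only
    toy_counts = [[8, 0, 2], [1, 9, 2], [1, 0, 4], [0, 1, 2]]
    toy_pwm = log_odds_pwm(toy_counts)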

In [210]:
motif_set_base.length()
motif_dict = motif_set_base.motif_count(False)
len(motif_dict)


Out[210]:
601

In [211]:
# count valid motifs only; run twice to check whether any are invalid
motif_dict = motif_set_base.motif_count(True)
len(motif_dict)


Out[211]:
601
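
Assuming motif_count(...) tallies motifs per transcription-factor name (601 distinct names here, versus 641 motifs in total as the next cells show), the same tally can be reproduced standalone with a Counter. This assumes each motif object exposes a .name attribute, which is an inference from the rest of the notebook, not a documented fact.

    from collections import Counter

    # tally motifs per TF name (assumes each motif has a .name attribute)
    name_counts = Counter(m.name for m in motif_set_base.motifs)
    duplicated = {name: n for name, n in name_counts.items() if n > 1}
    print(len(name_counts), "distinct TF names;", len(duplicated), "of them have more than one motif")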

In [212]:
range(len(motif_set_base.motifs))


Out[212]:
range(0, 641)

In [213]:
range(0, (len(motif_set_base.motifs) - 1))


Out[213]:
range(0, 640)
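
Note that range(len(motif_set_base.motifs)) already covers indices 0 through 640, so the "len - 1" variant silently drops the last motif. Iterating the list directly avoids the index bookkeeping, e.g.:

    # direct iteration over the motif objects; the slice only keeps the output short
    for m in motif_set_base.motifs[:3]:
        print(m)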

Convert to DataFrame for Analysis of Duplication


In [214]:
import pandas

# -- building data frame from dictionary
#df = pandas.DataFrame(motif_dict) # errors about 'you must pass an index'
# ref: http://stackoverflow.com/questions/17839973/construct-pandas-dataframe-from-values-in-variables#17840195
dfCounts = pandas.DataFrame(motif_dict, index=[0])
dfCounts = pandas.melt(dfCounts) # rotate
dfCounts.rename(columns = {'variable': 'TF', 'value': 'tfCount'}, inplace=True)

# -- handling types
# ref: http://stackoverflow.com/questions/15891038/pandas-change-data-type-of-columns
dfCounts.dtypes

#pandas.to_numeric(s, errors='ignore')


Out[214]:
TF         object
tfCount     int64
dtype: object
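
An equivalent, more direct construction builds the counts frame straight from the dictionary items, with no melt/rename step (sketch only; same content as dfCounts above):

    # build (TF, tfCount) rows directly from the dictionary items
    dfCounts_alt = pandas.DataFrame(sorted(motif_dict.items()), columns=['TF', 'tfCount'])
    print(dfCounts_alt.dtypes)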

In [215]:
dfCounts.size
dfCounts[dfCounts.tfCount > 1]


Out[215]:
TF tfCount
6 ARID3A 2
24 BATF 2
84 EGR1 2
88 EHF 2
105 ESR1 2
106 ESR2 2
144 FOXJ3 2
158 GATA1 2
163 GATA6 2
286 MAFG 2
287 MAFK 2
323 NANOG 2
328 NFATC1 2
335 NFIA 2
359 NR1I2 2
360 NR1I3 2
365 NR2F1 2
366 NR2F2 2
368 NR3C1 2
388 PAX2 2
391 PAX5 2
399 PGR 2
407 PLAG1 2
424 PPARA 2
426 PPARG 2
436 RARA 2
438 RARG 2
487 SP1 2
500 STAT1 2
508 TAL1 2
536 TFDP1 2
539 TGIF1 2
543 THRA 2
544 THRB 2
545 TLX1 2
547 TP63 2
548 TP73 2
556 VDR 2
567 ZBTB33 2
568 ZBTB4 2

In [216]:
# add additional information to the dataframe
msb_names,msb_lengths = motif_set_base.motif_lengths(False)
# define data frame by columns
dfc = pandas.DataFrame(
    {
        "TF" : msb_names,
        "TFLength" : msb_lengths
    })
print(dfc)

if False:
    # define data frame by row (this is wrong: names and lengths end up interleaved, and rename(..., inplace=True) returns None, so dfr would be None)
    dfr = pandas.melt(pandas.DataFrame(
        [ msb_names,
        msb_lengths])).rename(columns = {'variable': 'TF', 'value': 'tfCount'}, inplace=True)
    dfr


          TF  TFLength
0        AHR         9
1       AIRE        18
2       ALX1        12
3       ALX3        11
4       ALX4        12
5         AR        17
6     TFAP2A        15
7     TFAP2B        10
8     TFAP2C        14
9     TFAP2D        14
10    ARID3A        22
11    ARID3A         7
12    ARID5B        13
13     ARNT2        12
14      ARNT         8
15       ARX        13
16     ASCL2        14
17      ATF1        11
18      ATF2         9
19      ATF3         9
20      ATF6        12
21      ATF7        11
22     ATOH1        11
23     BACH1        13
24    BARHL1        13
25    BARHL2        18
26     BARX1        17
27     BARX2        10
28     BATF3        12
29      BATF        18
..       ...       ...
611     ZEB1        11
612   HIVEP1        12
613   HIVEP2        14
614    ZFHX3        11
615      ZFX        19
616     ZIC1         9
617     ZIC2         9
618     ZIC3         9
619     ZIC4        15
620  ZKSCAN1        22
621  ZKSCAN3        11
622   ZNF143        22
623   ZNF148        15
624   ZNF219        12
625   ZNF232        19
626   ZNF282        14
627   ZNF333         9
628   ZNF350        24
629   ZNF384        12
630   ZNF410        13
631   ZNF423        15
632   ZNF524        18
633   ZNF589        16
634   ZNF639        20
635   ZNF652        11
636   ZNF713        19
637   ZNF740        14
638   ZNF784        11
639  ZSCAN16        20
640   ZSCAN4        12

[641 rows x 2 columns]
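
For completeness, a row-wise construction that does work simply zips the two parallel lists into (name, length) records; the if False: attempt above interleaves names and lengths and, because rename(..., inplace=True) returns None, would leave dfr as None anyway.

    # working row-wise construction: zip the parallel name/length lists into records
    dfr = pandas.DataFrame(list(zip(msb_names, msb_lengths)), columns=['TF', 'TFLength'])
    print(dfr.head())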

In [217]:
dfCounts[dfCounts.tfCount > 1].TF


Out[217]:
6      ARID3A
24       BATF
84       EGR1
88        EHF
105      ESR1
106      ESR2
144     FOXJ3
158     GATA1
163     GATA6
286      MAFG
287      MAFK
323     NANOG
328    NFATC1
335      NFIA
359     NR1I2
360     NR1I3
365     NR2F1
366     NR2F2
368     NR3C1
388      PAX2
391      PAX5
399       PGR
407     PLAG1
424     PPARA
426     PPARG
436      RARA
438      RARG
487       SP1
500     STAT1
508      TAL1
536     TFDP1
539     TGIF1
543      THRA
544      THRB
545      TLX1
547      TP63
548      TP73
556       VDR
567    ZBTB33
568     ZBTB4
Name: TF, dtype: object

In [218]:
dfc


Out[218]:
TF TFLength
0 AHR 9
1 AIRE 18
2 ALX1 12
3 ALX3 11
4 ALX4 12
5 AR 17
6 TFAP2A 15
7 TFAP2B 10
8 TFAP2C 14
9 TFAP2D 14
10 ARID3A 22
11 ARID3A 7
12 ARID5B 13
13 ARNT2 12
14 ARNT 8
15 ARX 13
16 ASCL2 14
17 ATF1 11
18 ATF2 9
19 ATF3 9
20 ATF6 12
21 ATF7 11
22 ATOH1 11
23 BACH1 13
24 BARHL1 13
25 BARHL2 18
26 BARX1 17
27 BARX2 10
28 BATF3 12
29 BATF 18
... ... ...
611 ZEB1 11
612 HIVEP1 12
613 HIVEP2 14
614 ZFHX3 11
615 ZFX 19
616 ZIC1 9
617 ZIC2 9
618 ZIC3 9
619 ZIC4 15
620 ZKSCAN1 22
621 ZKSCAN3 11
622 ZNF143 22
623 ZNF148 15
624 ZNF219 12
625 ZNF232 19
626 ZNF282 14
627 ZNF333 9
628 ZNF350 24
629 ZNF384 12
630 ZNF410 13
631 ZNF423 15
632 ZNF524 18
633 ZNF589 16
634 ZNF639 20
635 ZNF652 11
636 ZNF713 19
637 ZNF740 14
638 ZNF784 11
639 ZSCAN16 20
640 ZSCAN4 12

641 rows × 2 columns


In [219]:
duplication_set = pandas.merge( dfCounts[dfCounts.tfCount > 1], dfc, how='inner', on='TF' ).sort_values('TF')
duplication_set
#duplication_set.select(['TF','TFLength']).groupby(by='TF').rank(method='min')
duplication_set.groupby(by='TF').rank(method='min')
# ref: http://stackoverflow.com/questions/23976176/ranks-within-groupby-in-pandas

dfRank = lambda x: pandas.Series(pandas.qcut(x,2,labels=False),index=x.index)
dfRank2 = lambda x: pandas.qcut(x,2,labels=False)
# qcut works when applied directly to duplication_set['TFLength'], but fails when combined with groupby or apply here; unclear why
# duplication_set['ranks'] = duplication_set.groupby('TF')['TFLength'].apply(dfRank)
# duplication_set['ranks'] = duplication_set['TFLength'].apply(dfRank)
# duplication_set['ranks'] = pandas.qcut((duplication_set['TFLength']),2,labels=False)
duplication_set['ranks'] = dfRank2(duplication_set['TFLength'])

# adding rank to try to pivot multiple rows to columns but no dice
# df.pivot(columns='var', values='val')
#duplication_set[['TF','TFLength','ranks']].pivot(columns='ranks',values='TFLength') # drops the TF column (only the 'values' column survives) and keeps one NaN-padded row per input row instead of collapsing, so it is not really pivoting
# duplication_set.pivot_table(df,index=["TF","Ranks"]) # fails: grouper for 'TF' not 1-dimensional
# tired of fighting this odd outcome; dplyr handles this kind of reshape more gracefully than pandas
duplication_set


Out[219]:
TF tfCount TFLength ranks
0 ARID3A 2 22 1
1 ARID3A 2 7 0
2 BATF 2 18 1
3 BATF 2 10 0
4 EGR1 2 18 1
5 EGR1 2 11 0
6 EHF 2 17 1
7 EHF 2 13 0
9 ESR1 2 18 1
8 ESR1 2 17 1
10 ESR2 2 18 1
11 ESR2 2 7 0
12 FOXJ3 2 13 0
13 FOXJ3 2 13 0
14 GATA1 2 19 1
15 GATA1 2 10 0
17 GATA6 2 7 0
16 GATA6 2 14 1
19 MAFG 2 7 0
18 MAFG 2 21 1
20 MAFK 2 22 1
21 MAFK 2 10 0
22 NANOG 2 17 1
23 NANOG 2 8 0
24 NFATC1 2 12 0
25 NFATC1 2 13 0
26 NFIA 2 15 1
27 NFIA 2 7 0
29 NR1I2 2 8 0
28 NR1I2 2 19 1
... ... ... ... ...
50 RARA 2 17 1
51 RARA 2 7 0
52 RARG 2 20 1
53 RARG 2 7 0
54 SP1 2 19 1
55 SP1 2 11 0
56 STAT1 2 22 1
57 STAT1 2 15 1
59 TAL1 2 12 0
58 TAL1 2 19 1
60 TFDP1 2 22 1
61 TFDP1 2 14 1
62 TGIF1 2 16 1
63 TGIF1 2 7 0
64 THRA 2 17 1
65 THRA 2 10 0
66 THRB 2 17 1
67 THRB 2 10 0
69 TLX1 2 11 0
68 TLX1 2 17 1
70 TP63 2 19 1
71 TP63 2 11 0
72 TP73 2 20 1
73 TP73 2 12 0
74 VDR 2 16 1
75 VDR 2 7 0
76 ZBTB33 2 19 1
77 ZBTB33 2 12 0
78 ZBTB4 2 17 1
79 ZBTB4 2 15 1

80 rows × 4 columns
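
Note that the 'ranks' column above is a global median split from qcut, not a within-TF rank, which is why both ESR1 rows get 1 and both FOXJ3 rows get 0. A per-TF ordinal via groupby().cumcount() gives each duplicate a distinct 0/1 within its TF; the sketch below was not part of the original run.

    # per-TF ordinal: after sorting by length within each TF, the longest motif gets 0, the next 1
    dup = duplication_set.sort_values(['TF', 'TFLength'], ascending=[True, False]).copy()
    dup['dup_rank'] = dup.groupby('TF').cumcount()
    print(dup[['TF', 'TFLength', 'dup_rank']].head(8))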


In [220]:
pandas.qcut(duplication_set['TFLength'],2,labels=False)  # does not error; returns the quantile-bin labels used above as ranks


Out[220]:
0     1
1     0
2     1
3     0
4     1
5     0
6     1
7     0
9     1
8     1
10    1
11    0
12    0
13    0
14    1
15    0
17    0
16    1
19    0
18    1
20    1
21    0
22    1
23    0
24    0
25    0
26    1
27    0
29    0
28    1
     ..
50    1
51    0
52    1
53    0
54    1
55    0
56    1
57    1
59    0
58    1
60    1
61    1
62    1
63    0
64    1
65    0
66    1
67    0
69    0
68    1
70    1
71    0
72    1
73    0
74    1
75    0
76    1
77    0
78    1
79    1
Name: TFLength, dtype: int64

In [221]:
# this is wrong but not clear why? (because values='Lengths' names a column that does not exist; the column is 'TFLength', hence the KeyError below)
duplication_set[['TF','TFLength','ranks']].pivot(index='TF', columns='ranks',values='Lengths') # this is wrong too


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2133             try:
-> 2134                 return self._engine.get_loc(key)
   2135             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'Lengths'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-221-64c46a687b0d> in <module>()
      1 # this is wrong but not clear why?
----> 2 duplication_set[['TF','TFLength','ranks']].pivot(index='TF', columns='ranks',values='Lengths') # this is wrong too

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in pivot(self, index, columns, values)
   3933         """
   3934         from pandas.core.reshape import pivot
-> 3935         return pivot(self, index=index, columns=columns, values=values)
   3936 
   3937     def stack(self, level=-1, dropna=True):

/usr/local/lib/python3.5/dist-packages/pandas/core/reshape.py in pivot(self, index, columns, values)
    334         else:
    335             index = self[index]
--> 336         indexed = Series(self[values].values,
    337                          index=MultiIndex.from_arrays([index, self[columns]]))
    338         return indexed.unstack(columns)

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   2057             return self._getitem_multilevel(key)
   2058         else:
-> 2059             return self._getitem_column(key)
   2060 
   2061     def _getitem_column(self, key):

/usr/local/lib/python3.5/dist-packages/pandas/core/frame.py in _getitem_column(self, key)
   2064         # get column
   2065         if self.columns.is_unique:
-> 2066             return self._get_item_cache(key)
   2067 
   2068         # duplicate columns & possible reduce dimensionality

/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1384         res = cache.get(item)
   1385         if res is None:
-> 1386             values = self._data.get(item)
   1387             res = self._box_item_values(item, values)
   1388             cache[item] = res

/usr/local/lib/python3.5/dist-packages/pandas/core/internals.py in get(self, item, fastpath)
   3541 
   3542             if not isnull(item):
-> 3543                 loc = self.items.get_loc(item)
   3544             else:
   3545                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/local/lib/python3.5/dist-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   2134                 return self._engine.get_loc(key)
   2135             except KeyError:
-> 2136                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2137 
   2138         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4433)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4279)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13742)()

pandas/src/hashtable_class_helper.pxi in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:13696)()

KeyError: 'Lengths'

In [ ]:
duplication_set

In [ ]:
duplication_set.dtypes

In [ ]:
# duplication_set.pivot(index='TF', columns='ranks',values='TFLength') # also fails: ValueError: Index contains duplicate entries, cannot reshape (the global qcut ranks give some TFs the same rank twice, e.g. ESR1)
# led to
# ref: http://stackoverflow.com/questions/28651079/pandas-unstack-problems-valueerror-index-contains-duplicate-entries-cannot-re#28652153
# e.set_index(['id', 'date', 'location'], append=True)
# note: this doesn't work either and just creates problems
#duplication_set.set_index(['TF', 'ranks', 'TFLength','tfCount'], append=True)#.pivot( columns='ranks',values='TFLength')
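
Once each duplicate carries a distinct within-TF ordinal (the dup_rank sketched after Out[219]), every (TF, column) pair is unique and the pivot no longer hits the duplicate-index error. Sketch only, not part of the original run; it relies on every duplicated TF having exactly two motifs (all tfCount values above are 2).

    # pivot works once the column key is unique within each TF
    wide = dup.pivot(index='TF', columns='dup_rank', values='TFLength')
    wide.columns = ['TFLength_longest', 'TFLength_shortest']  # two motifs per duplicated TF
    print(wide.head())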

In [222]:
duplication_set[duplication_set.TF == 'RFX5']


Out[222]:
TF tfCount TFLength ranks

Move On: Just Focus on Direct Runs

RFX5 does not appear in duplication_set above, so the duplicated RFX5 lines in the motif output are unlikely to stem from duplicate motif definitions; the runs below chase the issue directly in the matching step.

  • Debugging the motifs.py code;
  • grep 'RFX5' output.motif.20170114.vcf | grep '5.4636' > temp_investigate.txt
  • duplication of RFX5 occurs at chr1:145039922, chr22:25005718, and chr5:46363751

Build up Reduced Data Set

Built a reduced motif file:

  • only include RFX5
  • note: the motif file must end with a blank line
  • name: HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.RFX5.txt

Building Example VCF File for RFX5 [Code]

  • grep '^#' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf > ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf
  • grep 'chr1' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf | grep '145039922' >> ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf
  • grep 'chr22' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf | grep '25005718' >> ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf
  • grep 'chr5' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf | grep '46363751' >> ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf

Building Example VCF File for ZNF143 [Code]

  • grep '^#' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf > ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset2.vcf
  • grep 'chr1' ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf | grep '762601' >> ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset2.vcf

Running the Replacement Code

  • python3 tf_expression.py -i ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf -o 1 -m ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.RFX5.txt -e ../../data/ALL_ARRAYS_NORMALIZED_MAXPROBE_LOG2_COORDS.sorted.txt -mo ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.RFX5.txt
    • Creating gene dictionary for expression data.
    • start read exp_file:../../data/ALL_ARRAYS_NORMALIZED_MAXPROBE_LOG2_COORDS.sorted.txt
    • Filtering motif info for TFs that don't meet the expression threshold of 5. Found 17225 genes. Start filtering motifs.
    • COMPLETE.
  • python3 motifs.py -i ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf -r ../../data/genome_reference/reference_genome_hg19.fa -m ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.RFX5.txt -o ../../data/output.motif.20170507.vcf -fm -fp -ci ../../data/GM12878.ENCODE.ALL_TFS.bed -co ../../data/output.chip_peaks_output.20170507.bed &> ../../data/0_run_logs/20170507_motifs_run_stdout.txt

Examining Logs

  • Shows 2 matches: 1 positive, 1 negative (i.e., the reverse complement)


In [232]:
# repeating the motifs.py code as individual sub-steps

import vcf
import motifs
import sequence
from pyfaidx import Fasta

force_ref_match = False

file_motif='../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.RFX5.txt'
pc = 0.1
th = 0
bp = [0.25, 0.25, 0.25, 0.25]
ws = 50
motif_set = motif.get_motifs(file_motif, pc, th, bp)
wing_l = max(motif_set.max_positions, ws)

file_reference_genome='../../data/genome_reference/reference_genome_hg19.fa'
fa_ind = Fasta(file_reference_genome)    # XXX: need to check, if present skip


file_input = '../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.subset.vcf'
with open(file_input) as vcf_handle:
    variant_set = vcf.read_vcf_variant_lines(vcf_handle, False)
    
for index in range(variant_set.length()):
        var_element = variant_set.seq[index]    # XXX: WARNING: changes made to element not saved
        
        # 1. get reference sequence
        var_element.get_surround_seq(wing_l, fa_ind, force_ref_match)
        # 2. compute reverse complement
        var_element.assign_rev_complement()
        # 3. compute int version (faster to process as int)
        var_element.assign_int_versions()
        
        ref_seq = var_element.return_full_ref_seq_int(wing_l)
        var_seq = var_element.return_full_var_seq_int(wing_l)
        print("\tref int: " + format(ref_seq) +
              "\n\tvar int: " + format(var_seq))
        print("start motif_match_int")
        plusmatch = motif_set.motif_match_int(bp, ref_seq, var_seq, wing_l)
        
        print('## Positive Matches ##')
        for match in plusmatch:
            print( match.name + " vs=" + str(round(match.var_score, 4)) + 
                  " rs = " + str(round(match.ref_score, 4)) )
        
        
        # 6. Calculate motif matches to reverse complement
        ref_seq_rc = var_element.return_full_ref_seq_reverse_complement_int(wing_l)
        var_seq_rc = var_element.return_full_var_seq_reverse_complement_int(wing_l)
        print("\tref rc int: " + format(ref_seq_rc) +
              "\n\tvar rc int: " + format(var_seq_rc))
        print("start motif_match_int reverse complement")
        minusmatch = motif_set.motif_match_int(bp, ref_seq_rc, var_seq_rc, wing_l)
        
        print('## Reverse Complement Matches ##')
        for match in minusmatch:
            print( match.name + " vs=" + str(round(match.var_score, 4)) + 
                  " rs = " + str(round(match.ref_score, 4)) )


pulling reference sequence for (chr1, 145039922, 1, 50, Fasta("../../data/genome_reference/reference_genome_hg19.fa"))
	145039872:145039973
	ref int: [3, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 2, 3, 4, 3, 1, 3, 2, 4, 3, 3, 1, 3, 2, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 1, 2, 4, 2, 4, 2, 2, 1, 3, 3, 2, 4, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 4, 3, 3, 2, 4, 3, 3, 2, 1, 2, 3, 1, 3, 4, 4, 3, 2]
	var int: [3, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 2, 3, 4, 3, 1, 3, 2, 4, 3, 3, 1, 3, 2, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 1, 2, 4, 2, 4, 2, 2, 1, 3, 3, 2, 4, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 4, 3, 3, 2, 4, 3, 3, 2, 1, 2, 3, 1, 3, 4, 4, 3, 2]
start motif_match_int
## Positive Matches ##
RFX5 vs=5.4636 rs = 8.4653
	ref rc int: [3, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 2, 3, 4, 3, 1, 3, 2, 4, 3, 3, 1, 3, 2, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 3, 2, 2, 3, 2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 1, 2, 4, 2, 4, 2, 2, 1, 3, 3, 2, 4, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 4, 3, 3, 2, 4, 3, 3, 2, 1, 2, 3, 1, 3, 4, 4, 3, 2]
	var rc int: [3, 2, 3, 2, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3, 3, 2, 4, 3, 4, 4, 4, 2, 3, 4, 3, 1, 3, 2, 4, 3, 3, 1, 3, 2, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 4, 3, 2, 3, 2, 3, 2, 1, 2, 4, 2, 4, 2, 2, 1, 3, 3, 2, 4, 3, 1, 3, 1, 2, 1, 2, 3, 1, 2, 4, 3, 3, 2, 4, 3, 3, 2, 1, 2, 3, 1, 3, 4, 4, 3, 2]
start motif_match_int reverse complement
## Reverse Complement Matches ##
RFX5 vs=8.4653 rs = 5.4636
pulling reference sequence for (chr22, 25005718, 1, 50, Fasta("../../data/genome_reference/reference_genome_hg19.fa"))
	25005668:25005769
	ref int: [1, 2, 2, 2, 2, 1, 1, 2, 2, 4, 2, 4, 3, 4, 2, 1, 2, 2, 1, 2, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 3, 4, 4, 3, 4, 2, 1, 2, 1, 2, 2, 4, 3, 4, 2, 1, 2, 2, 4, 3, 2, 4, 3, 2, 2, 1, 4, 1, 3, 2, 2, 1, 4, 3, 1, 3, 1, 2, 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 1, 2, 4, 3, 2, 4, 3, 2, 2, 1, 2, 4, 2, 1, 2, 4, 3, 2]
	var int: [1, 2, 2, 2, 2, 1, 1, 2, 2, 4, 2, 4, 3, 4, 2, 1, 2, 2, 1, 2, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 3, 4, 4, 3, 4, 2, 1, 2, 1, 2, 2, 4, 3, 4, 2, 1, 2, 4, 4, 3, 2, 4, 3, 2, 2, 1, 4, 1, 3, 2, 2, 1, 4, 3, 1, 3, 1, 2, 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 1, 2, 4, 3, 2, 4, 3, 2, 2, 1, 2, 4, 2, 1, 2, 4, 3, 2]
start motif_match_int
## Positive Matches ##
RFX5 vs=6.8183 rs = 8.0363
	ref rc int: [1, 2, 2, 2, 2, 1, 1, 2, 2, 4, 2, 4, 3, 4, 2, 1, 2, 2, 1, 2, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 3, 4, 4, 3, 4, 2, 1, 2, 1, 2, 2, 4, 3, 4, 2, 1, 2, 3, 4, 3, 2, 4, 3, 2, 2, 1, 4, 1, 3, 2, 2, 1, 4, 3, 1, 3, 1, 2, 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 1, 2, 4, 3, 2, 4, 3, 2, 2, 1, 2, 4, 2, 1, 2, 4, 3, 2]
	var rc int: [1, 2, 2, 2, 2, 1, 1, 2, 2, 4, 2, 4, 3, 4, 2, 1, 2, 2, 1, 2, 1, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 3, 4, 4, 3, 4, 2, 1, 2, 1, 2, 2, 4, 3, 4, 2, 1, 2, 1, 4, 3, 2, 4, 3, 2, 2, 1, 4, 1, 3, 2, 2, 1, 4, 3, 1, 3, 1, 2, 4, 4, 2, 2, 2, 1, 1, 3, 3, 3, 4, 2, 1, 2, 4, 3, 2, 4, 3, 2, 2, 1, 2, 4, 2, 1, 2, 4, 3, 2]
start motif_match_int reverse complement
## Reverse Complement Matches ##
RFX5 vs=4.7028 rs = 6.5375
pulling reference sequence for (chr5, 46363751, 1, 50, Fasta("../../data/genome_reference/reference_genome_hg19.fa"))
	46363701:46363802
	ref int: [3, 4, 1, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 4, 4, 2, 1, 1, 2, 4, 2, 4, 1, 2, 1, 1, 3, 4, 4, 3, 1, 1, 4, 3, 2, 1, 2, 1, 2, 1, 4, 2, 1, 2, 1, 1, 1, 3, 1, 1, 3, 4, 4, 4, 2, 4, 3, 1, 3, 1, 1, 1, 3, 2, 4, 4, 2, 4, 3, 4, 2, 4, 1, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 1, 4, 1, 4, 1, 4, 4, 4, 2, 2, 4, 4, 4, 4]
	var int: [3, 4, 1, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 4, 4, 2, 1, 1, 2, 4, 2, 4, 1, 2, 1, 1, 3, 4, 4, 3, 1, 1, 4, 3, 2, 1, 2, 1, 2, 1, 4, 2, 1, 2, 1, 1, 1, 3, 2, 0, 4, 1, 3, 4, 4, 4, 2, 4, 3, 1, 3, 1, 1, 1, 3, 2, 4, 4, 2, 4, 3, 4, 2, 4, 1, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 1, 4, 1, 4, 1, 4, 4, 4, 2, 2, 4, 4, 4, 4]
start motif_match_int
## Positive Matches ##
RFX5 vs=7.7285 rs = 6.6905
	ref rc int: [3, 4, 1, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 4, 4, 2, 1, 1, 2, 4, 2, 4, 1, 2, 1, 1, 3, 4, 4, 3, 1, 1, 4, 3, 2, 1, 2, 1, 2, 1, 4, 2, 1, 2, 1, 1, 1, 3, 4, 1, 3, 4, 4, 4, 2, 4, 3, 1, 3, 1, 1, 1, 3, 2, 4, 4, 2, 4, 3, 4, 2, 4, 1, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 1, 4, 1, 4, 1, 4, 4, 4, 2, 2, 4, 4, 4, 4]
	var rc int: [3, 4, 1, 2, 2, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 4, 4, 2, 1, 1, 2, 4, 2, 4, 1, 2, 1, 1, 3, 4, 4, 3, 1, 1, 4, 3, 2, 1, 2, 1, 2, 1, 4, 2, 1, 2, 1, 1, 1, 3, 1, 0, 3, 1, 3, 4, 4, 4, 2, 4, 3, 1, 3, 1, 1, 1, 3, 2, 4, 4, 2, 4, 3, 4, 2, 4, 1, 3, 4, 4, 4, 4, 4, 1, 4, 4, 4, 3, 1, 1, 4, 1, 4, 1, 4, 4, 4, 2, 2, 4, 4, 4, 4]
start motif_match_int reverse complement
## Reverse Complement Matches ##
RFX5 vs=8.2578 rs = 8.8059
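
For reference, scores like the vs/rs values above are what a standard sliding-window log-odds scan over the int-encoded sequences produces. The sketch below illustrates that generic technique and is not the VENUSAR motif_match_int implementation; in particular, mapping codes 1-4 to the A, C, G, T rows of the matrix (with 0 as a placeholder) is an assumption.

    def score_window(pwm, seq_int, start):
        # sum per-position log-odds for one window; codes 1..4 index pwm rows 0..3 (assumed)
        total = 0.0
        for offset in range(len(pwm[0])):
            code = seq_int[start + offset]
            if code == 0:                 # placeholder / unknown base contributes nothing
                continue
            total += pwm[code - 1][offset]
        return total

    def best_window_score(pwm, seq_int):
        # best score over all window positions: a plain sliding-window scan
        width = len(pwm[0])
        return max(score_window(pwm, seq_int, s) for s in range(len(seq_int) - width + 1))

    # toy usage with the toy_pwm defined in the PWM sketch near the top of the notebook
    print(best_window_score(toy_pwm, [1, 3, 1, 2, 4, 3, 1]))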


In [ ]: