find_string_patterns.ipynb

You will need to run run_me_first and calculate_tau first.

This script finds patterns in pi and tau by performing string searches. The search criteria are divided into four groups:

Group 1: patterns consisting of a string of single digits with a defined startpoint and variable length

repeated digits
digits consecutively increasing or decreasing by 1
the digits of pi, tau, e or root 2

Group 2: patterns consisting of a string of digits with a defined startpoint and variable internal length (e.g. for the primes, 2357 and 235711 would be positive matches but 23571 would not.)

primes or the fibonacci sequences (starting at 0 or 1)

Group 3: regex patterns

consecutive even, odd or prime digits
consecutive digits not containing one particular digit
consecutive digits above or below the digit average of 4.5
consecutive binary digits (0 or 1)

Group 4: finding specific digits

Jenny's number (8675309)
The Sir Mix-A-Lot number (18007476492568)
All 10 digits in increasing or decreasing order
The Kennedy assassination date (112263)
The number of the beast (666); it counts any string of 3 or more consecutive 6s as one match

To avoid missing matches over a file break, the first 1000 digits from the next file are added to the end

Note: the dict that kept the results was buggy so I removed it and output everything to stdout and to a list saved as json, which can be parsed. Group 4, which keeps counts, not positions, is still saved as a dict.



In [1]:

    
# Per-group switches: each search section of the main loop is guarded by one of
# these flags, so set any of them to False to redo only part of the analysis.

do_group_1 = True
do_group_2 = True
do_group_3 = True
do_group_4 = True

import re
import json
import time

################################################################################
# DEFINE SEARCH TERMS
# Each test has three elements: an abbreviation, a full description, and a
# sequence (tuple or list) of search terms.
# The search looks for ever longer prefixes of each term, so if a match as long
# as a whole term ever appears, the term needs to be lengthened.

# Length of the generated repeated-digit and consecutive-digit terms.
RUN_TERM_LENGTH = 20

# Enough repetitions of the ten-digit cycle to cut a full-length term starting
# at any of its ten offsets.
_CYCLES = RUN_TERM_LENGTH // 10 + 2
_ASCENDING = '0123456789' * _CYCLES
_DESCENDING = '9876543210' * _CYCLES

group_1 = [
    # '000...0' through '999...9'
    ('rep', 'repeating digits',
     tuple(str(digit) * RUN_TERM_LENGTH for digit in range(10))),
    # Runs stepping by +1 starting at 0, 1, ..., 9, then runs stepping by -1
    # starting at 9, 8, ..., 0 (wrapping between 9 and 0). Each term appears
    # exactly once: a prefix that already failed cannot succeed at a greater
    # length, so a repeated term would only cost an extra scan per file.
    ('consec', 'consecutive increasing or decreasing digits',
     tuple(_ASCENDING[start:start + RUN_TERM_LENGTH] for start in range(10)) +
     tuple(_DESCENDING[start:start + RUN_TERM_LENGTH] for start in range(10))),
    ('pi', 'the digits of pi', ['3141592653589793238462643']),
    ('tau', 'the digits of tau', ['6283185307179586476925286']),
    ('e', "the digits of Euler's number", ['2718281828459045235360287']),
    ('root2', 'the digits of the square root of two', ['1414213562373095048801688']),
]

# Each test is (abbreviation, description, [concatenated sequence], boundaries).
# `boundaries` lists the prefix lengths at which the concatenated string ends on
# a complete member of the sequence (e.g. for the primes 4 -> '2357' and
# 6 -> '235711'); only prefixes of these lengths are searched for.
group_2 = [
    ('primes', 'the sequence of prime numbers',
     ['235711131719232931374143475359'],
     [1, 2, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22]),
    # 0 1 1 2 3 5 8 13 21 34 55 89 144 233: the prefix ending in 144 is 20
    # digits long (this entry previously read 30, so that length was skipped).
    ('fib0', 'fibonacci sequence starting at 0',
     ['01123581321345589144233'],
     [1, 2, 3, 4, 5, 6, 7, 9, 11, 13, 15, 17, 20, 23]),
    ('fib1', 'fibonacci sequence starting at 1',
     ['1123581321345589144233'],
     [1, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16, 19, 22]),
]

# Each test is (abbreviation, description, patterns). The search appends a
# '{n}' repeat count to every pattern, so each pattern must be a single
# character class; a bare multi-character string such as '01' would have the
# count applied to its last character only.
group_3 = [
    ('primed', 'prime digits', ['[2357]']),
    ('even', 'even digits', ['[02468]']),
    ('odd', 'odd digits', ['[13579]']),
    ('not', 'not containing a particular digit',
     ('[^0]', '[^1]', '[^2]', '[^3]', '[^4]',
      '[^5]', '[^6]', '[^7]', '[^8]', '[^9]')),
    ('5 or more', 'digits greater than average value of 4.5', ['[56789]']),
    ('4 or less', 'digits less than average value of 4.5', ['[01234]']),
    # Was '01', which with the repeat count matched a 0 followed by n ones.
    ('binary', 'zero or one', ['[01]']),
]

# Regex patterns whose occurrences are counted rather than extended.
group_4 = [
    '8675309',         # Jenny's number
    '112263',          # the Kennedy assassination date
    '18007476492568',  # the Sir Mix-A-Lot number
    # 666 preceded by a non-6 digit, so a longer run of 6s counts once; the
    # match therefore starts on the digit just before the first 6.
    '[^6]666',
    '0123456789',      # all ten digits in increasing order
    '9876543210',      # all ten digits in decreasing order
]



In [2]:

    
starttime = time.time()

# The search treats every data file as covering this many digit positions: a hit
# starting at or beyond it lies in the overlap taken from the next file and is
# left for that file to report.
DIGITS_PER_FILE = 100000000
NUM_FILES = 10


def _load_digits(constant, index):
    """Return the text of data file `index` for `constant`.

    For every file except the last, the contents of the '.1K' companion of the
    following file are appended so that matches straddling a file break are
    still found.
    """
    infilename = 'data/' + constant + '100m.dectxt.00%d' % (index)
    with open(infilename, 'r') as fin:
        digits = fin.read()

    # A single parenthesised argument prints the same text on Python 2 and 3.
    print("processing %s" % infilename)
    if index < NUM_FILES - 1:
        next1K = 'data/' + constant + '100m.dectxt.00%d.1K' % (index + 1)
        with open(next1K, 'r') as fin2:
            digits += fin2.read()
    return digits


def _prefix_query(criterion, length):
    """Query for groups 1 and 2: the first `length` characters of the term."""
    return criterion[:length]


def _repeat_query(criterion, length):
    """Query for group 3: the character class repeated exactly `length` times."""
    return criterion + '{%s}' % (length)


def _find_literal(query, digits):
    """Return the index of the first occurrence of `query`, or -1."""
    return digits.find(query)


def _find_regex(query, digits):
    """Return the start of the first regex match of `query`, or -1."""
    match = re.search(query, digits)
    return match.start() if match else -1


def _plus_one(length):
    """Groups 1 and 3 grow the searched length one digit at a time."""
    return length + 1


def _boundary_stepper(placelist):
    """Build the group 2 length step: jump to the next length in `placelist`.

    Once the length passes 25 the scan gives up and returns that length as is.
    """
    def advance(length):
        length += 1
        while length not in placelist:
            length += 1
            if length > 25:
                break
        return length
    return advance


def _extend_record(record, test, digits, offset, constant, hits,
                   make_query, locate, advance):
    """Push one test's record as far as the current file allows.

    record     -- [next length to try, position of the longest hit so far],
                  updated in place. The length is one more than the longest
                  match found (fencepost counting).
    test       -- search-term tuple; test[0] is its name, test[2] its terms.
    digits     -- text of the current file plus overlap.
    offset     -- global position of the first digit of the current file.
    hits       -- list that receives [constant, name, query, position] per hit.
    make_query -- builds the query from (term, length).
    locate     -- returns the first position of a query in `digits`, or -1.
    advance    -- returns the next length to try after a hit.

    As in the search-term notes, a term that is matched in full must be
    lengthened; this function does not guard against running past its end.
    """
    name = test[0]
    next_len, best_pos = record
    for criterion in test[2]:
        while True:
            query = make_query(criterion, next_len)
            pos = locate(query, digits)
            if pos == -1 or pos >= DIGITS_PER_FILE:
                break
            # Every hit is for a length that has never matched before, so it
            # is always the new record holder, whichever file it came from.
            best_pos = pos + offset
            hits.append([constant, name, query, best_pos])
            next_len = advance(next_len)
    record[0] = next_len
    record[1] = best_pos


def _tally_pattern(tally, pattern, digits, offset):
    """Add this file's matches of `pattern` to tally = [first position, count].

    A first position of 0 doubles as "not seen yet", matching the layout of
    the saved JSON.
    """
    first = None
    count = 0
    for match in re.finditer(pattern, digits):
        if match.start() >= DIGITS_PER_FILE:
            # finditer yields matches left to right, so everything from here
            # on lies in the overlap and is counted with the next file.
            break
        if count == 0:
            first = match.start()
        count += 1
    if count > 0:
        if tally[0] == 0:
            tally[0] = first + offset
        tally[1] += count


# NOTE(review): `result` is shared by both constants, so the file written for
# 'tau' also contains every 'pi' row (each row names its constant). Confirm
# whether that is intended before relying on the per-constant files.
result = []

for constant in ['pi', 'tau']:

    # Fresh records per constant: [next length to try, position of best hit]
    # for groups 1-3, and [first position, count] for group 4.
    group_1_results = {item[0]: [5, 0] for item in group_1}
    group_2_results = {item[0]: [4, 0] for item in group_2}
    group_3_results = {item[0]: [5, 0] for item in group_3}
    group_4_results = {item: [0, 0] for item in group_4}

    for i in range(NUM_FILES):
        digits = _load_digits(constant, i)
        offset = i * DIGITS_PER_FILE

        if do_group_1:
            for test in group_1:
                _extend_record(group_1_results[test[0]], test, digits, offset,
                               constant, result,
                               _prefix_query, _find_literal, _plus_one)

        if do_group_2:
            for test in group_2:
                _extend_record(group_2_results[test[0]], test, digits, offset,
                               constant, result,
                               _prefix_query, _find_literal,
                               _boundary_stepper(test[3]))

        if do_group_3:
            for test in group_3:
                _extend_record(group_3_results[test[0]], test, digits, offset,
                               constant, result,
                               _repeat_query, _find_regex, _plus_one)

        if do_group_4:
            for item in group_4:
                _tally_pattern(group_4_results[item], item, digits, offset)

    # Only the hit list and the group 4 counts are saved; the group 1-3 record
    # dicts are working state for the search.
    with open(constant+'_search_results', 'w+') as f3:
        f3.write(json.dumps(result))
    if do_group_4:
        with open(constant+'_search_results_4', 'w+') as f4:
            f4.write(json.dumps(group_4_results))









    



processing data/pi100m.dectxt.000
processing data/pi100m.dectxt.001
processing data/pi100m.dectxt.002
processing data/pi100m.dectxt.003
processing data/pi100m.dectxt.004
processing data/pi100m.dectxt.005
processing data/pi100m.dectxt.006
processing data/pi100m.dectxt.007
processing data/pi100m.dectxt.008
processing data/pi100m.dectxt.009
processing data/tau100m.dectxt.000
processing data/tau100m.dectxt.001
processing data/tau100m.dectxt.002
processing data/tau100m.dectxt.003
processing data/tau100m.dectxt.004
processing data/tau100m.dectxt.005
processing data/tau100m.dectxt.006
processing data/tau100m.dectxt.007
processing data/tau100m.dectxt.008
processing data/tau100m.dectxt.009



In [ ]: