iterative_analyses.ipynb

You should first run the notebooks run_this_first.ipynb and calculate_tau.ipynb.

This notebook iterates over the digits of pi and/or tau. The user chooses the start and end positions (up to one billion). The following statistics are calculated and saved.

ALWAYS, WITH OFFSET:

Note: because many aggregate statistics bounce around between extreme values early on, the user can delay recording maxima and minima for the following statistics by specifying a max_min_offset (see the sketch after this list).

  • average_max: the position and value every time a new overall maximum average digit value is encountered
  • average_min: the same, for minimum
  • cod_max: the position and value every time a new overall maximum value for the coefficient of determination is encountered. The c.o.d. may be more familiar as the r-squared value, but here the observed digit counts are compared to a model uniform distribution (one tenth of the digits seen so far for each of 0-9), not to the best-fit line of the current distribution as would occur in regression analysis; see the sketch after this list.
  • cod_min: the same, for minimum.
  • count_range_max: the position and value every time a new maximum range (highest digit count minus lowest digit count) is encountered.
  • count_range_min: the same, for the minimum range.
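
For illustration, here is a minimal standalone sketch (not part of the notebook itself) of the two ideas above: the coefficient of determination measured against a uniform model, mirroring the loop body in the code cell below, and the max_min_offset gate that delays recording maxima and minima. The counts and offset values here are made up for the example.

def cod_vs_uniform(counts):
    #compare observed digit counts to a uniform model: each of the ten
    #digits is expected to appear (total digits so far) / 10 times
    n = sum(counts)
    expected = n / 10.0
    ssr = sum((c - expected) ** 2 for c in counts) #squared residuals vs. the uniform model
    sst = sum(c ** 2 for c in counts)              #sum of squared counts, as in the notebook's formula
    return 1.0 - ssr / sst

#hypothetical counts after 100 digits (not real pi or tau data)
print(cod_vs_uniform([9, 11, 10, 8, 12, 10, 9, 11, 10, 10]))

#the max_min_offset gate: nothing is recorded until the position counter
#(measured from start_position) reaches the offset
max_min_offset = 12
counter, value, position = 15, 4.52, 15
average_maxes = []
if counter >= max_min_offset and (len(average_maxes) == 0 or value > average_maxes[-1][0]):
    average_maxes.append([value, position]) #stored as [value, position], as in the notebook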

ALWAYS, WITHOUT OFFSET:

  • cod_targets: the position and value the first time the c.o.d. reaches or exceeds each threshold in the sequence 0.9, 0.99, 0.999, and so on (a short sketch of the threshold rule follows).
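
The thresholds are generated by shrinking the remaining gap to 1.0 by a factor of ten each time a target is hit. A quick standalone sketch of that rule (the same update used in the code cell below):

cod_target = 0.9
for _ in range(4):
    print(cod_target)                              #approximately 0.9, 0.99, 0.999, ... (up to floating-point rounding)
    cod_target = 1.0 - ((1.0 - cod_target) * 0.1)  #shrink the gap to 1.0 by a factor of ten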

ONLY IF record_and_save is set to True:

These statistics never have an offset.

  • averages_all: a list of all of the average digit values, by position
  • cods_all: a list of all the cods, by position
  • counts_all: a list of the counts for every digit at each position. This file can get large. (A sketch for loading the saved files back in follows this list.)
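
Once a run finishes, the saved files are plain JSON and can be loaded back for plotting or further analysis. A minimal sketch, assuming the example run shown below (tau, positions 0 to 1000, offset 0, record_and_save=True); the file names follow the basename patterns built near the end of the code cell:

import json

#with offset:    iter_result_<constant>_<start>-<end>-off<offset>_<statistic>
#without offset: iter_result_<constant>_<start>-<end>_<statistic>
with open('iter_result_tau_0-1000-off0_average_maxes') as f:
    average_maxes = json.load(f)   #list of [value, position] pairs
with open('iter_result_tau_0-1000_averages_all') as f:
    averages_all = json.load(f)    #one running average per position

print(average_maxes[-1])           #the latest overall maximum average digit value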

In [22]:
def iter_analyze(constants=['pi', 'tau'], start_position=0, end_position=1000, max_min_offset=12, record_and_save=False):

    import time
    import json
    
    def max_and_min(maxes_list, mins_list, value):
        #record a new overall max or min as [value, position];
        #counter, position and max_min_offset come from the enclosing scope,
        #and nothing is recorded until counter reaches max_min_offset
        if counter >= max_min_offset:
            if len(maxes_list) == 0 or value > maxes_list[-1][0]:
                maxes_list.append([value, position])
            if len(mins_list) == 0 or value < mins_list[-1][0]:
                mins_list.append([value, position])
    
    starttime = time.time()
    
    for constant in constants:
    
        #determine the files needed; each data file holds 100,000,000 digits
        start_file = int(start_position / 100000000)
        start_offset = start_position % 100000000
        end_file = int(end_position / 100000000)
        end_offset = end_position % 100000000
    
        file_list = []
        for i in range(start_file, end_file + 1):
            file_list.append('data/%s100m.dectxt.00%d' % (constant, i))
        
        if record_and_save:
            averages = []
            cods = [] #coefficient of determination, a.k.a. r squared
            count_ranges = []
            counts_all = []
            
        counts = [0,0,0,0,0,0,0,0,0,0]
        
        average_maxes = []
        average_mins = []
        
        cod_maxes = []
        cod_mins = []
        
        cod_target = 0.9
        cod_targets = []
        
        count_range_maxes = []
        count_range_mins = []
        
        running_average = 0.0
        
        #progress ticker: decrements roughly once per percent of the requested range
        countdown = 1000
        countdown_interval = max(1, int((end_position - start_position) / 100))
        
        #a quick dict just for user messages
        msg_dict = {}
        for i, filename in enumerate(file_list, 1):
            msg_dict[filename] = i
        
        position = start_position - 1
        
        for filename in file_list:
            
            if filename == file_list[0]:
                curr_start_offset = start_offset
            else:
                curr_start_offset = 0
            if filename == file_list[-1]:
                curr_end_offset = end_offset
            else:
                curr_end_offset = -1
                
            with open(filename, 'r') as file_in:
                all_digits = file_in.read()
            
            print "Processing file %d of %d." % (msg_dict[filename], end_file - start_file + 1)
            
            digits = all_digits[curr_start_offset:curr_end_offset]
            
            for digitstr in digits:
                digit = int(digitstr)
                position += 1
                counter = position - start_position
                if counter % countdown_interval == 0:
                    print countdown,
                    countdown -= 1
                #incremental update of the running mean digit value
                running_average = running_average + (digit - running_average) / (counter + 1)
                max_and_min(average_maxes, average_mins, running_average)
                
                counts[digit]+=1
                curr_count_range = max(counts) - min(counts)
                max_and_min(count_range_maxes, count_range_mins, curr_count_range)
            
                ssr = 0
                sst = 0
                for x in range(10):
                    ssr += (counts[x] - (1.0 * (counter + 1) / 10)) ** 2 #squared residuals vs. a uniform model of (counter+1)/10 per digit
                    sst += counts[x] ** 2                                #sum of the squared counts, used as the total here
                curr_cod = 1.0 - (ssr / sst)
                max_and_min(cod_maxes, cod_mins, curr_cod)
                if curr_cod >= cod_target:
                    cod_targets.append([curr_cod,position])
                    cod_target = 1.0 - ((1.0 - cod_target) * 0.1)
                
                if record_and_save:
                    averages.append(running_average)
                    cods.append(curr_cod)
                    count_ranges.append(curr_count_range)
                    counts_all.append(list(counts)) #append a copy so each position keeps its own snapshot of the counts
                    
            print "Done. Elapsed time %0.1f minutes." % ((time.time() - starttime) / 60)
            
        basename = "iter_result_" + constant + "_%d-%d-off%d_" % (start_position, end_position, max_min_offset)
        basename2 = "iter_result_" + constant + "_%d-%d_" % (start_position, end_position)
                   
        with open(basename+'average_maxes', 'w+') as f:
            f.write(json.dumps(average_maxes))      
        with open(basename+'average_mins', 'w+') as f:
            f.write(json.dumps(average_mins))
        with open(basename+'cod_maxes', 'w+') as f:
            f.write(json.dumps(cod_maxes))
        with open(basename+'cod_mins', 'w+') as f:
            f.write(json.dumps(cod_mins))
        with open(basename+'count_range_maxes', 'w+') as f:
            f.write(json.dumps(count_range_maxes))
        with open(basename+'count_range_mins', 'w+') as f:
            f.write(json.dumps(count_range_mins))
        with open(basename+'cod_targets', 'w+') as f:
            f.write(json.dumps(cod_targets))
        
        if record_and_save:
            with open(basename2+'averages_all', 'w+') as f:
                f.write(json.dumps(averages))
            with open(basename2+'cods_all', 'w+') as f:
                f.write(json.dumps(cods))
            with open(basename2+'count_ranges_all', 'w+') as f:
                f.write(json.dumps(count_ranges))
            with open(basename2+'counts_all', 'w+') as f:
                f.write(json.dumps(counts_all))
           
    print "Procedure finished."

In [23]:
iter_analyze(['tau'], start_position=0, end_position=1000, max_min_offset=0, record_and_save=True)


Processing file 1 of 1.
1000 999 998 997 996 995 994 993 992 991 990 989 988 987 986 985 984 983 982 981 980 979 978 977 976 975 974 973 972 971 970 969 968 967 966 965 964 963 962 961 960 959 958 957 956 955 954 953 952 951 950 949 948 947 946 945 944 943 942 941 940 939 938 937 936 935 934 933 932 931 930 929 928 927 926 925 924 923 922 921 920 919 918 917 916 915 914 913 912 911 910 909 908 907 906 905 904 903 902 901 Done. Elapsed time 0.0 minutes.
Procedure finished.