Requires run_this_first.ipynb and calculate_tau.ipynb. This script iterates over the digits of pi and/or tau. The user chooses the start point and the end point (up to one billion digits). The following statistics are calculated and saved.
ALWAYS, WITH OFFSET:
Note: since many aggregate statistics start by bouncing around between extreme values, the user can delay recording maxima and minima for the following by specifying a max_min_offset (a short sketch of this effect follows the lists below).
- running maxima and minima of the average digit value (average_maxes, average_mins)
- running maxima and minima of the coefficient of determination (cod_maxes, cod_mins)
- running maxima and minima of the digit-count range, i.e. the gap between the most and least frequent digit counts (count_range_maxes, count_range_mins)
ALWAYS, WITHOUT OFFSET:
- cod_targets: the positions where the coefficient of determination first reaches each successively tighter target (0.9, then 0.99, 0.999, and so on)
ONLY IF record_and_save is set to True:
- the full per-digit series: averages, cods, count_ranges, and counts_all
These statistics never have an offset.
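As a quick illustration of why the offset exists, here is a minimal sketch (not part of the notebook; the digit string is hardcoded purely for demonstration) showing the running average swinging between extremes over the first few digits before it settles:

digits = '14159265358979323846'  # illustrative digits only
running_average = 0.0
for n, d in enumerate(digits):
    # Same incremental mean update used in iter_analyze below.
    running_average += (int(d) - running_average) / (n + 1)
    print('%2d: %.3f' % (n, running_average))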
In [22]:
def iter_analyze(constants=['pi', 'tau'], start_position=0, end_position=1000, max_min_offset=12, record_and_save=False):
    import time
    import json

    def max_and_min(maxes_list, mins_list, value):
        # Record new running maxima/minima as [value, position] pairs, but
        # only once max_min_offset digits have passed. counter and position
        # are read from the enclosing scope.
        if counter >= max_min_offset:
            if len(maxes_list) == 0 or value > maxes_list[-1][0]:
                maxes_list.append([value, position])
            if len(mins_list) == 0 or value < mins_list[-1][0]:
                mins_list.append([value, position])

    starttime = time.time()
    for constant in constants:
        # Determine which 100-million-digit files are needed.
        start_file = int(start_position / 100000000)
        start_offset = start_position % 100000000
        end_file = int(end_position / 100000000)
        end_offset = end_position % 100000000
        file_list = []
        for i in range(start_file, end_file + 1):
            file_list.append('data/' + constant + '100m.dectxt.00%d' % (i))
        if record_and_save:
            averages = []
            cods = []  # coefficient of determination, a.k.a. r squared
            count_ranges = []
            counts_all = []
        counts = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        average_maxes = []
        average_mins = []
        cod_maxes = []
        cod_mins = []
        cod_target = 0.9
        cod_targets = []
        count_range_maxes = []
        count_range_mins = []
        running_average = 0.0
        countdown = 100  # one countdown tick per percent of the run
        countdown_interval = max(1, int((end_position - start_position) / 100))  # guard against a zero interval on short runs
        # A quick dict just for user messages.
        msg_dict = {}
        i = 0
        for filename in file_list:
            i += 1
            msg_dict[filename] = i
        position = start_position - 1
        for filename in file_list:
            if filename == file_list[0]:
                curr_start_offset = start_offset
            else:
                curr_start_offset = 0
            if filename == file_list[-1]:
                curr_end_offset = end_offset
            else:
                curr_end_offset = -1  # slice off the final character (assumed trailing newline)
            with open(filename, 'r') as file_in:
                all_digits = file_in.read()
            print "Processing file %d of %d." % (msg_dict[filename], end_file - start_file + 1)
            digits = all_digits[curr_start_offset:curr_end_offset]
            for digitstr in digits:
                digit = int(digitstr)
                position += 1
                counter = position - start_position
                if counter % countdown_interval == 0:
                    print countdown,
                    countdown -= 1
                # Incremental mean update; avoids re-summing all digits each step.
                running_average = running_average + (digit - running_average) / (counter + 1)
                max_and_min(average_maxes, average_mins, running_average)
                counts[digit] += 1
                curr_count_range = max(counts) - min(counts)
                max_and_min(count_range_maxes, count_range_mins, curr_count_range)
                ssr = 0
                sst = 0
                for x in range(10):
                    ssr += (counts[x] - (1.0 * (counter + 1) / 10)) ** 2  # sum of squared residuals
                    sst += counts[x] ** 2  # total sum of squares
                curr_cod = 1.0 - (ssr / sst)
                max_and_min(cod_maxes, cod_mins, curr_cod)
                if curr_cod >= cod_target:
                    # Record when the cod first reaches each successively tighter
                    # target: 0.9, then 0.99, 0.999, and so on.
                    cod_targets.append([curr_cod, position])
                    cod_target = 1.0 - ((1.0 - cod_target) * 0.1)
                if record_and_save:
                    averages.append(running_average)
                    cods.append(curr_cod)
                    count_ranges.append(curr_count_range)
                    counts_all.append(list(counts))  # copy; appending counts itself would store one shared reference
        print "Done. Elapsed time %0.1f minutes." % ((time.time() - starttime) / 60)
basename = "iter_result_" + constant + "_%d-%d-off%d_" % (start_position, end_position, max_min_offset)
basename2 = "iter_result_" + constant + "_%d-%d_" % (start_position, end_position)
with open(basename+'average_maxes', 'w+') as f:
f.write(json.dumps(average_maxes))
with open(basename+'average_mins', 'w+') as f:
f.write(json.dumps(average_mins))
with open(basename+'cod_maxes', 'w+') as f:
f.write(json.dumps(cod_maxes))
with open(basename+'cod_mins', 'w+') as f:
f.write(json.dumps(cod_mins))
with open(basename+'count_range_maxes', 'w+') as f:
f.write(json.dumps(count_range_maxes))
with open(basename+'count_range_mins', 'w+') as f:
f.write(json.dumps(count_range_mins))
with open(basename+'cod_targets', 'w+') as f:
f.write(json.dumps(cod_targets))
if record_and_save:
with open(basename2+'averages_all', 'w+') as f:
f.write(json.dumps(averages))
with open(basename2+'cods_all', 'w+') as f:
f.write(json.dumps(cods))
with open(basename2+'count_ranges_all', 'w+') as f:
f.write(json.dumps(count_ranges))
with open(basename2+'counts_all', 'w+') as f:
f.write(json.dumps(counts_all))
print "Procedure finished."
In [23]:
iter_analyze(['tau'], start_position=0, end_position=1000, max_min_offset=0, record_and_save=True)
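Once this cell has run, the saved files are plain JSON and can be loaded back for inspection. A minimal sketch (the filename follows the basename scheme produced by the call above):

import json

# Each saved entry is a [value, position] pair.
with open('iter_result_tau_0-1000-off0_average_maxes') as f:
    average_maxes = json.loads(f.read())
print(average_maxes[:3])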