This notebook analyzes a `toughies.csv` file. That file is generated by running the Django management command `get_toughie_info.py` against a backup of the production database.


In [ ]:
import pandas
import random
import csv

In [ ]:
# Read the per-question statistics; the trailing expression shows the row count.
toughies = pandas.read_csv('./toughies.csv')
toughies.shape[0]

For better statistical significance, keep only the bingos that were asked at least 30 times.


In [ ]:
# Drop questions with fewer than 30 asks; `toughies` itself is not modified.
better_toughies = toughies[toughies['asked'].ge(30)]

In [ ]:
def lexkey_assigner(row):
    """Return the sort key for a row's lexicon, oldest dictionary first.

    `row` is anything indexable by 'lexicon'. OWL2 maps to 1, America to 2
    and NWL18 to 3; any other lexicon yields None.
    """
    update_order = {'OWL2': 1, 'America': 2, 'NWL18': 3}
    return update_order.get(row['lexicon'])

# Tag every row with its lexicon's chronological key and order the frame by it,
# so that iterating the frame visits older lexica before newer ones.
better_toughies = (
    better_toughies
    .assign(lexkey=lambda frame: frame.apply(lexkey_assigner, axis=1))
    .sort_values('lexkey')
)

In [ ]:
# Load the reference sets of 7- and 8-letter alphagrams, one per line.
# Lines of any other length (including blank lines) are ignored.
with open('./7s_alphs.csv') as sevens_file:
    alphas_7s = {line for line in sevens_file.read().split('\n') if len(line) == 7}

with open('./8s_alphs.csv') as eights_file:
    alphas_8s = {line for line in eights_file.read().split('\n') if len(line) == 8}

In [ ]:
def additions_to_existing_alphagrams(filename) -> dict:
    """
    Map alphagrams to their words, for alphagrams that already had a word
    before a dictionary update and were touched by that update.

    The CSV file lists the words and alphagrams involved in one update. It
    must have a header row with the columns ``word``, ``alpha`` and ``added``.
    A row whose ``added`` value is the empty string is treated as a word that
    existed before the update (presumably new words carry a '+' marker there).

    Returns a dict of alphagram -> list of all its words in file order,
    restricted to alphagrams with at least one row whose ``added`` is empty.
    """
    entries_by_alpha = {}
    # The csv module documentation asks for files to be opened with
    # newline='' so the reader does its own line-ending handling.
    with open(filename, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            entries_by_alpha.setdefault(row['alpha'], []).append(
                (row['word'], row['added']))

    # Keep only the alphagrams that have at least one previously existing word.
    return {
        alpha: [word for word, _ in entries]
        for alpha, entries in entries_by_alpha.items()
        if any(added == '' for _, added in entries)
    }

# The input files read here were produced by a query of this shape
# (shown for length 7):
# select word, alphagram, lexicon_symbols from (
#      select alphagrams.alphagram from alphagrams where
#      contains_update_to_lex=1 and length=7 order by alphagrams.probability) q
# inner join words w using (alphagram);
#
# "first update" is OWL2 -> America, "second update" is America -> NWL18.
(
    new_sevens_first_update,
    new_sevens_second_update,
    new_eights_first_update,
    new_eights_second_update,
) = (
    additions_to_existing_alphagrams(path)
    for path in (
        './7snew_owl2_america.txt',
        './7snew_america_nwl18.txt',
        './8snew_owl2_america.txt',
        './8snew_america_nwl18.txt',
    )
)

In [ ]:
# Spot check: all rows recorded for one 8-letter alphagram.
better_toughies[better_toughies['Alphagram'].eq('ACCEORST')]

In [ ]:
# Spot check: all rows recorded for one 7-letter alphagram.
better_toughies[better_toughies['Alphagram'].eq('AEEINST')]

In [ ]:
# Work out which alphagrams count as "already asked" under the newest lexicon.
# An alphagram asked under an older lexicon stops counting once a later update
# adds a word to it, because the question's answers changed.
asked_7s = set()
asked_8s = set()

# For each lexicon, the alphagrams (7s, 8s) that gained words when it took effect.
additions_by_lexicon = {
    'America': (new_sevens_first_update, new_eights_first_update),
    'NWL18': (new_sevens_second_update, new_eights_second_update),
}

# Start at the first lexicon.
last_lex = 'OWL2'
for row in better_toughies.itertuples():
    # Fields are read by name: positional indexing would silently pick the
    # wrong column if the CSV column order (or the appended lexkey) changed.
    lex = row.lexicon
    # On the first row of a newer lexicon, clear out questions that got new additions.
    if lex != last_lex and lex in additions_by_lexicon:
        changed_7s, changed_8s = additions_by_lexicon[lex]
        asked_7s.difference_update(changed_7s)
        asked_8s.difference_update(changed_8s)

    alpha = row.Alphagram
    if len(alpha) == 7:
        asked_7s.add(alpha)
    elif len(alpha) == 8:
        asked_8s.add(alpha)

    last_lex = lex

print(f'Asked {len(asked_7s)} out of {len(alphas_7s)} 7s')
print(f'Asked {len(asked_8s)} out of {len(alphas_8s)} 8s')

In [ ]:
# Number of 50-question batches still needed to cover the unasked 7s, then the 8s.
for every_alpha, already_asked in ((alphas_7s, asked_7s), (alphas_8s, asked_8s)):
    print((len(every_alpha) - len(already_asked)) / 50)

Determine a list of all bingos by difficulty!


In [ ]:
# Reduce to one row per alphagram, defaulting to results from newer lexica.
# This is because Aerolith may at first have been populated by people who were
# already really good at the bingos, but as time passed, more lower-rated
# players have been joining. There may still be some bias from early users
# being better on average, if a question was asked a lot back in the day.
ask_slack = 7  # just some fudge factor

bingos = {}
for row in better_toughies.itertuples():
    # The frame is sorted from oldest to newest lexicon, so `row` is always at
    # least as new as whatever is stored. Fields are read by name rather than
    # by position so a change in CSV column order cannot silently break this.
    alpha = row.Alphagram
    kept = bingos.get(alpha)
    # The newer row replaces the stored one unless it was asked at least
    # `ask_slack` fewer times, i.e. newer data wins unless it is clearly
    # thinner than the older data.
    if kept is None or kept.asked < row.asked + ask_slack:
        bingos[alpha] = row

print(len(bingos))

In [ ]:
# One DataFrame row per key of `bingos` (the alphagram becomes the index label).
# NOTE(review): later cells select columns by name ('probability', 'Alphagram'),
# which presumably come from the itertuples row fields -- confirm.
bdf = pandas.DataFrame.from_dict(bingos, orient='index')

In [ ]:
# Example question: what are the hardest `total` bingos whose probability is
# at most `prob_limit`, and how many of them are 7s?

total = 1000
prob_limit = 15000
within_limit = bdf[bdf['probability'] <= prob_limit]
hardest_first = within_limit.sort_values('difficulty', ascending=False)
alphas = hardest_first['Alphagram'][:total]
ct = sum(1 for alpha in alphas if len(alpha) == 7)
print(f'There are {ct} 7s out of {total}')
# Uncomment to list the alphagrams themselves:
# for alpha in alphas:
#     print(alpha)

Run the following cell to determine which questions are left to ask (this may be useful for future lexicon updates or for CSW).


In [ ]:
# Build the final question lists: every alphagram that has not been asked yet,
# padded with randomly chosen already-asked ones up to a round total.
target_7s = 200
target_8s = 400
batch_size = 50


def _print_in_batches(alphagrams, size):
    """Print one alphagram per line, with a short rule after every `size` items."""
    for count, alphagram in enumerate(alphagrams, start=1):
        print(alphagram)
        if count % size == 0:
            print('-' * 6)


missing_8s = list(alphas_8s - asked_8s)
missing_7s = list(alphas_7s - asked_7s)

asked_7s_new = list(asked_7s)
asked_8s_new = list(asked_8s)
random.shuffle(asked_7s_new)
random.shuffle(asked_8s_new)

# Pad with exactly as many already-asked alphagrams as are needed to reach the
# targets. The pad sizes are computed rather than hard-coded so the cell keeps
# working when the input data changes.
missing_8s.extend(asked_8s_new[:max(0, target_8s - len(missing_8s))])
missing_7s.extend(asked_7s_new[:max(0, target_7s - len(missing_7s))])
# These still fail if there are more unasked alphagrams than the target, or
# too few asked ones to pad with.
assert len(missing_7s) == target_7s
assert len(missing_8s) == target_8s

random.shuffle(missing_7s)
random.shuffle(missing_8s)

_print_in_batches(missing_7s, batch_size)
print('-' * 12)
_print_in_batches(missing_8s, batch_size)

Challenge simulator - how many unasked questions do we have after a certain time?


In [ ]:
# Monte Carlo estimate: if `num_qs` distinct questions are drawn uniformly at
# random every day, how many of the `num_alphas` questions have never come up
# after `num_days` days?
num_days = 8 * 365   # We've been asking qs for roughly 8 years (since Jun 2011 -- update if changes)
num_qs = 50
num_alphas = 28029   # How many sevens or eights

# Ids of the questions that have not been asked so far.
alphas = set(range(num_alphas))

question_ids = range(num_alphas)
# Never request more distinct questions than exist.
daily_draw = min(num_qs, num_alphas)
for _ in range(num_days):
    # random.sample picks a uniformly random subset without replacement, the
    # same distribution as shuffling everything and taking a prefix, but
    # without shuffling all ids on every simulated day.
    alphas.difference_update(random.sample(question_ids, daily_draw))

print(len(alphas))