In [1]:
import csv
from datetime import date
from itertools import combinations
import numpy as np
import pandas as pd
import pickle as pkl
import seaborn as sns
from string import ascii_uppercase
import time as time
%matplotlib inline
# Longest leave (in tiles) that gets its own entry in the value tables.
maximum_superleave_length = 6

# Move log to aggregate; the commented path is an alternative log file.
# log_file = '../logs/log_1m.csv'
log_file = '../logs/log_20200411.csv'

# Date stamp (YYYYMMDD) that is appended to the output filenames.
todays_date = format(date.today(), "%Y%m%d")
In [2]:
# Display the date stamp that gets appended to the output filenames.
todays_date
Out[2]:
Create a dictionary of all possible 1 to 6-tile leaves. Also, add functionality for sorting by an arbitrary key - allowing us to put rarest letters first
In [3]:
# Full tile bag, kept for reference:
# tilebag = ['A']*9+['B']*2+['C']*2+['D']*4+['E']*12+\
# ['F']*2+['G']*3+['H']*2+['I']*9+['J']*1+\
# ['K']*1+['L']*4+['M']*2+['N']*6+['O']*8+\
# ['P']*2+['Q']*1+['R']*6+['S']*4+['T']*6+\
# ['U']*4+['V']*2+['W']*2+['X']*1+['Y']*2+\
# ['Z']*1+['?']*2

# No superleave is longer than 6 letters, so A, E, I and O only need 6 copies
# each. This shortens the time it takes to find all of the superleaves by 50%!
truncated_tilebag = [
    letter
    for letter, copies in (
        ('A', 6), ('B', 2), ('C', 2), ('D', 4), ('E', 6),
        ('F', 2), ('G', 3), ('H', 2), ('I', 6), ('J', 1),
        ('K', 1), ('L', 4), ('M', 2), ('N', 6), ('O', 6),
        ('P', 2), ('Q', 1), ('R', 6), ('S', 4), ('T', 6),
        ('U', 4), ('V', 2), ('W', 2), ('X', 1), ('Y', 2),
        ('Z', 1), ('?', 2),
    )
    for _ in range(copies)
]

# Every distinct tile: the 26 letters followed by the blank ('?').
tiles = list(ascii_uppercase) + ['?']

# potential future improvement: calculate optimal order of letters on the fly
# rarity_key = 'ZXKJQ?HYMFPWBCVSGDLURTNAOIE'
alphabetical_key = '?ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Sort key: a tile's position in alphabetical_key (blank first, then A-Z).
# Raises ValueError for a character that is not in the key.
sort_func = alphabetical_key.index
On my home machine, the following code takes about 7 minutes to run in its entirety.
In [4]:
# t0 = time.time()
# leaves = {i:sorted(list(set(list(combinations(truncated_tilebag,i))))) for i in
# range(1,maximum_superleave_length+1)}
# # turn leaves from lists of letters into strings
# # algorithm runs faster if leaves non-alphabetical!
# for i in range(1,maximum_superleave_length+1):
# leaves[i] = [''.join(sorted(leave, key=sort_func))
# for leave in leaves[i]]
# t1 = time.time()
# print('Calculated superleaves up to length {} in {} seconds'.format(
# maximum_superleave_length,t1-t0))
# pkl.dump(leaves,open('all_leaves.p','wb'))
In [5]:
# Load the precomputed {leave length: [leave strings]} table.
# NOTE: unpickling can execute arbitrary code, so only load an 'all_leaves.p'
# that you generated yourself.
# The context manager makes sure the file handle is closed after loading.
with open('all_leaves.p','rb') as leaves_file:
    leaves = pkl.load(leaves_file)
How many superleaves are there of each length? See below:
In [6]:
# Print "<length> <number of distinct leaves of that length>" for each length.
for leave_length in range(1, maximum_superleave_length + 1):
    print(leave_length, len(leaves[leave_length]))
In [7]:
# Flatten the per-length lists into one list, shortest leaves first.
all_leaves = [
    leave
    for leave_length in range(1, maximum_superleave_length + 1)
    for leave in leaves[leave_length]
]
In [8]:
def find_subleaves(rack, min_length=1, max_length=6, duplicates_allowed = False):
    """Return the subleaves of `rack` whose length is in [min_length, max_length].

    Each subleave is a string whose tiles are ordered by the module-level
    `sort_func` key.

    Parameters
    ----------
    rack : str or sequence of single-character tiles
        The tiles to choose from; they do not need to be pre-sorted.
    min_length, max_length : int
        Inclusive bounds on the subleave length. Lengths greater than
        len(rack) simply contribute nothing.
    duplicates_allowed : bool
        If True, return one entry per positional combination, so a rack with
        repeated tiles yields repeated subleaves. If False, each distinct
        subleave is returned exactly once.

    Returns
    -------
    list of str
    """
    subleaves = [''.join(sorted(combo, key=sort_func))
                 for size in range(min_length, max_length + 1)
                 for combo in combinations(rack, size)]
    if duplicates_allowed:
        return subleaves
    # De-duplicate AFTER normalising the tile order: for an unsorted rack such
    # as 'BAB', the tuples ('B','A') and ('A','B') are different but describe
    # the same leave 'AB'. dict.fromkeys also keeps the order deterministic.
    return list(dict.fromkeys(subleaves))
tile_limit below is the minimum number of tiles left on a rack for it to be factored into superleave calculation. The idea is that moves with the bag empty tend to be worth less, and may not reflect the value of a letter in the rest of the game (most notably, if you have the blank and the bag is empty, you often can't bingo!). Moves also tend to be worth a little bit less at the beginning of a game, when there are fewer juicy spots to play.
In [9]:
# Stream the move log once and accumulate, for every subleave of each logged
# rack, how often it occurred and the points/equity/bingos of the move.
t0 = time.time()
tile_limit = 1

# Per-leave tallies, keyed by leave string.
bingo_count = {x:0 for x in all_leaves}
count = {x:0 for x in all_leaves}
equity = {x:0 for x in all_leaves}
points = {x:0 for x in all_leaves}

# Totals over every row that passes the tile_limit filter.
row_count = 0
total_equity = 0
total_points = 0

# Column usage below, as far as this cell shows: row[3] is the rack string,
# row[5] an integer score, row[7] is compared with '7' to detect a bingo,
# row[9] a float equity and row[10] the integer checked against tile_limit.
# NOTE(review): confirm these against the log's header row.
with open(log_file,'r') as f:
    moveReader = csv.reader(f)
    next(moveReader)  # skip the header row
    for i,row in enumerate(moveReader):
        if i%1000000==0:
            t = time.time()
            print('Processed {} rows in {} seconds'.format(i,t-t0))
        if i<10:
            print(i,row)
        # Parse and validate the whole row BEFORE touching any tally, so a
        # malformed row can never leave the totals half-updated. Only the
        # errors a bad row can cause are caught; a bare `except:` would also
        # swallow KeyboardInterrupt and genuine programming errors.
        try:
            if int(row[10]) < tile_limit:
                continue
            move_equity = float(row[9])
            move_points = int(row[5])
            is_bingo = row[7] == '7'
            subleaves = find_subleaves(row[3],
                                       max_length=maximum_superleave_length)
        except (IndexError, ValueError):
            print(i,row)
            continue
        if any(subleave not in count for subleave in subleaves):
            # The rack contains a leave that is not in the precomputed table.
            print(i,row)
            continue

        total_equity += move_equity
        total_points += move_points
        row_count += 1
        for subleave in subleaves:
            bingo_count[subleave] += is_bingo
            count[subleave] += 1
            equity[subleave] += move_equity
            points[subleave] += move_points
t1 = time.time()
print('{} seconds to populate dictionaries'.format(t1-t0))
In [10]:
# One row per leave, one column per tally dictionary.
tallies = (('points', points),
           ('equity', equity),
           ('count', count),
           ('bingo_count', bingo_count))
ev_df = pd.concat([pd.Series(values, name=label) for label, values in tallies],
                  axis=1)
In [11]:
# Average score and equity over all tallied moves; these are the baselines
# subtracted from the per-leave means.
mean_score, mean_equity = total_points/row_count, total_equity/row_count
In [12]:
# Per-leave averages. A leave that was never observed has count 0, so its
# ratios come out as NaN.
ev_df['mean_score'] = ev_df['points']/ev_df['count']
ev_df['mean_equity'] = ev_df['equity']/ev_df['count']
ev_df['bingo pct'] = 100*ev_df['bingo_count']/ev_df['count']
# Share of tallied moves whose rack contained this leave. The denominator is
# the number of tallied rows (row_count), not the number of distinct leaves
# (len(ev_df)), which would not yield a percentage of anything.
ev_df['pct'] = 100*ev_df['count']/row_count
# Differences from the overall means computed over all tallied moves.
ev_df['adjusted_mean_score'] = ev_df['mean_score']-mean_score
ev_df['ev'] = ev_df['mean_equity']-mean_equity
In [13]:
# Write the leave values and the full summary table, keyed by leave.
values_path = 'leave_values_' + todays_date + '.csv'
summary_path = 'leave_summary_' + todays_date + '.csv'
ev_df['ev'].to_csv(values_path, index=True)
ev_df.to_csv(summary_path, index=True)
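If a given superleave of length n is never observed in the trial games, its value is estimated from the values of its subleaves of length n-1. Three things can happen:

- If the subleaves have as many positive values as negative ones, the superleave gets the mean of the subleave values.
- If more subleaves are positive than negative, the superleave gets the largest subleave value.
- If more subleaves are negative than positive, the superleave gets the smallest subleave value.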
In [14]:
# Plain {leave: ev} dictionary copy of the 'ev' column.
ev_dict = ev_df['ev'].to_dict()
In [15]:
# Estimate a value for every leave that has none (NaN) from the values of its
# subleaves that are one tile shorter.
t0 = time.time()
for leave in all_leaves:
    if not pd.isnull(ev_dict[leave]):
        continue
    if len(leave) < 2:
        # A one-tile leave has no shorter subleave to borrow a value from
        # (the lookup would be ev_dict[''] and raise KeyError).
        continue
    subleaves = find_subleaves(leave,
                               min_length=len(leave)-1,
                               max_length=len(leave)-1,
                               duplicates_allowed=True)
    # Ignore subleaves that are themselves unknown; a single NaN would
    # otherwise poison the sign count and leave this entry unfilled.
    sub_evs = [ev_dict[subleave] for subleave in subleaves
               if not pd.isnull(ev_dict[subleave])]
    if not sub_evs:
        continue
    # Net count of positive minus negative subleave values. Written as a
    # comparison so that a value of exactly 0 counts as neutral instead of
    # dividing by zero.
    signs = sum((x > 0) - (x < 0) for x in sub_evs)
    if signs == 0:
        ev_dict[leave] = sum(sub_evs)/len(sub_evs)
    elif signs > 0:
        ev_dict[leave] = max(sub_evs)
    else:
        ev_dict[leave] = min(sub_evs)
t1 = time.time()
print('Filled in all NaN superleaves with best guesses in {} seconds'.format(t1-t0))
In [16]:
# Swap the 'ev' column for the values held in ev_dict.
filled_ev = pd.Series(ev_dict, name='ev')
ev_df = pd.concat([ev_df.drop(columns='ev'), filled_ev], axis=1)
In [17]:
# Write the leave values and the summary table to the '_filled_nulls' files.
for prefix, table in (('leave_values_', ev_df['ev']), ('leave_summary_', ev_df)):
    table.to_csv(prefix + todays_date + '_filled_nulls.csv', index=True)
In [18]:
# Display the summary table after the 'ev' column has been replaced.
ev_df
Out[18]:
In [19]:
# "Synergy" of a leave = its ev minus the sum of the evs of its individual
# tiles. One-tile leaves keep the initial value of 0.
t0 = time.time()
synergy = {x:0 for x in all_leaves}
for leave in all_leaves:
    if len(leave) <= 1:
        continue
    single_tiles = find_subleaves(leave, min_length=1, max_length=1,
                                  duplicates_allowed=True)
    tile_ev_total = sum(ev_dict[tile] for tile in single_tiles)
    synergy[leave] = ev_dict[leave] - tile_ev_total
t1 = time.time()
print('Calculated "synergy" in {} seconds'.format(t1-t0))
In [20]:
# Attach the synergy values as a column, aligned on the leave index.
# Assigning by column name keeps this cell idempotent; concatenating would
# append a second 'synergy' column every time the cell is re-run.
ev_df['synergy'] = pd.Series(synergy, name='synergy')
In [21]:
# Index.rename returns a new Index rather than modifying in place, so the
# result has to be assigned back for the name to stick.
ev_df.index = ev_df.index.rename('superleave')
ev_df.index
Out[21]:
In [22]:
# Display the summary table, which now includes the synergy column.
ev_df
Out[22]:
Save superleaves to an external file
In [23]:
# Display the leave values that are about to be written out.
ev_df['ev']
Out[23]:
In [24]:
# Write the final leave values and summary table.
# The index holds the leave strings, so it must be written out: with
# index=False the files would contain values with no leave to attach them to.
ev_df['ev'].to_csv('leave_values_' + todays_date + '.csv', index=True)
ev_df.to_csv('leave_summary_' + todays_date + '.csv', index=True)
In [ ]: