In [1]:
from collections import Counter
import pandas as pd
import numpy as np
import json
In [2]:
metadata = pd.read_csv('game-metadata.csv', dtype={'gameid': np.int64})
These warnings don't matter for the goal of this project.
In [3]:
metadata.head()
Out[3]:
In [4]:
moves = pd.read_csv('game-moves.csv') #, dtype={'moveid': np.int32, 'xnorm': np.int32, 'ynorm': np.int32})
moves = moves[~np.isnan(moves['xnorm'])]
moves['xnorm'] = moves['xnorm'].astype(int)
moves['ynorm'] = moves['ynorm'].astype(int)
moves['x'] = moves['x'].astype(int)
moves['y'] = moves['y'].astype(int)
In the block down below, maxmove defines the number of moves to use in the subsequent analysis.
In [5]:
maxmoves=8
winnerknown = ((metadata['RE'].str.get(0) == 'B').astype(bool) |
(metadata['RE'].str.get(0) == 'W').astype(bool))
moves2 = pd.merge(
metadata[(np.isnan(metadata['SZ']) | (metadata['SZ'] == 19))
& np.isreal(metadata['AB'])
& np.isnan(metadata['HA'])
& winnerknown][['gameid','SZ','RE']],
moves[moves['moveid']<=maxmoves][['gameid', 'moveid', 'player','x','y']],
on='gameid')
del moves2['gameid']
del moves2['SZ']
moves2.head()
Out[5]:
A new data 'move2' contains the result of games, moveid, player, and x, y coordinates. NOTE: the coordinates are not normalized! At this point, the dataset 'moves2' is already sorted by moveid (and game id is implicit; if moveid=1, it is a start of a new game). NOTE2: Now any game, whose result doesn't start with 'B' or 'W' are excluded at this point.
In [6]:
nodes = [Counter() for i in range(maxmoves)]
paths = [Counter() for i in range(maxmoves-1)]
seqnodes = [(0, 0) for i in range(maxmoves)]
fullpath = Counter()
playeratithmove = [Counter() for i in range(maxmoves)]
# Processing a single record at a time.
for key, row in moves2.iterrows():
result, moveid, player, x, y = row
idx = moveid - 1
nodes[idx][(x, y)] += 1
playeratithmove[idx][player] += 1
seqnodes[idx] = (x, y)
if moveid == maxmoves:
oldnode = seqnodes[0]
for i, node in enumerate(seqnodes[1:maxmoves]):
paths[i][(oldnode, node)] += 1
oldnode = node
fullpath[tuple(seqnodes)] += 1
In [7]:
playeratithmove
Out[7]:
'playeratithmove' counts who played i-th move. As clearly shown here, only one player plays at a step; odd moves are by black players, and even moves are by white players. This clearly shows that neither player passed at the early stage of a game.
In [8]:
paths[0]
Out[8]:
'path' is a list of move from i-th to (i+1)-th move for each i separately, whereas fullpaths is a list of moves from 1-st to 5-th moves.
In [9]:
numgames = len(moves2[moves2.moveid==1])
numgames
Out[9]:
In [10]:
numunique = len(fullpath)
numunique
Out[10]:
In [11]:
(numgames - numunique) / numgames * 100
Out[11]:
Up to 5-th move, there are 28677 games, out of which 4448 unique paths. The vast majority of moves are therefore represented more than once in our dataset.
Transforming the obtained results in a usable form: {nodes: [{"name": "name 1"}, {"name": "name 2"}, ... ], edges: [{"source": 1, "target": 2, "value": 3}, {"source": 2, "target": 3, "value": 4}...]}
First count all possible positions of a move at i-th move:
In [12]:
nodes2 = []
node_map = {}
idx_offset = 0
for i, counter in enumerate(nodes):
ns = [{'name': '{}'.format(n[0]),
'value': n[1],
'level': i + 1}
for n in sorted(counter.items(), key = lambda x: x[1], reverse=True)]
node_map.update({(n['level'],n['name']): idx + idx_offset for idx, n in enumerate(ns)})
idx_offset += len(ns)
nodes2.extend(ns)
In [13]:
nodes2[:5]
Out[13]:
nodes2 holds all positions per-move . The 'name' is x, y coordinate, and the 'value' is frequency/count. The 'level' represents i-th move. 'node_map' holds a mapping from the node to its index in the list; this is purely an implementation detail.
In [14]:
paths2 = []
for i, path in enumerate(paths):
level = i + 1
paths2.extend(
{'source': node_map[(level, '{}'.format(p[0][0]))],
'target': node_map[(level+1, '{}'.format(p[0][1]))],
'value': p[1]}
for p in sorted(path.items(), key = lambda x: x[1], reverse=True))
In [15]:
paths2[:5]
Out[15]:
Like 'nodes2', 'paths2' holds frquency/count of "flows". 'source' is an index of a stone at i-th move in 'nodes2'. Similarly, 'target' is an index of a stone at (i+1)-th move in 'nodes2'. Therefore, 'paths2' represents an edge/flow from i-th to (i+1)-th move.
Finally, store 'nodes2' and 'paths2' in a JSON-formatted file. This file is the only input for our Sankey diagram.
In [ ]:
with open('game-moves.json', 'w') as f:
json.dump({'nodes': nodes2, 'links': paths2}, f)
Now, format for serialization. In particular, I need children of a node in a list not in a dict.
In [16]:
# Start from an empty root node.
move_tree = {'x': 0, 'y': 0, 'children': {}, 'player': 'root', 'count': 0,
'win': {'B': 0, 'W': 0}}
for _, row in moves2.iterrows():
# current_path points to the parent of the current move.
result, moveid, player, x, y = row
if moveid == 1:
current_path = move_tree
move_tree['count'] += 1
if (x, y) not in current_path['children']:
current_path['children'][(x, y)] = {'x': x, 'y': y,
'player': player,
'children': {},
'count': 0,
'win': {'B': 0, 'W': 0}}
# current_path is updated to the current move.
current_path = current_path['children'][(x, y)]
current_path['count'] += 1
current_path['win'][result[0]] += 1
In [17]:
def modifyrecurse(path):
children = path['children'].values()
for child in children:
modifyrecurse(child)
path['children'] = list(children)
modifyrecurse(move_tree)
In [18]:
all(c['count'] == sum(c['win'].values()) for c in move_tree['children'])
Out[18]:
In [19]:
with open('game-tree-8.json', 'w') as f:
json.dump(move_tree, f)