In [1]:

    
from collections import Counter
import pandas as pd
import numpy as np
import json

Preparing data for Sankey and Radial-Game-Tree

Reading two data files



In [2]:

    
metadata = pd.read_csv('game-metadata.csv', dtype={'gameid': np.int64})









    



/usr/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (14,15,21,22,26,28,31,34,35,36,37,38,40,41) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

These warnings don't matter for the goal of this project.



In [3]:

    
metadata.head()









    Out[3]:






  
    
      
      gameid
      EV
      RO
      PB
      BR
      PW
      WR
      KM
      RE
      DT
      ...
      BL
      WL
      GN
      PX
      PY
      OH
      PBX
      MN
      CP
      LC
    
  
  
    
      0
      1
      1st All Japan #1
      Round 1
      Sakata Eio
      9p
      Kubouchi Shuchi
      9p
      5.5
      B+R
      1968-11-28
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      2
      1st All Japan #1
      Round 3
      Fujisawa Shuko
      9p
      Hashimoto Shoji
      9p
      4.5
      B+3.5
      1969-08-14
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      3
      1st All Japan #1
      Challenger's Final
      Fujisawa Shuko
      9p
      Sugiuchi Masao
      9p
      4.5
      B+8.5
      1970
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      3
      4
      1st All Japan #1
      Round 3
      Ishida Yoshio
      6p
      Sugiuchi Masao
      9p
      5.5
      W+R
      1970
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      4
      5
      1st All Japan #1
      Round 4
      Kurosawa Tadanao
      7p
      Sugiuchi Masao
      9p
      5.5
      W+R
      1970
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
  

5 rows × 42 columns



In [4]:

    
moves = pd.read_csv('game-moves.csv') #, dtype={'moveid': np.int32, 'xnorm': np.int32, 'ynorm': np.int32})
moves = moves[~np.isnan(moves['xnorm'])]
moves['xnorm'] = moves['xnorm'].astype(int)
moves['ynorm'] = moves['ynorm'].astype(int)
moves['x'] = moves['x'].astype(int)
moves['y'] = moves['y'].astype(int)

Filtering out bad data

In the block down below, maxmove defines the number of moves to use in the subsequent analysis.



In [5]:

    
maxmoves=8
winnerknown = ((metadata['RE'].str.get(0) == 'B').astype(bool) | 
               (metadata['RE'].str.get(0) == 'W').astype(bool))

moves2 = pd.merge(
    metadata[(np.isnan(metadata['SZ']) | (metadata['SZ'] == 19)) 
         & np.isreal(metadata['AB']) 
         & np.isnan(metadata['HA'])
         & winnerknown][['gameid','SZ','RE']],
    moves[moves['moveid']<=maxmoves][['gameid', 'moveid', 'player','x','y']],
    on='gameid')
del moves2['gameid']
del moves2['SZ']

moves2.head()

A new data 'move2' contains the result of games, moveid, player, and x, y coordinates. NOTE: the coordinates are not normalized! At this point, the dataset 'moves2' is already sorted by moveid (and game id is implicit; if moveid=1, it is a start of a new game). NOTE2: Now any game, whose result doesn't start with 'B' or 'W' are excluded at this point.

Counting the path



In [6]:

    
nodes = [Counter() for i in range(maxmoves)]
paths = [Counter() for i in range(maxmoves-1)]
seqnodes = [(0, 0) for i in range(maxmoves)]
fullpath = Counter()
playeratithmove = [Counter() for i in range(maxmoves)]

# Processing a single record at a time.
for key, row in moves2.iterrows():
    result, moveid, player, x, y = row
    idx = moveid - 1
    nodes[idx][(x, y)] += 1
    playeratithmove[idx][player] += 1
    seqnodes[idx] = (x, y)
    if moveid == maxmoves:
        oldnode = seqnodes[0]
        for i, node in enumerate(seqnodes[1:maxmoves]):
            paths[i][(oldnode, node)] += 1
            oldnode = node
        fullpath[tuple(seqnodes)] += 1



In [7]:

    
playeratithmove









    Out[7]:





[Counter({'B': 28638}),
 Counter({'W': 28638}),
 Counter({'B': 28638}),
 Counter({'W': 28638}),
 Counter({'B': 28638}),
 Counter({'W': 28638}),
 Counter({'B': 28638}),
 Counter({'W': 28638})]

'playeratithmove' counts who played i-th move. As clearly shown here, only one player plays at a step; odd moves are by black players, and even moves are by white players. This clearly shows that neither player passed at the early stage of a game.



In [8]:

    
paths[0]









    Out[8]:





Counter({((3, 3), (16, 4)): 1,
         ((3, 3), (16, 16)): 1,
         ((3, 4), (16, 3)): 2,
         ((3, 4), (16, 4)): 2,
         ((3, 15), (3, 3)): 1,
         ((3, 15), (3, 4)): 3,
         ((3, 15), (4, 4)): 1,
         ((3, 15), (16, 15)): 1,
         ((3, 15), (16, 16)): 2,
         ((3, 15), (16, 17)): 1,
         ((3, 15), (17, 16)): 1,
         ((3, 16), (3, 3)): 4,
         ((3, 16), (3, 4)): 35,
         ((3, 16), (4, 3)): 1,
         ((3, 16), (4, 4)): 31,
         ((3, 16), (4, 5)): 1,
         ((3, 16), (5, 3)): 6,
         ((3, 16), (5, 16)): 7,
         ((3, 16), (5, 17)): 2,
         ((3, 16), (6, 4)): 1,
         ((3, 16), (6, 16)): 2,
         ((3, 16), (15, 16)): 2,
         ((3, 16), (15, 17)): 5,
         ((3, 16), (16, 4)): 14,
         ((3, 16), (16, 6)): 1,
         ((3, 16), (16, 15)): 5,
         ((3, 16), (16, 16)): 144,
         ((3, 16), (16, 17)): 233,
         ((3, 16), (17, 3)): 8,
         ((3, 16), (17, 4)): 7,
         ((3, 16), (17, 16)): 8,
         ((3, 16), (17, 17)): 12,
         ((3, 17), (3, 4)): 1,
         ((3, 17), (16, 4)): 13,
         ((3, 17), (16, 16)): 3,
         ((3, 17), (16, 17)): 28,
         ((3, 17), (17, 3)): 2,
         ((3, 17), (17, 4)): 2,
         ((3, 17), (17, 16)): 5,
         ((4, 3), (3, 16)): 4,
         ((4, 3), (4, 16)): 2,
         ((4, 3), (16, 4)): 1,
         ((4, 3), (16, 16)): 2,
         ((4, 3), (17, 17)): 1,
         ((4, 4), (3, 16)): 3,
         ((4, 4), (4, 16)): 1,
         ((4, 15), (3, 4)): 2,
         ((4, 15), (16, 16)): 3,
         ((4, 16), (3, 4)): 6,
         ((4, 16), (3, 5)): 1,
         ((4, 16), (4, 4)): 1,
         ((4, 16), (13, 7)): 1,
         ((4, 16), (15, 4)): 1,
         ((4, 16), (15, 16)): 3,
         ((4, 16), (15, 17)): 3,
         ((4, 16), (16, 4)): 69,
         ((4, 16), (16, 5)): 1,
         ((4, 16), (16, 15)): 2,
         ((4, 16), (16, 16)): 27,
         ((4, 16), (16, 17)): 120,
         ((4, 16), (17, 3)): 31,
         ((4, 16), (17, 4)): 5,
         ((4, 16), (17, 5)): 1,
         ((4, 16), (17, 16)): 4,
         ((4, 16), (17, 17)): 4,
         ((4, 17), (3, 4)): 3,
         ((4, 17), (4, 4)): 2,
         ((4, 17), (17, 16)): 1,
         ((5, 10), (4, 16)): 1,
         ((5, 10), (16, 5)): 1,
         ((5, 15), (17, 16)): 1,
         ((5, 16), (4, 4)): 1,
         ((5, 16), (16, 17)): 1,
         ((5, 16), (17, 15)): 1,
         ((5, 17), (4, 3)): 1,
         ((5, 17), (16, 15)): 1,
         ((5, 17), (16, 17)): 1,
         ((9, 10), (16, 4)): 1,
         ((10, 10), (3, 16)): 2,
         ((10, 10), (4, 3)): 1,
         ((10, 10), (4, 4)): 7,
         ((10, 10), (4, 16)): 13,
         ((10, 10), (5, 15)): 1,
         ((10, 10), (9, 15)): 1,
         ((10, 10), (16, 4)): 2,
         ((10, 10), (17, 4)): 1,
         ((11, 7), (4, 16)): 1,
         ((11, 10), (4, 4)): 1,
         ((12, 8), (3, 15)): 1,
         ((13, 5), (16, 16)): 2,
         ((13, 5), (17, 6)): 1,
         ((13, 7), (4, 16)): 2,
         ((14, 4), (4, 16)): 2,
         ((14, 6), (4, 16)): 3,
         ((14, 6), (7, 15)): 1,
         ((14, 6), (8, 11)): 1,
         ((14, 16), (4, 4)): 1,
         ((14, 16), (16, 4)): 1,
         ((15, 3), (4, 4)): 1,
         ((15, 3), (4, 16)): 5,
         ((15, 3), (16, 16)): 4,
         ((15, 3), (17, 4)): 1,
         ((15, 4), (3, 16)): 2,
         ((15, 4), (4, 3)): 4,
         ((15, 4), (4, 4)): 3,
         ((15, 4), (4, 16)): 4,
         ((15, 4), (5, 16)): 2,
         ((15, 4), (16, 16)): 8,
         ((15, 4), (16, 17)): 1,
         ((15, 4), (17, 4)): 1,
         ((15, 5), (3, 17)): 1,
         ((15, 5), (4, 3)): 1,
         ((15, 5), (4, 4)): 4,
         ((15, 5), (4, 16)): 13,
         ((15, 5), (7, 15)): 1,
         ((15, 5), (10, 10)): 1,
         ((15, 5), (16, 15)): 1,
         ((15, 6), (7, 5)): 1,
         ((15, 7), (4, 4)): 5,
         ((15, 7), (4, 16)): 2,
         ((15, 7), (5, 3)): 1,
         ((15, 7), (5, 7)): 1,
         ((15, 7), (5, 16)): 1,
         ((15, 7), (5, 17)): 1,
         ((15, 7), (6, 4)): 1,
         ((15, 7), (15, 16)): 1,
         ((15, 9), (3, 16)): 1,
         ((15, 16), (4, 4)): 1,
         ((15, 17), (4, 3)): 1,
         ((15, 17), (16, 4)): 1,
         ((16, 3), (4, 4)): 1,
         ((16, 3), (4, 16)): 1,
         ((16, 3), (16, 6)): 1,
         ((16, 3), (16, 16)): 2,
         ((16, 3), (17, 16)): 3,
         ((16, 4), (3, 3)): 153,
         ((16, 4), (3, 4)): 60,
         ((16, 4), (3, 5)): 6,
         ((16, 4), (3, 15)): 23,
         ((16, 4), (3, 16)): 206,
         ((16, 4), (3, 17)): 464,
         ((16, 4), (4, 3)): 2998,
         ((16, 4), (4, 4)): 6589,
         ((16, 4), (4, 5)): 14,
         ((16, 4), (4, 10)): 1,
         ((16, 4), (4, 14)): 3,
         ((16, 4), (4, 15)): 17,
         ((16, 4), (4, 16)): 4263,
         ((16, 4), (4, 17)): 22,
         ((16, 4), (5, 3)): 38,
         ((16, 4), (5, 4)): 33,
         ((16, 4), (5, 16)): 4,
         ((16, 4), (5, 17)): 3,
         ((16, 4), (6, 4)): 2,
         ((16, 4), (6, 16)): 1,
         ((16, 4), (10, 10)): 2,
         ((16, 4), (14, 3)): 1,
         ((16, 4), (15, 16)): 1,
         ((16, 4), (15, 17)): 3,
         ((16, 4), (16, 15)): 1,
         ((16, 4), (16, 16)): 51,
         ((16, 4), (16, 17)): 10,
         ((16, 4), (17, 6)): 1,
         ((16, 4), (17, 15)): 6,
         ((16, 4), (17, 16)): 87,
         ((16, 4), (17, 17)): 9,
         ((16, 5), (3, 17)): 1,
         ((16, 5), (4, 3)): 1,
         ((16, 5), (4, 4)): 35,
         ((16, 5), (4, 15)): 1,
         ((16, 5), (4, 16)): 33,
         ((16, 5), (5, 4)): 1,
         ((16, 5), (6, 4)): 1,
         ((16, 5), (16, 3)): 3,
         ((16, 5), (16, 16)): 15,
         ((16, 5), (17, 15)): 1,
         ((16, 5), (17, 16)): 15,
         ((16, 6), (4, 3)): 1,
         ((16, 6), (4, 4)): 1,
         ((16, 6), (15, 4)): 1,
         ((16, 6), (16, 16)): 5,
         ((16, 6), (17, 8)): 1,
         ((16, 6), (17, 15)): 1,
         ((16, 16), (3, 3)): 1,
         ((16, 16), (3, 4)): 1,
         ((16, 16), (4, 4)): 8,
         ((16, 16), (4, 16)): 18,
         ((16, 16), (4, 17)): 6,
         ((16, 16), (16, 4)): 2,
         ((16, 16), (17, 4)): 1,
         ((16, 17), (16, 4)): 2,
         ((16, 17), (17, 4)): 7,
         ((17, 3), (3, 3)): 2,
         ((17, 3), (3, 4)): 31,
         ((17, 3), (3, 5)): 4,
         ((17, 3), (3, 15)): 1,
         ((17, 3), (3, 16)): 13,
         ((17, 3), (3, 17)): 18,
         ((17, 3), (4, 3)): 117,
         ((17, 3), (4, 4)): 27,
         ((17, 3), (4, 5)): 1,
         ((17, 3), (4, 15)): 1,
         ((17, 3), (4, 16)): 105,
         ((17, 3), (4, 17)): 1,
         ((17, 3), (5, 17)): 1,
         ((17, 3), (15, 17)): 4,
         ((17, 3), (16, 4)): 4,
         ((17, 3), (16, 15)): 1,
         ((17, 3), (16, 16)): 1,
         ((17, 3), (16, 17)): 5,
         ((17, 3), (17, 16)): 5,
         ((17, 4), (3, 3)): 330,
         ((17, 4), (3, 4)): 81,
         ((17, 4), (3, 5)): 11,
         ((17, 4), (3, 15)): 6,
         ((17, 4), (3, 16)): 316,
         ((17, 4), (3, 17)): 179,
         ((17, 4), (4, 3)): 3550,
         ((17, 4), (4, 4)): 3431,
         ((17, 4), (4, 5)): 31,
         ((17, 4), (4, 14)): 2,
         ((17, 4), (4, 15)): 4,
         ((17, 4), (4, 16)): 1818,
         ((17, 4), (4, 17)): 4,
         ((17, 4), (5, 3)): 30,
         ((17, 4), (5, 4)): 18,
         ((17, 4), (5, 16)): 1,
         ((17, 4), (5, 17)): 4,
         ((17, 4), (6, 3)): 1,
         ((17, 4), (6, 4)): 3,
         ((17, 4), (7, 10)): 1,
         ((17, 4), (11, 9)): 1,
         ((17, 4), (14, 4)): 32,
         ((17, 4), (15, 3)): 38,
         ((17, 4), (15, 4)): 105,
         ((17, 4), (15, 16)): 6,
         ((17, 4), (15, 17)): 23,
         ((17, 4), (16, 14)): 1,
         ((17, 4), (16, 16)): 1102,
         ((17, 4), (16, 17)): 41,
         ((17, 4), (17, 15)): 6,
         ((17, 4), (17, 16)): 318,
         ((17, 4), (17, 17)): 101,
         ((17, 5), (3, 3)): 1,
         ((17, 5), (3, 4)): 10,
         ((17, 5), (3, 15)): 1,
         ((17, 5), (3, 16)): 1,
         ((17, 5), (3, 17)): 4,
         ((17, 5), (4, 3)): 7,
         ((17, 5), (4, 4)): 127,
         ((17, 5), (4, 5)): 2,
         ((17, 5), (4, 16)): 85,
         ((17, 5), (4, 17)): 1,
         ((17, 5), (5, 3)): 1,
         ((17, 5), (5, 4)): 2,
         ((17, 5), (15, 4)): 1,
         ((17, 5), (15, 16)): 3,
         ((17, 5), (15, 17)): 2,
         ((17, 5), (16, 3)): 8,
         ((17, 5), (16, 14)): 1,
         ((17, 5), (16, 15)): 3,
         ((17, 5), (16, 16)): 98,
         ((17, 5), (16, 17)): 1,
         ((17, 5), (17, 16)): 28,
         ((17, 5), (17, 17)): 6,
         ((17, 15), (17, 4)): 1,
         ((17, 16), (3, 17)): 1,
         ((17, 16), (4, 4)): 4,
         ((17, 16), (4, 16)): 9,
         ((17, 16), (4, 17)): 10,
         ((17, 16), (16, 4)): 4,
         ((17, 17), (4, 17)): 1})

'path' is a list of move from i-th to (i+1)-th move for each i separately, whereas fullpaths is a list of moves from 1-st to 5-th moves.



In [9]:

    
numgames = len(moves2[moves2.moveid==1])
numgames









    Out[9]:





28638



In [10]:

    
numunique = len(fullpath)
numunique









    Out[10]:





15249



In [11]:

    
(numgames - numunique) / numgames * 100









    Out[11]:





46.75256652000838

Up to 5-th move, there are 28677 games, out of which 4448 unique paths. The vast majority of moves are therefore represented more than once in our dataset.

Transforming the obtained results in a usable form: {nodes: [{"name": "name 1"}, {"name": "name 2"}, ... ], edges: [{"source": 1, "target": 2, "value": 3}, {"source": 2, "target": 3, "value": 4}...]}

Format for Sankey

First count all possible positions of a move at i-th move:



In [12]:

    
nodes2 = []
node_map = {}
idx_offset = 0
for i, counter in enumerate(nodes):
    ns = [{'name': '{}'.format(n[0]),
           'value': n[1],
           'level': i + 1} 
          for n in sorted(counter.items(), key = lambda x: x[1], reverse=True)]
    node_map.update({(n['level'],n['name']): idx + idx_offset for idx, n in enumerate(ns)})
    idx_offset += len(ns)
    nodes2.extend(ns)



In [13]:

    
nodes2[:5]









    Out[13]:





[{'level': 1, 'name': '(16, 4)', 'value': 15072},
 {'level': 1, 'name': '(17, 4)', 'value': 11595},
 {'level': 1, 'name': '(3, 16)', 'value': 529},
 {'level': 1, 'name': '(17, 5)', 'value': 393},
 {'level': 1, 'name': '(17, 3)', 'value': 342}]

nodes2 holds all positions per-move . The 'name' is x, y coordinate, and the 'value' is frequency/count. The 'level' represents i-th move. 'node_map' holds a mapping from the node to its index in the list; this is purely an implementation detail.



In [14]:

    
paths2 = []
for i, path in enumerate(paths):
    level = i + 1
    paths2.extend(
        {'source': node_map[(level, '{}'.format(p[0][0]))],
         'target': node_map[(level+1, '{}'.format(p[0][1]))],
         'value': p[1]}
        for p in sorted(path.items(), key = lambda x: x[1], reverse=True))



In [15]:

    
paths2[:5]









    Out[15]:





[{'source': 0, 'target': 44, 'value': 6589},
 {'source': 0, 'target': 46, 'value': 4263},
 {'source': 1, 'target': 45, 'value': 3550},
 {'source': 1, 'target': 44, 'value': 3431},
 {'source': 0, 'target': 45, 'value': 2998}]

Like 'nodes2', 'paths2' holds frquency/count of "flows". 'source' is an index of a stone at i-th move in 'nodes2'. Similarly, 'target' is an index of a stone at (i+1)-th move in 'nodes2'. Therefore, 'paths2' represents an edge/flow from i-th to (i+1)-th move.

Finally, store 'nodes2' and 'paths2' in a JSON-formatted file. This file is the only input for our Sankey diagram.



In [ ]:

    
with open('game-moves.json', 'w') as f:
    json.dump({'nodes': nodes2, 'links': paths2}, f)

Formatting for Radial-Game-Tree

Now, format for serialization. In particular, I need children of a node in a list not in a dict.



In [16]:

    
# Start from an empty root node.
move_tree = {'x': 0, 'y': 0, 'children': {}, 'player': 'root', 'count': 0,
             'win': {'B': 0, 'W': 0}} 
for _, row in moves2.iterrows():
    # current_path points to the parent of the current move.
    result, moveid, player, x, y = row
    if moveid == 1:
        current_path = move_tree
        move_tree['count'] += 1
    
    if (x, y) not in current_path['children']:
        current_path['children'][(x, y)] = {'x': x, 'y': y, 
                                            'player': player, 
                                            'children': {}, 
                                            'count': 0,
                                            'win': {'B': 0, 'W': 0}}
    # current_path is updated to the current move.
    current_path = current_path['children'][(x, y)]
    current_path['count'] += 1
    current_path['win'][result[0]] += 1



In [17]:

    
def modifyrecurse(path):
    children = path['children'].values()
    for child in children:
        modifyrecurse(child)
    path['children'] = list(children)
modifyrecurse(move_tree)



In [18]:

    
all(c['count'] == sum(c['win'].values()) for c in move_tree['children'])









    Out[18]:





True



In [19]:

    
with open('game-tree-8.json', 'w') as f:
    json.dump(move_tree, f)

	gameid	EV	RO	PB	BR	PW	WR	KM	RE	DT	...	BL	WL	GN	PX	PY	OH	PBX	MN	CP	LC
0	1	1st All Japan #1	Round 1	Sakata Eio	9p	Kubouchi Shuchi	9p	5.5	B+R	1968-11-28	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	2	1st All Japan #1	Round 3	Fujisawa Shuko	9p	Hashimoto Shoji	9p	4.5	B+3.5	1969-08-14	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	3	1st All Japan #1	Challenger's Final	Fujisawa Shuko	9p	Sugiuchi Masao	9p	4.5	B+8.5	1970	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	4	1st All Japan #1	Round 3	Ishida Yoshio	6p	Sugiuchi Masao	9p	5.5	W+R	1970	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	5	1st All Japan #1	Round 4	Kurosawa Tadanao	7p	Sugiuchi Masao	9p	5.5	W+R	1970	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN