RPS

Read data


In [1]:
import sqlite3 as sqlt

Play 100 games, then copy db/game.db to ml
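
The copy can also be done from Python; a minimal sketch, assuming the layout from the note above and that the file is renamed to train.db, which is what the next cell opens (adjust the paths to your setup):

In [ ]:
import shutil
# Assumed paths: the engine's game log from the note above, renamed to the
# file the next cell opens. Adjust to your directory layout.
shutil.copy('db/game.db', 'ml/train.db')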


In [2]:
conn = sqlt.connect("train.db")
c = conn.cursor()
c.execute("SELECT * FROM rps;")
train = c.fetchall()[1:]  # drop the first row
len(train)


Out[2]:
101
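
A quick look at the table layout and the first few rows can confirm the copy worked before wiring up the bandit; PRAGMA table_info is standard SQLite, and the rps table name matches the query above:

In [ ]:
# Inspect the rps table: column definitions, then the first few recorded games.
for col in c.execute("PRAGMA table_info(rps);").fetchall():
    print(col)
for row in c.execute("SELECT * FROM rps LIMIT 5;").fetchall():
    print(row)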

Three-armed bandit


In [3]:
from urllib import urlencode, urlopen  # Python 2: both live in urllib
import json
import random

arms = 3

# Replays the recorded player1 moves and scores the agent's chosen arm via the engine.
class Bandit:

    def __init__(self, series):
        self.series = series      # recorded games, player1's move in column 0
        self.counter = 0          # current position in the series
        self.cycle = len(series)  # wrap around when the series is exhausted

    def cheat(self):
        # Peek at player1's upcoming move.
        return self.series[self.counter % self.cycle][0]

    def dumb(self, i):
        # Random baseline: pick any of the arms (the i argument is unused).
        return random.randint(0, arms - 1)

    def pull(self, arm):
        # Ask the engine to score player1's recorded move against the chosen arm.
        p1 = self.series[self.counter % self.cycle][0]
        p2 = arm
        query_args = {'player1': p1, 'player2': p2}
        data = urlencode(query_args)
        result = int(json.loads(urlopen('http://engine:5000/rps', data).read())['winner'])
        self.counter += 1
        if result == 1: return -1  # player1 (the recording) wins
        if result == 2: return 1   # player2 (the agent) wins
        return 0                   # draw
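
The reward mapping in pull() is -1 when the recorded player1 wins, +1 when the agent wins, and 0 for a draw; since cheat() just replays player1's upcoming move, pulling it should always produce a draw (the 0 seen two cells below). For experimenting without the HTTP engine, an offline stand-in might look like the sketch below. The 0–2 integer move encoding and the cyclic win rule are assumptions about the engine, not taken from it; only the reward mapping mirrors pull():

In [ ]:
# Offline stand-in for Bandit.pull, usable when the engine container is not running.
# Assumes moves are stored as integers 0-2 and that each move beats the previous
# one cyclically -- both assumptions about the engine.
def offline_pull(bandit, arm):
    p1 = bandit.series[bandit.counter % bandit.cycle][0]
    bandit.counter += 1
    if p1 == arm:
        return 0             # same move: draw
    if (arm - p1) % 3 == 1:  # assumed cyclic win rule
        return 1             # agent (player2) wins
    return -1                # recorded player1 wins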

In [4]:
bandit = Bandit(train)

In [6]:
bandit.pull(bandit.cheat())


Out[6]:
0

Agent


In [8]:
import tensorflow as tf
tf.reset_default_graph()

# Policy network: one weight per arm, greedy action is the argmax.
weights = tf.Variable(tf.ones([arms]))
chosen_action = tf.argmax(weights, 0)

# Policy-gradient style update applied only to the weight of the chosen arm.
reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
responsible_weight = tf.slice(weights, action_holder, [1])
loss = -(tf.log(responsible_weight) * reward_holder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.05)
update = optimizer.minimize(loss)
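
The loss -log(w_a)·r has gradient -r/w_a with respect to the chosen weight, so one gradient-descent step is w_a ← w_a + lr·r/w_a. A quick NumPy check of that update rule (manual_update is just an illustrative helper, not part of the agent) reproduces the first logged step below, where a reward of 1 moves the third weight from 1.0 to 1.05:

In [ ]:
import numpy as np

# Manual check of the update implied by the loss above:
# d/dw [-log(w) * r] = -r / w, so gradient descent adds lr * r / w.
def manual_update(w, action, reward, lr=0.05):
    w = w.copy()
    w[action] += lr * reward / w[action]
    return w

print(manual_update(np.ones(3), action=2, reward=1))  # -> roughly [1., 1., 1.05]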

Training


In [42]:
import numpy as np
import random

epochs = 100
total_reward = np.zeros([arms])
e = 0.7  # exploration probability (epsilon-greedy)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < epochs:

        # Epsilon-greedy: explore a random arm, otherwise exploit argmax(weights).
        if np.random.rand(1) < e:
            action = random.randint(1, arms) - 1
        else:
            action = sess.run(chosen_action)

        # Play the arm against the recorded series and apply the gradient update.
        reward = bandit.pull(action)
        _, resp, ww = sess.run([update, responsible_weight, weights],
                               feed_dict={reward_holder: [reward], action_holder: [action]})

        total_reward[action] += reward
        print 'Reward %2d Weights: %s Stats: %s' % (reward, str(weights.eval()), str(total_reward))
        i += 1


Reward  1 Weights: [ 1.          1.          1.04999995] Stats: [ 0.  0.  1.]
Reward  0 Weights: [ 1.          1.          1.04999995] Stats: [ 0.  0.  1.]
Reward  0 Weights: [ 1.          1.          1.04999995] Stats: [ 0.  0.  1.]
Reward -1 Weights: [ 1.          0.94999999  1.04999995] Stats: [ 0. -1.  1.]
Reward  0 Weights: [ 1.          0.94999999  1.04999995] Stats: [ 0. -1.  1.]
Reward  0 Weights: [ 1.          0.94999999  1.04999995] Stats: [ 0. -1.  1.]
Reward -1 Weights: [ 0.94999999  0.94999999  1.04999995] Stats: [-1. -1.  1.]
Reward -1 Weights: [ 0.94999999  0.89736843  1.04999995] Stats: [-1. -2.  1.]
Reward  0 Weights: [ 0.94999999  0.89736843  1.04999995] Stats: [-1. -2.  1.]
Reward  0 Weights: [ 0.94999999  0.89736843  1.04999995] Stats: [-1. -2.  1.]
Reward -1 Weights: [ 0.89736843  0.89736843  1.04999995] Stats: [-2. -2.  1.]
Reward -1 Weights: [ 0.89736843  0.84164995  1.04999995] Stats: [-2. -3.  1.]
Reward  0 Weights: [ 0.89736843  0.84164995  1.04999995] Stats: [-2. -3.  1.]
Reward  0 Weights: [ 0.89736843  0.84164995  1.04999995] Stats: [-2. -3.  1.]
Reward  0 Weights: [ 0.89736843  0.84164995  1.04999995] Stats: [-2. -3.  1.]
Reward  0 Weights: [ 0.89736843  0.84164995  1.04999995] Stats: [-2. -3.  1.]
Reward -1 Weights: [ 0.89736843  0.78224283  1.04999995] Stats: [-2. -4.  1.]
Reward -1 Weights: [ 0.89736843  0.78224283  1.00238085] Stats: [-2. -4.  0.]
Reward  0 Weights: [ 0.89736843  0.78224283  1.00238085] Stats: [-2. -4.  0.]
Reward  1 Weights: [ 0.89736843  0.78224283  1.05226207] Stats: [-2. -4.  1.]
Reward -1 Weights: [ 0.84164995  0.78224283  1.05226207] Stats: [-3. -4.  1.]
Reward  0 Weights: [ 0.84164995  0.78224283  1.05226207] Stats: [-3. -4.  1.]
Reward -1 Weights: [ 0.84164995  0.78224283  1.00474536] Stats: [-3. -4.  0.]
Reward  1 Weights: [ 0.84164995  0.8461616   1.00474536] Stats: [-3. -3.  0.]
Reward -1 Weights: [ 0.84164995  0.78707123  1.00474536] Stats: [-3. -4.  0.]
Reward  1 Weights: [ 0.84164995  0.78707123  1.05450916] Stats: [-3. -4.  1.]
Reward -1 Weights: [ 0.84164995  0.78707123  1.00709379] Stats: [-3. -4.  0.]
Reward -1 Weights: [ 0.84164995  0.7235446   1.00709379] Stats: [-3. -5.  0.]
Reward -1 Weights: [ 0.78224283  0.7235446   1.00709379] Stats: [-4. -5.  0.]
Reward  1 Weights: [ 0.78224283  0.7235446   1.0567416 ] Stats: [-4. -5.  1.]
Reward -1 Weights: [ 0.78224283  0.7235446   1.00942636] Stats: [-4. -5.  0.]
Reward  0 Weights: [ 0.78224283  0.7235446   1.00942636] Stats: [-4. -5.  0.]
Reward  0 Weights: [ 0.78224283  0.7235446   1.00942636] Stats: [-4. -5.  0.]
Reward -1 Weights: [ 0.78224283  0.65444034  1.00942636] Stats: [-4. -6.  0.]
Reward  0 Weights: [ 0.78224283  0.65444034  1.00942636] Stats: [-4. -6.  0.]
Reward -1 Weights: [ 0.78224283  0.65444034  0.95989329] Stats: [-4. -6. -1.]
Reward  1 Weights: [ 0.78224283  0.65444034  1.01198244] Stats: [-4. -6.  0.]
Reward  0 Weights: [ 0.78224283  0.65444034  1.01198244] Stats: [-4. -6.  0.]
Reward  1 Weights: [ 0.78224283  0.73084152  1.01198244] Stats: [-4. -5.  0.]
Reward  0 Weights: [ 0.78224283  0.73084152  1.01198244] Stats: [-4. -5.  0.]
Reward  0 Weights: [ 0.78224283  0.73084152  1.01198244] Stats: [-4. -5.  0.]
Reward -1 Weights: [ 0.71832407  0.73084152  1.01198244] Stats: [-5. -5.  0.]
Reward -1 Weights: [ 0.71832407  0.73084152  0.96257448] Stats: [-5. -5. -1.]
Reward  0 Weights: [ 0.71832407  0.73084152  0.96257448] Stats: [-5. -5. -1.]
Reward -1 Weights: [ 0.71832407  0.66242725  0.96257448] Stats: [-5. -6. -1.]
Reward -1 Weights: [ 0.71832407  0.58694726  0.96257448] Stats: [-5. -7. -1.]
Reward  1 Weights: [ 0.71832407  0.58694726  1.0145185 ] Stats: [-5. -7.  0.]
Reward -1 Weights: [ 0.71832407  0.50176072  1.0145185 ] Stats: [-5. -8.  0.]
Reward  0 Weights: [ 0.71832407  0.50176072  1.0145185 ] Stats: [-5. -8.  0.]
Reward -1 Weights: [ 0.71832407  0.50176072  0.96523404] Stats: [-5. -8. -1.]
Reward  1 Weights: [ 0.71832407  0.60140979  0.96523404] Stats: [-5. -7. -1.]
Reward -1 Weights: [ 0.71832407  0.60140979  0.91343313] Stats: [-5. -7. -2.]
Reward  0 Weights: [ 0.71832407  0.60140979  0.91343313] Stats: [-5. -7. -2.]
Reward  0 Weights: [ 0.71832407  0.60140979  0.91343313] Stats: [-5. -7. -2.]
Reward -1 Weights: [ 0.64871758  0.60140979  0.91343313] Stats: [-6. -7. -2.]
Reward  0 Weights: [ 0.64871758  0.60140979  0.91343313] Stats: [-6. -7. -2.]
Reward  1 Weights: [ 0.64871758  0.60140979  0.96817166] Stats: [-6. -7. -1.]
Reward  0 Weights: [ 0.64871758  0.60140979  0.96817166] Stats: [-6. -7. -1.]
Reward  0 Weights: [ 0.64871758  0.60140979  0.96817166] Stats: [-6. -7. -1.]
Reward -1 Weights: [ 0.64871758  0.60140979  0.91652793] Stats: [-6. -7. -2.]
Reward  0 Weights: [ 0.64871758  0.60140979  0.91652793] Stats: [-6. -7. -2.]
Reward -1 Weights: [ 0.64871758  0.5182718   0.91652793] Stats: [-6. -8. -2.]
Reward  0 Weights: [ 0.64871758  0.5182718   0.91652793] Stats: [-6. -8. -2.]
Reward  1 Weights: [ 0.64871758  0.5182718   0.97108161] Stats: [-6. -8. -1.]
Reward  0 Weights: [ 0.64871758  0.5182718   0.97108161] Stats: [-6. -8. -1.]
Reward -1 Weights: [ 0.57164246  0.5182718   0.97108161] Stats: [-7. -8. -1.]
Reward  1 Weights: [ 0.57164246  0.5182718   1.02257061] Stats: [-7. -8.  0.]
Reward  0 Weights: [ 0.57164246  0.5182718   1.02257061] Stats: [-7. -8.  0.]
Reward  0 Weights: [ 0.57164246  0.5182718   1.02257061] Stats: [-7. -8.  0.]
Reward  0 Weights: [ 0.57164246  0.5182718   1.02257061] Stats: [-7. -8.  0.]
Reward  0 Weights: [ 0.57164246  0.5182718   1.02257061] Stats: [-7. -8.  0.]
Reward -1 Weights: [ 0.48417521  0.5182718   1.02257061] Stats: [-8. -8.  0.]
Reward -1 Weights: [ 0.48417521  0.5182718   0.97367424] Stats: [-8. -8. -1.]
Reward  1 Weights: [ 0.48417521  0.5182718   1.02502608] Stats: [-8. -8.  0.]
Reward -1 Weights: [ 0.48417521  0.5182718   0.97624683] Stats: [-8. -8. -1.]
Reward -1 Weights: [ 0.48417521  0.5182718   0.92503029] Stats: [-8. -8. -2.]
Reward -1 Weights: [ 0.48417521  0.5182718   0.870978  ] Stats: [-8. -8. -3.]
Reward  0 Weights: [ 0.48417521  0.5182718   0.870978  ] Stats: [-8. -8. -3.]
Reward -1 Weights: [ 0.48417521  0.42179734  0.870978  ] Stats: [-8. -9. -3.]
Reward -1 Weights: [ 0.48417521  0.42179734  0.81357127] Stats: [-8. -9. -4.]
Reward  0 Weights: [ 0.48417521  0.42179734  0.81357127] Stats: [-8. -9. -4.]
Reward  0 Weights: [ 0.48417521  0.42179734  0.81357127] Stats: [-8. -9. -4.]
Reward  1 Weights: [ 0.48417521  0.42179734  0.87502873] Stats: [-8. -9. -3.]
Reward  1 Weights: [ 0.48417521  0.42179734  0.93216974] Stats: [-8. -9. -2.]
Reward  0 Weights: [ 0.48417521  0.42179734  0.93216974] Stats: [-8. -9. -2.]
Reward  0 Weights: [ 0.48417521  0.42179734  0.93216974] Stats: [-8. -9. -2.]
Reward -1 Weights: [ 0.38090682  0.42179734  0.93216974] Stats: [-9. -9. -2.]
Reward  0 Weights: [ 0.38090682  0.42179734  0.93216974] Stats: [-9. -9. -2.]
Reward  1 Weights: [ 0.38090682  0.54033768  0.93216974] Stats: [-9. -8. -2.]
Reward  0 Weights: [ 0.38090682  0.54033768  0.93216974] Stats: [-9. -8. -2.]
Reward  0 Weights: [ 0.38090682  0.54033768  0.93216974] Stats: [-9. -8. -2.]
Reward -1 Weights: [ 0.24964112  0.54033768  0.93216974] Stats: [-10.  -8.  -2.]
Reward  0 Weights: [ 0.24964112  0.54033768  0.93216974] Stats: [-10.  -8.  -2.]
Reward  0 Weights: [ 0.24964112  0.54033768  0.93216974] Stats: [-10.  -8.  -2.]
Reward  1 Weights: [ 0.24964112  0.6328724   0.93216974] Stats: [-10.  -7.  -2.]
Reward  1 Weights: [ 0.24964112  0.71187729  0.93216974] Stats: [-10.  -6.  -2.]
Reward -1 Weights: [ 0.24964112  0.71187729  0.87853146] Stats: [-10.  -6.  -3.]
Reward -1 Weights: [ 0.24964112  0.71187729  0.82161832] Stats: [-10.  -6.  -4.]
Reward  0 Weights: [ 0.24964112  0.71187729  0.82161832] Stats: [-10.  -6.  -4.]
Reward  0 Weights: [ 0.24964112  0.71187729  0.82161832] Stats: [-10.  -6.  -4.]
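
With training done, the learned weights left in ww can be played greedily (no exploration) against the same bandit; a short sketch, assuming the engine endpoint used by Bandit.pull is still reachable:

In [ ]:
# Greedy evaluation: always play the arm with the largest learned weight.
# Uses ww (the last weights fetched in the training loop) and the same bandit.
greedy_arm = int(np.argmax(ww))
rewards = [bandit.pull(greedy_arm) for _ in range(20)]
print('greedy arm: %d, mean reward: %.2f' % (greedy_arm, float(sum(rewards)) / len(rewards)))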