From 8fe5c4eee79c73b0409d4e305e18b51b790e028c Mon Sep 17 00:00:00 2001 From: Alexander Schmidt Date: Mon, 2 Nov 2020 22:39:52 +0100 Subject: [PATCH] updating qtable --- src/qtable.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/qtable.py b/src/qtable.py index fb1f390..5dda444 100755 --- a/src/qtable.py +++ b/src/qtable.py @@ -6,9 +6,12 @@ import sys from game import Game +learning_rate = 0.1 +discount_factor = 1.0 + states_dim = 147456 # 2^12 * 6^2 actions_dim = 637 # 12+1 * (6+1)^2 -num_episodes = 10000000 +num_episodes = 10000000000 def find_state_qid(shutable, diced): qid = 0 @@ -40,7 +43,7 @@ def select_option(opts, qs): return (opt_qid_pair[0], opt_qid_pair[1]) return (None, None) -Q = np.zeros([states_dim, actions_dim]) +Q = np.ones([states_dim, actions_dim]) running_score = [0.0, 0.0] @@ -54,15 +57,17 @@ for i in range(num_episodes): old_score = g.get_score() g.shut(opt) g.dice() - reward = g.get_score() - old_score + reward = (g.get_score() - old_score) / 12.0 new_state_qid = find_state_qid(g.get_shutable(), g.get_diced()) - lr = 0.1 - gamma = 0.99 - Q[state_qid, opt_qid] = Q[state_qid, opt_qid] + \ - lr * (reward + gamma * np.max(Q[new_state_qid, :]) - Q[state_qid, opt_qid]) + Q[state_qid, opt_qid] += \ + learning_rate * (reward + + discount_factor * np.max(Q[new_state_qid, :]) + - Q[state_qid, opt_qid]) state_qid = new_state_qid - running_score[0] *= 0.999 + else: + Q[state_qid, opt_qid] = 0 + running_score[0] *= 0.99999999 running_score[0] += g.get_score() - running_score[1] *= 0.999 + running_score[1] *= 0.99999999 running_score[1] += 1.0 print( "%d: %f" % (i, running_score[0]/running_score[1]) ) -- 2.39.5