- lr = 0.1
- gamma = 0.99
- Q[state_qid, opt_qid] = Q[state_qid, opt_qid] + \
- lr * (reward + gamma * np.max(Q[new_state_qid, :]) - Q[state_qid, opt_qid])
+ Q[state_qid, opt_qid] += \
+ learning_rate * (reward
+ + discount_factor * np.max(Q[new_state_qid, :])
+ - Q[state_qid, opt_qid])