; Copyright 1993 GTE Laboratories Incorporated ; Permission granted to copy and change for research and educational purposes ; Written by Richard S. Sutton ; General Markov Control Process with s simple illustrative version of Dyna-AHC (Dyna-PI) ; This case is a 4 X 4 grid world with start and goal states ; and a barrier making access to the goal a little difficult. ; Like this: ; ; S . . . ; S is the start state ; . . . . ; . . B . ; B is a barrier - can't be entered ; . . B G ; G is the goal state - no penalty and restart ; ; The states are numbered 0 to 15 in the obvious way, left to right ; A reward of +1 is delivered upon reaching the goal. Reward is 0 otehrwise. ; This simulation includes a simple model learner and user. ; The model learner assumes the environment is deterministic ; and simply records next states as they are found. ; ; The model user simply picks a state at random from a distribution ; reflecting how often each state has occurred before in past experience. ; It looks ahead one step and then picks another. Many model steps ; are done for each real step. ; To run an experiment call (setup) and then either (ex) or else (init) then (trials 20) (defvar n 16) ; (Maximum) number of states (defvar m 4) ; (Maximum) number of actions at a state (defvar v) ; memory for evaluation function (defvar w) ; memory for reaction function (defvar alpha 1.0) ; learning rate for reaction function (defvar beta 0.1) ; learning rate for evaluation function (defvar tau 1.0) ; randomness parameter for reaction function (defvar gamma 0.9) ; discounting parameter (defvar x) ; For a single transition, x is start state (defvar y) ; y is result state (defvar a) ; a is action (defvar r) ; r is reward (defvar e) ; e is Eval(x) (defvar e-prime) ; e is better estimate using r and Eval(y) (defvar rhat) ; e-prime - e (defvar model-next-state) ; Model memory: next-state function (defvar model-reward) ; and reward function (defvar times-visited) ; Vector of how many times each state has been visited (defvar total-visits) ; How many times all states together have been visited (defvar num-model-steps 10) ;number of steps to do with model for each real step (defvar current-state) (defvar start-state 0) (defvar goal-state 15) (defvar real-next-state) (defvar height 4) (defvar width 4) (defun setup () (setq v (make-array n)) (setq w (make-array (list n m))) (setq model-next-state (make-array (list n m))) (setq model-reward (make-array (list n m))) (setq times-visited (make-array n)) (setq real-next-state (make-array (list n m))) (make-real-next-state real-next-state) ) (defun init () (setq total-visits 0) (loop for i from 0 below n do (setf (aref v i) 0.0) (setf (aref times-visited i) 0) (loop for j from 0 below m do (setf (aref w i j) 0.0) (setf (aref model-reward i j) nil) (setf (aref model-next-state i j) nil) ))) (defun learn () (setq e (aref v x)) (setq e-prime (+ r (* gamma (aref v y)))) (incf (aref v x) (* beta (- e-prime e))) (incf (aref w x a) (* alpha (- e-prime e)))) (defun learn-model () (setf (aref model-next-state x a) y) (setf (aref model-reward x a) r)) (defun pick-from-visited-states () (loop with random = (random total-visits) for i from 0 below n summing (aref times-visited i) into sum until (> sum random) finally (return i))) (defun trial () "Runs a single trial, from start to goal states" (setq current-state start-state) (loop while (not (= current-state goal-state)) with time = 0 do (incf (aref times-visited current-state)) (incf total-visits) (loop repeat num-model-steps do (setq x (pick-from-visited-states)) (setq a (action w x)) (setq y (aref model-next-state x a)) (setq r (aref model-reward x a)) when y do (learn)) (setq x current-state) (setq a (action w x)) (setq y (aref real-next-state x a)) (setq r (if (= y goal-state) 1 0)) (learn) (learn-model) (setq current-state y) (incf time) finally (return time))) (defun action (w x) (loop with max = -1000000.0 with best for i from 0 below m for sum-i = (+ (aref w x i) (random-exponential tau)) when (< max sum-i) do (setq max sum-i) (setq best i) finally (return best))) (defun make-real-next-state (Q) (loop for x below n do (loop for a below m do (setf (aref Q x a) (let ((w (x-from-state x)) (h (y-from-state x))) (case a (0 (incf h) (if (>= h height) (decf h))) (1 (incf w) (if (>= w width) (decf w))) (2 (decf h) (if (< h 0) (incf h))) (3 (decf w) (if (< w 0) (incf w))) (otherwise (error "illegal action"))) (state-from-xy w h))) (if (or (= 10 (aref Q x a)) (= 14 (aref Q x a))) (setf (aref Q x a) x))))) (defun x-from-state (x) (rem x width)) (defun y-from-state (x) (floor x width)) (defun state-from-xy (w h) (+ (* h width) w)) (defun show-v () (loop for h below height do (format t "~%~2D:" h) (loop for w below width do (format t "~8,3F" (- 0.0 (aref v (state-from-xy w h))))))) (defun show-w () (loop for h below height do (format t "~%~D:" h) (loop for wi below width do (format t "~D(" wi) (loop for i below m do (format t "~,3F " (aref w (state-from-xy wi h) i)))))) (defun show1 (v) (loop for h below height do (format t "~%~2D:" h) (loop for w below width do (format t "~8,3F" (- 0.0 (aref v (state-from-xy w h))))))) (defun show2 (w) (loop for h below height do (format t "~%~D:" h) (loop for wi below width do (format t "~D(" wi) (loop for i below m do (format t "~,3F " (aref w (state-from-xy wi h) i)))))) (defun trials (num-trials) (loop repeat num-trials collect (trial))) (defvar data) (defun ex () (setq data nil) (loop repeat 100 do (init) (push (trials 20) data)) (multi-mean data)) (defun multi-mean (list-of-lists) (loop for list in (reorder-list-of-lists list-of-lists) collect (mean list))) (defun random-exponential (tau) (- (* tau (log (- 1 (random 1.0)))))) (defun reorder-list-of-lists (list-of-lists) (loop for n from 0 below (length (first list-of-lists)) collect (loop for list in list-of-lists collect (nth n list)))) (defun mean (l) (float (/ (loop for i in l sum i) (length l))))