diff --git a/RGBtest.jpeg b/RGBtest.jpeg new file mode 100644 index 0000000..e14d157 Binary files /dev/null and b/RGBtest.jpeg differ diff --git a/agent.py b/agent.py index 5668373..486003d 100755 --- a/agent.py +++ b/agent.py @@ -3,7 +3,7 @@ import utils # Default architectures for the lower level controller/actor -defaultNSample = 1000 +defaultNSample = 64 defaultGamma = 0.975 defaultEpsilon = 1.0 defaultControllerEpsilon = [1.0]*6 @@ -15,7 +15,12 @@ ############ defaultMetaEpsilon = 1 -defaultMetaNSamples = 200 +defaultMetaNSamples = 64 +updateFrequency = 10000 +controllerMemCap = 1000000 +maxReward = 1000 +minReward = -1000 +trueSubgoalOrder = [2, 4, 3, 5] class Agent: @@ -40,16 +45,34 @@ def selectMove(self, state, goal): return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 1)), np.asarray([goalVec])], verbose=0)) return random.choice(self.actionSet) + def setControllerEpsilon(self, epsilonArr): + self.controllerEpsilon = epsilonArr + def selectGoal(self, state): if self.metaEpsilon < random.random(): # predict action pred = self.net.metaNet.predict(np.reshape(state, (1, 84, 84, 1)), verbose=0) return np.argmax(pred) - print("Exploring") return random.choice(self.goalSet) - def criticize(self, goalReached): - return 1.0 if goalReached else 0.0 + def selectTrueGoal(self, goalNum): + return trueSubgoalOrder[goalNum] + + def setMetaEpsilon(self, epsilon): + self.metaEpsilon = epsilon + + def criticize(self, reachGoal, action, die, distanceReward): + reward = 0.0 + if action == 0: + reward -= 0.1 + if reachGoal: + reward += 50.0 + if die: + reward -= 200.0 + reward += distanceReward + reward = np.minimum(reward, maxReward) + reward = np.maximum(reward, minReward) + return reward def store(self, experience, meta=False): if meta: @@ -58,11 +81,11 @@ def store(self, experience, meta=False): self.metaMemory = self.metaMemory[-500:] else: self.memory.append(experience) - if len(self.memory) > 1000000: - self.memory = self.memory[-1000000:] + if len(self.memory) > controllerMemCap: + self.memory = self.memory[-controllerMemCap:] - def _update(self): + def _update(self, stepCount): exps = [random.choice(self.memory) for _ in range(self.nSamples)] # stateVectors = np.squeeze(np.asarray([np.concatenate([exp.state, exp.goal], axis=1) for exp in exps])) stateVector = [] @@ -85,7 +108,7 @@ def _update(self): if not exp.done: rewardVectors[i][exp.action] += self.gamma * max(nextStateRewardVectors[i]) rewardVectors = np.asarray(rewardVectors) - self.net.controllerNet.fit([stateVector, goalVector], rewardVectors, epochs = 1, verbose=1) + self.net.controllerNet.train_on_batch([stateVector, goalVector], rewardVectors) #Update target network controllerWeights = self.net.controllerNet.get_weights() @@ -94,7 +117,7 @@ def _update(self): controllerTargetWeights[i] = self.targetTau * controllerWeights[i] + (1 - self.targetTau) * controllerTargetWeights[i] self.net.targetControllerNet.set_weights(controllerTargetWeights) - def _update_meta(self): + def _update_meta(self, stepCount): if 0 < len(self.metaMemory): exps = [random.choice(self.metaMemory) for _ in range(self.metaNSamples)] stateVectors = np.asarray([exp.state for exp in exps]) @@ -107,7 +130,7 @@ def _update_meta(self): rewardVectors[i][np.argmax(exp.goal)] = exp.reward if not exp.done: rewardVectors[i][np.argmax(exp.goal)] += self.gamma * max(nextStateRewardVectors[i]) - self.net.metaNet.fit(stateVectors, rewardVectors, epochs = 1, verbose=1) + self.net.metaNet.train_on_batch(stateVectors, rewardVectors) #Update target network metaWeights = self.net.metaNet.get_weights() @@ -116,18 +139,17 @@ def _update_meta(self): metaTargetWeights[i] = self.targetTau * metaWeights[i] + (1 - self.targetTau) * metaTargetWeights[i] self.net.targetMetaNet.set_weights(metaTargetWeights) - def update(self, meta=False): + def update(self, stepCount, meta=False): if meta: - self._update_meta() + self._update_meta(stepCount) else: - self._update() + self._update(stepCount) def annealMetaEpsilon(self, stepCount): - self.metaEpsilon = defaultEndEpsilon + (defaultMetaEpsilon - defaultEndEpsilon) * \ - (defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)) / defaultAnnealSteps + self.metaEpsilon = defaultEndEpsilon + max(0, (defaultMetaEpsilon - defaultEndEpsilon) * \ + (defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)) / defaultAnnealSteps) def annealControllerEpsilon(self, stepCount, goal): - self.controllerEpsilon[goal] = defaultEndEpsilon + (defaultControllerEpsilon[goal] - defaultEndEpsilon) * \ - (defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)) / defaultAnnealSteps - + self.controllerEpsilon[goal] = defaultEndEpsilon + max(0, (defaultControllerEpsilon[goal] - defaultEndEpsilon) * \ + (defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)) / defaultAnnealSteps) diff --git a/agent.pyc b/agent.pyc new file mode 100644 index 0000000..be38aad Binary files /dev/null and b/agent.pyc differ diff --git a/ale_python_interface.pyc b/ale_python_interface.pyc new file mode 100644 index 0000000..2fb9e4d Binary files /dev/null and b/ale_python_interface.pyc differ diff --git a/de.py b/de.py new file mode 100644 index 0000000..f965359 --- /dev/null +++ b/de.py @@ -0,0 +1,65 @@ +import argparse +import sys +import time +import numpy as np +import tensorflow as tf +from collections import namedtuple +from environment import ALEEnvironment +from agent import Agent +from hdqn import Hdqn +from PIL import Image + +# Constant defined here +defaultRandomPlaySteps = 100000 +maxStepsPerEpisode = 5000 + +def str2bool(v): + return v.lower() in ("yes", "true", "t", "1") + +def main(): + # Initilization for tensor board + session = tf.Session() + tensorVar = tf.Variable(0) + tf.summary.scalar("reward", tensorVar) + sumWriterIntrinsic = tf.summary.FileWriter('./reward/intrinsic') + sumWriterExternal = tf.summary.FileWriter('./reward/external') + merged = tf.summary.merge_all() + session.run(tf.initialize_all_variables()) + + actionMap = [0, 1, 2, 3, 4, 5, 11, 12] + actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down', 'jump right', 'jump left'] + goalExplain = ['top left door', 'top right door', 'middle ladder', 'lower left ladder', 'lower right ladder', 'key'] + stepCount = 0 + parser = argparse.ArgumentParser() + parser.add_argument("--game", default="montezuma_revenge.bin") + parser.add_argument("--display_screen", type=str2bool, default=False) + parser.add_argument("--frame_skip", default=4) + #parser.add_argument("--repeat_action_probability", default=0.25) + parser.add_argument("--color_averaging", default=False) + parser.add_argument("--random_seed") + #parser.add_argument("--record_screen_path", default="./record") + #parser.add_argument("--record_sound_filename") + parser.add_argument("--minimal_action_set", default=False) + parser.add_argument("--screen_width", default=84) + parser.add_argument("--screen_height", default=84) + args = parser.parse_args() + ActorExperience = namedtuple("ActorExperience", ["state", "goal", "action", "reward", "next_state", "done"]) + MetaExperience = namedtuple("MetaExperience", ["state", "goal", "reward", "next_state", "done"]) + env = ALEEnvironment(args.game, args) + hdqn = Hdqn() + agent = Agent(hdqn, range(8), range(6)) + # set goalNum to hardcoded subgoal + goalNum = 0 + intrinsicRewardMonitor = 0 + externalRewardMonitor = 0 + env.actWrapper(5) + x = env.getScreenRGB() + im = Image.fromarray(x) + im.save("RGBtest.jpeg") + # for i in range(100): + # env.act(0) + # print(env.isTerminal()) + print(env.isTerminal()) + +if __name__ == "__main__": + main() diff --git a/environment.py b/environment.py index 36557b5..cf83408 100755 --- a/environment.py +++ b/environment.py @@ -13,7 +13,7 @@ class ALEEnvironment(): def __init__(self, rom_file, args): self.ale = ALEInterface() - ''' + if args.display_screen: if sys.platform == 'darwin': import pygame @@ -22,8 +22,7 @@ def __init__(self, rom_file, args): elif sys.platform.startswith('linux'): self.ale.setBool('sound', True) self.ale.setBool('display_screen', True) - self.ale.setBool('display_screen', True) - ''' + self.ale.setInt('frame_skip', args.frame_skip) #self.ale.setFloat('repeat_action_probability', args.repeat_action_probability) self.ale.setBool('color_averaging', args.color_averaging) @@ -60,12 +59,20 @@ def __init__(self, rom_file, args): self.life_lost = False self.initSrcreen = self.getScreen() self.goalSet = [] - self.goalSet.append([[8, 21], [16, 36]]) # top left door - self.goalSet.append([[69, 21], [77, 36]]) # top right door - self.goalSet.append([[37, 40], [47, 53]]) # middle ladder - self.goalSet.append([[8, 57], [19, 72]]) # lower left ladder - self.goalSet.append([[66,57], [76, 72]]) # lower right ladder - self.goalSet.append([[6, 39], [12, 47]]) # key + self.goalSet.append([[8, 21], [16, 36]]) # top left door 0 + self.goalSet.append([[69, 21], [77, 36]]) # top right door 1 + self.goalSet.append([[37, 40], [47, 53]]) # middle ladder 2 + self.goalSet.append([[8, 57], [19, 72]]) # lower left ladder 3 + self.goalSet.append([[66,57], [76, 72]]) # lower right ladder 4 + self.goalSet.append([[6, 39], [12, 47]]) # key 5 + self.goalCenterLoc = [] + self.goalCenterLoc.append([(8.0 + 16.0)/2, (21.0 + 36.0)/2]) + self.goalCenterLoc.append([(69.0 + 77.0)/2, (21.0+36.0)/2]) + self.goalCenterLoc.append([(37.0 + 47.0)/2, (40.0+53.0)/2]) + self.goalCenterLoc.append([(8.0 + 19.0)/2, (57.0+72.0)/2]) + self.goalCenterLoc.append([(66.0 + 76.0)/2, (57.0+72.0)/2]) + self.goalCenterLoc.append([(6.0 + 12.0)/2, (39.0+47.0)/2]) + self.reachedGoal = [0, 0, 0, 0, 0, 0] def numActions(self): return len(self.actions) @@ -82,13 +89,23 @@ def restart(self): ): self.ale.reset_game() self.life_lost = False - + + def resetLife(self): + self.life_lost = False + def act(self, action): lives = self.ale.lives() reward = self.ale.act(self.actions[action]) self.life_lost = (not lives == self.ale.lives()) - if reward != 0: - return 1.0 + return reward + + def actWrapper(self, action): + lives = self.ale.lives() + reward = self.act(action) + for i in range (20): + if lives == self.ale.lives(): + self.ale.act(0) + self.life_lost = (not lives == self.ale.lives()) return reward def getScreen(self): @@ -96,6 +113,38 @@ def getScreen(self): resized = cv2.resize(screen, (self.screen_width, self.screen_height)) return resized + def getScreenRGB(self): + screen = self.ale.getScreenRGB() + resized = cv2.resize(screen, (self.screen_width, self.screen_height)) + return resized + + def getAgentLoc(self): + img = self.getScreenRGB() + man = [200, 72, 72] + mask = np.zeros(np.shape(img)) + mask[:,:,0] = man[0]; + mask[:,:,1] = man[1]; + mask[:,:,2] = man[2]; + + diff = img - mask + indxs = np.where(diff == 0) + diff[np.where(diff < 0)] = 0 + diff[np.where(diff > 0)] = 0 + diff[indxs] = 255 + mean_y = np.sum(indxs[0]) / np.shape(indxs[0])[0] + mean_x = np.sum(indxs[1]) / np.shape(indxs[1])[0] + return (mean_x, mean_y) + + def distanceReward(self, lastGoal, goal): + if (lastGoal == -1): + return 0.0 + goalCenter = self.goalCenterLoc[goal] + agentX, agentY = self.getAgentLoc() + lastGoalCenter = self.goalCenterLoc[lastGoal] + dis = np.sqrt((goalCenter[0] - agentX)*(goalCenter[0] - agentX) + (goalCenter[1]-agentY)*(goalCenter[1]-agentY)) + disLast = np.sqrt((lastGoalCenter[0] - agentX)*(lastGoalCenter[0] - agentX) + (lastGoalCenter[1]-agentY)*(lastGoalCenter[1]-agentY)) + return disLast - dis + # add color channel for input of network def getState(self): screen = self.ale.getScreenGrayscale() @@ -107,6 +156,12 @@ def isTerminal(self): return self.ale.game_over() or self.life_lost return self.ale.game_over() + def isGameOver(self): + return self.ale.game_over() + + def isLifeLost(self): + return self.life_lost + def reset(self): self.ale.reset_game() self.life_lost = False @@ -121,7 +176,13 @@ def goalReached(self, goal): if goalScreen[x][y] != stateScreen[x][y]: count = count + 1 # 30 is total number of pixels of agent - if float(count) / ((goalPosition[1][0] - goalPosition[0][0]) * 30) > 0.15: + if float(count) / 30 > 0.5: + self.reachedGoal[goal] = 1 return True return False - \ No newline at end of file + + def goalNotReachedBefore(self, goal): + if (self.reachedGoal[goal] == 1): + return False + return True + \ No newline at end of file diff --git a/environment.pyc b/environment.pyc new file mode 100644 index 0000000..e484b51 Binary files /dev/null and b/environment.pyc differ diff --git a/goalReaced.jpeg b/goalReaced.jpeg new file mode 100644 index 0000000..ed35f75 Binary files /dev/null and b/goalReaced.jpeg differ diff --git a/hdqn.py b/hdqn.py index f77b48b..dad4056 100644 --- a/hdqn.py +++ b/hdqn.py @@ -63,5 +63,8 @@ def saveWeight(self, episodeNumber): self.metaNet.save('metaNet_' + str(episodeNumber) + '.h5') def loadWeight(self): - self.controllerNet.load_model('controllerNet.h5') - self.metaNet.load_model('metaNet.h5') \ No newline at end of file + path = 'weight/' + self.controllerNet = load_model(path + 'controllerNet.h5') + self.targetControllerNet = load_model(path + 'controllerNet.h5') + self.metaNet = load_model(path + 'metaNet.h5') + self.targetMetaNet = load_model(path + 'metaNet.h5') \ No newline at end of file diff --git a/hdqn.pyc b/hdqn.pyc new file mode 100644 index 0000000..1e2f239 Binary files /dev/null and b/hdqn.pyc differ diff --git a/libale_c.so b/libale_c.so index 427b45b..0426c36 100755 Binary files a/libale_c.so and b/libale_c.so differ diff --git a/main.py b/main.py index d608387..1694662 100755 --- a/main.py +++ b/main.py @@ -1,21 +1,37 @@ import argparse import sys +import time +import numpy as np +import tensorflow as tf from collections import namedtuple from environment import ALEEnvironment from agent import Agent from hdqn import Hdqn +from PIL import Image # Constant defined here -anneal_factor = (1.0-0.1)/12000 +maxStepsPerEpisode = 5000 + +def str2bool(v): + return v.lower() in ("yes", "true", "t", "1") def main(): + # Initilization for tensor board + session = tf.Session() + tensorVar = tf.Variable(0) + tf.summary.scalar("reward", tensorVar) + sumWriterIntrinsic = tf.summary.FileWriter('./reward/intrinsic') + sumWriterExternal = tf.summary.FileWriter('./reward/external') + merged = tf.summary.merge_all() + session.run(tf.initialize_all_variables()) + actionMap = [0, 1, 2, 3, 4, 5, 11, 12] actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down', 'jump right', 'jump left'] - print("Annealing factor: " + str(anneal_factor)) + goalExplain = ['top left door', 'top right door', 'middle ladder', 'lower left ladder', 'lower right ladder', 'key'] stepCount = 0 parser = argparse.ArgumentParser() parser.add_argument("--game", default="montezuma_revenge.bin") - parser.add_argument("--display_screen", action="store_true", default=False) + parser.add_argument("--display_screen", type=str2bool, default=False) parser.add_argument("--frame_skip", default=1) #parser.add_argument("--repeat_action_probability", default=0.25) parser.add_argument("--color_averaging", default=False) @@ -25,57 +41,126 @@ def main(): parser.add_argument("--minimal_action_set", default=False) parser.add_argument("--screen_width", default=84) parser.add_argument("--screen_height", default=84) + parser.add_argument("--load_weight", default=False) args = parser.parse_args() ActorExperience = namedtuple("ActorExperience", ["state", "goal", "action", "reward", "next_state", "done"]) MetaExperience = namedtuple("MetaExperience", ["state", "goal", "reward", "next_state", "done"]) + annealComplete = False env = ALEEnvironment(args.game, args) hdqn = Hdqn() - agent = Agent(hdqn, range(8), range(6)) - for episode_thousand in range(100): - # save the model every 1000 episode - hdqn.saveWeight(episode_thousand) - for episode in range(1000): - print("\n\n### EPISODE " + str(episode_thousand*1000 + episode) + "###") - env.reset() - goal = agent.selectGoal(env.getState()) - print(goal) - print(env.isTerminal()) - while not env.isTerminal(): - totalExternalRewards = 0 - stateLastGoal = env.getState() - while not env.isTerminal() and not env.goalReached(goal): - state = env.getState() - action = agent.selectMove(state, goal) - print('selected action is :' + str(actionExplain[action])) - externalRewards = env.act(actionMap[action]) - print('reward is :' + str(externalRewards)) - stepCount += 1 - nextState = env.getState() - intrinsicRewards = agent.criticize(env.goalReached(goal)) - # Store transition and update network params - exp = ActorExperience(state, goal, action, intrinsicRewards, nextState, env.isTerminal()) - agent.store(exp, meta=False) - # Do not update the network during random play - if (stepCount >= 100000): - if (stepCount == 100000): - print('start training (random walk ends)') - agent.update(meta=False) - agent.update(meta=True) - totalExternalRewards += externalRewards + + # Initilize network and agent + if (args.load_weight): + defaultRandomPlaySteps = 100000 + print('loading weight') + hdqn.loadWeight() + print('loading weight complete') + agent = Agent(hdqn, range(8), range(6)) + else: + defaultRandomPlaySteps = 100000 + agent = Agent(hdqn, range(8), range(6)) - # Store meta controller's experience - exp = MetaExperience(stateLastGoal, goal, totalExternalRewards, nextState, env.isTerminal()) - agent.store(exp, meta=True) - - # Update goal - if env.isTerminal() is False: - goal = agent.selectGoal(env.getState()) + intrinsicRewardMonitor = 0 + externalRewardMonitor = 0 + for episode in range(30000): + # save the model every 100 episode + if (episode % 500 == 0): + hdqn.saveWeight(episode) + print("\n\n### EPISODE " + str(episode) + "###") + print("\n\n### STEPS " + str(stepCount) + "###") + # Restart the game + env.restart() + for i in range(30): + env.act(0) + episodeSteps = 0 + # set goalNum to hardcoded subgoal + goalNum = 0 + # initial last goal + lastGoal = -1 + while not env.isGameOver() and episodeSteps <= maxStepsPerEpisode: + totalExternalRewards = 0 # NOT SURE IF IT SHOULD BE CLEARED HERE! + stateLastGoal = env.getState() + # goal = agent.selectGoal(env.getState()) + goal = agent.selectTrueGoal(goalNum) + print('predicted subgoal is: ' + goalExplain[goal]) + while not env.isTerminal() and not env.goalReached(goal) and episodeSteps <= maxStepsPerEpisode: + state = env.getState() + action = agent.selectMove(state, goal) - #Annealing - agent.annealMetaEpsilon(stepCount) - agent.annealControllerEpsilon(stepCount, goal) + #print('selected action is :' + str(actionExplain[action])) + externalRewards = env.actWrapper(actionMap[action]) + #print('reward is :' + str(externalRewards)) + stepCount += 1 + episodeSteps += 1 + nextState = env.getState() + distanceReward = env.distanceReward(lastGoal, goal) + # only assign intrinsic reward if the goal is reached and it has not been reached previously + intrinsicRewards = agent.criticize(env.goalNotReachedBefore(goal) & env.goalReached(goal), actionMap[action], env.isLifeLost(), distanceReward) + ''' Debugging + if (intrinsicRewards == 1.0): + print('subgoal reached') + im = Image.fromarray(np.squeeze(nextState)) + im.save('goalReaced.jpeg') + sys.exit() + ''' + ''' + if (intrinsicRewards == 1.0): + print(stepCount) + ''' + # Store transition and update network params + exp = ActorExperience(state, goal, action, intrinsicRewards, nextState, env.isTerminal()) + agent.store(exp, meta=False) + # Do not update the network during random play + if (stepCount >= defaultRandomPlaySteps): + if (stepCount == defaultRandomPlaySteps): + print('start training (random walk ends)') + if (stepCount % 4 == 0): + agent.update(stepCount, meta=False) + # agent.update(stepCount, meta=True) + + # Update external reward for D2 + totalExternalRewards += externalRewards + + # Update data for visualization + externalRewardMonitor += externalRewards + intrinsicRewardMonitor += intrinsicRewards + + # Store meta controller's experience + exp = MetaExperience(stateLastGoal, goal, totalExternalRewards, nextState, env.isTerminal()) + agent.store(exp, meta=True) + + # Update goal + if episodeSteps > maxStepsPerEpisode: + break + elif env.isTerminal() is False: + lastGoal = goal + goalNum = goalNum + 1 + if goalNum >= 4: + break + else: + # Re-initialize game if not game over + if not env.isGameOver(): + goalNum = 0 + lastGoal = -1 + env.resetLife() + for i in range(30): + env.act(0) + + # Training Visualization + intrinsicPlot = session.run(merged, feed_dict={tensorVar: intrinsicRewardMonitor}) + sumWriterIntrinsic.add_summary(intrinsicPlot, stepCount) + sumWriterIntrinsic.flush() + externalPlot = session.run(merged, feed_dict={tensorVar: externalRewardMonitor}) + sumWriterExternal.add_summary(externalPlot, stepCount) + sumWriterExternal.flush() + #intrinsicRewardMonitor = 0 + #externalRewardMonitor = 0 + if (not annealComplete): + #Annealing + agent.annealMetaEpsilon(stepCount) + agent.annealControllerEpsilon(stepCount, goal) if __name__ == "__main__": main() diff --git a/test.py b/test.py new file mode 100644 index 0000000..b1ad251 --- /dev/null +++ b/test.py @@ -0,0 +1,66 @@ +import argparse +import sys +import numpy as np + +from environment import ALEEnvironment +from agent import Agent +from hdqn import Hdqn +from PIL import Image + +def str2bool(v): + return v.lower() in ("yes", "true", "t", "1") + +def main(): + actionMap = [0, 1, 2, 3, 4, 5, 11, 12] + goalExplain = ['top left door', 'top right door', 'middle ladder', 'lower left ladder', 'lower right ladder', 'key'] + actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down', 'jump right', 'jump left'] + stepCount = 0 + parser = argparse.ArgumentParser() + parser.add_argument("--game", default="montezuma_revenge.bin") + parser.add_argument("--display_screen", type=str2bool, default=False) + parser.add_argument("--frame_skip", default=4) + #parser.add_argument("--repeat_action_probability", default=0.25) + parser.add_argument("--color_averaging", default=False) + parser.add_argument("--random_seed") + #parser.add_argument("--record_screen_path", default="./record") + #parser.add_argument("--record_sound_filename") + parser.add_argument("--minimal_action_set", default=False) + parser.add_argument("--screen_width", default=84) + parser.add_argument("--screen_height", default=84) + args = parser.parse_args() + env = ALEEnvironment(args.game, args) + hdqn = Hdqn() + print('loading weights') + hdqn.loadWeight() + print('weight loaded') + agent = Agent(hdqn, range(8), range(6)) + # Probability of making random action is 0.1 + agent.setControllerEpsilon([0.1]*6) + agent.setMetaEpsilon(0.1) + while True: + env.restart() + for i in range(10): + env.act(0) + goalNum = 0 + while not env.isGameOver(): + goal = agent.selectTrueGoal(goalNum) + print('predicted subgoal is: ' + str(goal) + ' ' + goalExplain[goal]) + while not env.isTerminal() and not env.goalReached(goal): + state = env.getState() + action = agent.selectMove(state, goal) + #print ('selected action is: ' + str(actionMap[action]) + ' ' + actionExplain[actionMap[action]]) + #print('selected action is :' + str(actionExplain[action])) + externalRewards = env.act(actionMap[action]) + if env.isTerminal() is False: + goalNum = goalNum + 1 + else: + # Re-initialize game if not game over + if not env.isGameOver(): + goalNum = 0 + env.resetLife() + for i in range(10): + env.act(0) + +if __name__ == "__main__": + main() + diff --git a/utils.pyc b/utils.pyc new file mode 100644 index 0000000..63b16fe Binary files /dev/null and b/utils.pyc differ