Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added RGBtest.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
60 changes: 41 additions & 19 deletions agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import utils

# Default architectures for the lower level controller/actor
defaultNSample = 1000
defaultNSample = 64
defaultGamma = 0.975
defaultEpsilon = 1.0
defaultControllerEpsilon = [1.0]*6
Expand All @@ -15,7 +15,12 @@

############
defaultMetaEpsilon = 1
defaultMetaNSamples = 200
defaultMetaNSamples = 64
updateFrequency = 10000
controllerMemCap = 1000000
maxReward = 1000
minReward = -1000
trueSubgoalOrder = [2, 4, 3, 5]

class Agent:

Expand All @@ -40,16 +45,34 @@ def selectMove(self, state, goal):
return np.argmax(self.net.controllerNet.predict([np.reshape(state, (1, 84, 84, 1)), np.asarray([goalVec])], verbose=0))
return random.choice(self.actionSet)

def setControllerEpsilon(self, epsilonArr):
    """Replace the controller's exploration rates with ``epsilonArr``.

    The stored object is indexed by goal elsewhere in this class
    (``self.controllerEpsilon[goal]``), so it should hold one value per goal.
    """
    setattr(self, "controllerEpsilon", epsilonArr)

def selectGoal(self, state):
    """Pick a goal for the meta-controller, epsilon-greedily.

    A uniform random draw is compared with ``self.metaEpsilon``: when the
    draw exceeds it, the goal with the highest meta-network output is
    returned; otherwise a random member of ``self.goalSet`` is returned.
    """
    greedy = self.metaEpsilon < random.random()
    if not greedy:
        print("Exploring")
        return random.choice(self.goalSet)
    # The meta network is fed a single frame reshaped to (1, 84, 84, 1).
    frame = np.reshape(state, (1, 84, 84, 1))
    goalValues = self.net.metaNet.predict(frame, verbose=0)
    return np.argmax(goalValues)

def criticize(self, goalReached):
return 1.0 if goalReached else 0.0
def selectTrueGoal(self, goalNum):
    """Return the hard-coded subgoal at position ``goalNum``.

    Looks the position up in the module-level ``trueSubgoalOrder`` list.
    """
    scriptedGoal = trueSubgoalOrder[goalNum]
    return scriptedGoal

def setMetaEpsilon(self, epsilon):
    """Overwrite the meta-controller exploration rate with ``epsilon``."""
    setattr(self, "metaEpsilon", epsilon)

def criticize(self, reachGoal, action, die, distanceReward):
    """Compute the intrinsic reward for one controller step.

    Terms, applied in this order:
      * -0.1 when ``action`` is 0,
      * +50.0 when ``reachGoal`` is true,
      * -200.0 when ``die`` is true,
      * ``distanceReward`` added as given.
    The sum is then limited to the range set by the module-level
    ``minReward`` and ``maxReward``.
    """
    adjustments = []
    if action == 0:
        adjustments.append(-0.1)
    if reachGoal:
        adjustments.append(50.0)
    if die:
        adjustments.append(-200.0)
    adjustments.append(distanceReward)

    total = 0.0
    for delta in adjustments:
        total += delta
    # Cap from above first, then from below, as a two-step clamp.
    capped = np.minimum(total, maxReward)
    return np.maximum(capped, minReward)

def store(self, experience, meta=False):
if meta:
Expand All @@ -58,11 +81,11 @@ def store(self, experience, meta=False):
self.metaMemory = self.metaMemory[-500:]
else:
self.memory.append(experience)
if len(self.memory) > 1000000:
self.memory = self.memory[-1000000:]
if len(self.memory) > controllerMemCap:
self.memory = self.memory[-controllerMemCap:]


def _update(self):
def _update(self, stepCount):
exps = [random.choice(self.memory) for _ in range(self.nSamples)]
# stateVectors = np.squeeze(np.asarray([np.concatenate([exp.state, exp.goal], axis=1) for exp in exps]))
stateVector = []
Expand All @@ -85,7 +108,7 @@ def _update(self):
if not exp.done:
rewardVectors[i][exp.action] += self.gamma * max(nextStateRewardVectors[i])
rewardVectors = np.asarray(rewardVectors)
self.net.controllerNet.fit([stateVector, goalVector], rewardVectors, epochs = 1, verbose=1)
self.net.controllerNet.train_on_batch([stateVector, goalVector], rewardVectors)

#Update target network
controllerWeights = self.net.controllerNet.get_weights()
Expand All @@ -94,7 +117,7 @@ def _update(self):
controllerTargetWeights[i] = self.targetTau * controllerWeights[i] + (1 - self.targetTau) * controllerTargetWeights[i]
self.net.targetControllerNet.set_weights(controllerTargetWeights)

def _update_meta(self):
def _update_meta(self, stepCount):
if 0 < len(self.metaMemory):
exps = [random.choice(self.metaMemory) for _ in range(self.metaNSamples)]
stateVectors = np.asarray([exp.state for exp in exps])
Expand All @@ -107,7 +130,7 @@ def _update_meta(self):
rewardVectors[i][np.argmax(exp.goal)] = exp.reward
if not exp.done:
rewardVectors[i][np.argmax(exp.goal)] += self.gamma * max(nextStateRewardVectors[i])
self.net.metaNet.fit(stateVectors, rewardVectors, epochs = 1, verbose=1)
self.net.metaNet.train_on_batch(stateVectors, rewardVectors)

#Update target network
metaWeights = self.net.metaNet.get_weights()
Expand All @@ -116,18 +139,17 @@ def _update_meta(self):
metaTargetWeights[i] = self.targetTau * metaWeights[i] + (1 - self.targetTau) * metaTargetWeights[i]
self.net.targetMetaNet.set_weights(metaTargetWeights)

def update(self, stepCount, meta=False):
    """Run one training update on the meta network or the controller network.

    Args:
        stepCount: Step counter forwarded unchanged to the selected
            update routine.
        meta: When true, ``self._update_meta`` is called; otherwise
            ``self._update`` is called.
    """
    # Exactly one update per call: the stale no-argument calls left over
    # from the previous signature are dropped.
    if meta:
        self._update_meta(stepCount)
    else:
        self._update(stepCount)

def annealMetaEpsilon(self, stepCount):
    """Linearly anneal ``self.metaEpsilon`` towards ``defaultEndEpsilon``.

    Steps up to ``defaultRandomPlaySteps`` do not count towards annealing.
    After that the value moves linearly from ``defaultMetaEpsilon`` to
    ``defaultEndEpsilon`` over ``defaultAnnealSteps`` steps, and the
    decaying part is floored at 0 so it cannot push the result below
    ``defaultEndEpsilon`` once annealing has finished.
    """
    span = defaultMetaEpsilon - defaultEndEpsilon
    stepsLeft = defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)
    # Multiply before dividing, matching the original evaluation order.
    self.metaEpsilon = defaultEndEpsilon + max(0, span * stepsLeft / defaultAnnealSteps)

def annealControllerEpsilon(self, stepCount, goal):
    """Linearly anneal the controller exploration rate for one goal.

    Only ``self.controllerEpsilon[goal]`` is updated. Steps up to
    ``defaultRandomPlaySteps`` do not count towards annealing. After that
    the value moves linearly from ``defaultControllerEpsilon[goal]`` to
    ``defaultEndEpsilon`` over ``defaultAnnealSteps`` steps, and the
    decaying part is floored at 0 so it cannot push the result below
    ``defaultEndEpsilon``.
    """
    span = defaultControllerEpsilon[goal] - defaultEndEpsilon
    stepsLeft = defaultAnnealSteps - max(0, stepCount - defaultRandomPlaySteps)
    # Multiply before dividing, matching the original evaluation order.
    self.controllerEpsilon[goal] = defaultEndEpsilon + max(0, span * stepsLeft / defaultAnnealSteps)

Binary file added agent.pyc
Binary file not shown.
Binary file added ale_python_interface.pyc
Binary file not shown.
65 changes: 65 additions & 0 deletions de.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse
import sys
import time
import numpy as np
import tensorflow as tf
from collections import namedtuple
from environment import ALEEnvironment
from agent import Agent
from hdqn import Hdqn
from PIL import Image

# Constants defined here
defaultRandomPlaySteps = 100000
maxStepsPerEpisode = 5000

def str2bool(v):
    """Interpret a command-line string as a boolean.

    Returns True when the lower-cased value is "yes", "true", "t" or "1";
    any other string gives False.
    """
    truthy = ("yes", "true", "t", "1")
    normalized = v.lower()
    return normalized in truthy

def main():
    """Build the environment and agent, then dump one RGB frame as a smoke test.

    Creates TensorBoard summary writers under ./reward/, parses the emulator
    options from the command line, constructs the ALE environment, the
    h-DQN networks and the agent, sends one action through
    ``env.actWrapper`` and saves the resulting RGB screen to
    "RGBtest.jpeg". Finally prints whether the environment reports a
    terminal state.
    """
    # Initialization for TensorBoard
    session = tf.Session()
    tensorVar = tf.Variable(0)
    tf.summary.scalar("reward", tensorVar)
    sumWriterIntrinsic = tf.summary.FileWriter('./reward/intrinsic')
    sumWriterExternal = tf.summary.FileWriter('./reward/external')
    merged = tf.summary.merge_all()
    # tf.initialize_all_variables is deprecated in favour of
    # tf.global_variables_initializer.
    session.run(tf.global_variables_initializer())

    actionMap = [0, 1, 2, 3, 4, 5, 11, 12]
    actionExplain = ['no action', 'jump', 'up', 'right', 'left', 'down', 'jump right', 'jump left']
    goalExplain = ['top left door', 'top right door', 'middle ladder', 'lower left ladder', 'lower right ladder', 'key']
    stepCount = 0

    # Every option declares its type: without one, argparse hands values
    # given on the command line over as strings, so "--color_averaging False"
    # would be truthy and "--frame_skip 8" would be the string "8".
    # Defaults are unchanged.
    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="montezuma_revenge.bin")
    parser.add_argument("--display_screen", type=str2bool, default=False)
    parser.add_argument("--frame_skip", type=int, default=4)
    #parser.add_argument("--repeat_action_probability", default=0.25)
    parser.add_argument("--color_averaging", type=str2bool, default=False)
    # NOTE(review): assumes the environment expects an integer seed - confirm.
    parser.add_argument("--random_seed", type=int)
    #parser.add_argument("--record_screen_path", default="./record")
    #parser.add_argument("--record_sound_filename")
    parser.add_argument("--minimal_action_set", type=str2bool, default=False)
    parser.add_argument("--screen_width", type=int, default=84)
    parser.add_argument("--screen_height", type=int, default=84)
    args = parser.parse_args()

    ActorExperience = namedtuple("ActorExperience", ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple("MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    env = ALEEnvironment(args.game, args)
    hdqn = Hdqn()
    agent = Agent(hdqn, range(8), range(6))
    # set goalNum to hardcoded subgoal
    goalNum = 0
    intrinsicRewardMonitor = 0
    externalRewardMonitor = 0

    # Send action index 5 ('down' in actionExplain) and save the frame that
    # results so it can be inspected by eye.
    env.actWrapper(5)
    x = env.getScreenRGB()
    im = Image.fromarray(x)
    im.save("RGBtest.jpeg")
    # for i in range(100):
    #     env.act(0)
    #     print(env.isTerminal())
    print(env.isTerminal())

if __name__ == "__main__":
main()
89 changes: 75 additions & 14 deletions environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class ALEEnvironment():
def __init__(self, rom_file, args):

self.ale = ALEInterface()
'''

if args.display_screen:
if sys.platform == 'darwin':
import pygame
Expand All @@ -22,8 +22,7 @@ def __init__(self, rom_file, args):
elif sys.platform.startswith('linux'):
self.ale.setBool('sound', True)
self.ale.setBool('display_screen', True)
self.ale.setBool('display_screen', True)
'''

self.ale.setInt('frame_skip', args.frame_skip)
#self.ale.setFloat('repeat_action_probability', args.repeat_action_probability)
self.ale.setBool('color_averaging', args.color_averaging)
Expand Down Expand Up @@ -60,12 +59,20 @@ def __init__(self, rom_file, args):
self.life_lost = False
self.initSrcreen = self.getScreen()
self.goalSet = []
self.goalSet.append([[8, 21], [16, 36]]) # top left door
self.goalSet.append([[69, 21], [77, 36]]) # top right door
self.goalSet.append([[37, 40], [47, 53]]) # middle ladder
self.goalSet.append([[8, 57], [19, 72]]) # lower left ladder
self.goalSet.append([[66,57], [76, 72]]) # lower right ladder
self.goalSet.append([[6, 39], [12, 47]]) # key
self.goalSet.append([[8, 21], [16, 36]]) # top left door 0
self.goalSet.append([[69, 21], [77, 36]]) # top right door 1
self.goalSet.append([[37, 40], [47, 53]]) # middle ladder 2
self.goalSet.append([[8, 57], [19, 72]]) # lower left ladder 3
self.goalSet.append([[66,57], [76, 72]]) # lower right ladder 4
self.goalSet.append([[6, 39], [12, 47]]) # key 5
self.goalCenterLoc = []
self.goalCenterLoc.append([(8.0 + 16.0)/2, (21.0 + 36.0)/2])
self.goalCenterLoc.append([(69.0 + 77.0)/2, (21.0+36.0)/2])
self.goalCenterLoc.append([(37.0 + 47.0)/2, (40.0+53.0)/2])
self.goalCenterLoc.append([(8.0 + 19.0)/2, (57.0+72.0)/2])
self.goalCenterLoc.append([(66.0 + 76.0)/2, (57.0+72.0)/2])
self.goalCenterLoc.append([(6.0 + 12.0)/2, (39.0+47.0)/2])
self.reachedGoal = [0, 0, 0, 0, 0, 0]

def numActions(self):
    """Return how many entries ``self.actions`` holds."""
    actionCount = len(self.actions)
    return actionCount
Expand All @@ -82,20 +89,62 @@ def restart(self):
):
self.ale.reset_game()
self.life_lost = False


def resetLife(self):
    """Clear the ``life_lost`` flag without resetting the emulator."""
    setattr(self, "life_lost", False)

def act(self, action):
lives = self.ale.lives()
reward = self.ale.act(self.actions[action])
self.life_lost = (not lives == self.ale.lives())
if reward != 0:
return 1.0
return reward

def actWrapper(self, action, settleFrames=20):
    """Perform ``action`` and then idle for a few emulator steps.

    After ``self.act(action)``, up to ``settleFrames`` further steps are
    issued with ALE action code 0 for as long as the life count is
    unchanged. Rewards from those idle steps are discarded.

    Args:
        action: Index passed to ``self.act``.
        settleFrames: Maximum number of idle steps to issue afterwards
            (20 by default, the previously hard-coded value).

    Returns:
        The reward returned by ``self.act(action)``.

    Side effects:
        Sets ``self.life_lost`` to whether the life count differs from its
        value before the action.
    """
    lives = self.ale.lives()
    reward = self.act(action)
    for _ in range(settleFrames):
        # Once a life is lost no further idle steps are issued, so stop
        # polling the emulator.
        if lives != self.ale.lives():
            break
        self.ale.act(0)
    self.life_lost = (lives != self.ale.lives())
    return reward

def getScreen(self):
    """Return the current grayscale screen resized to the configured size.

    The frame comes from ``self.ale.getScreenGrayscale()`` and is resized
    with ``cv2.resize`` to ``(self.screen_width, self.screen_height)``.
    """
    grayFrame = self.ale.getScreenGrayscale()
    targetSize = (self.screen_width, self.screen_height)
    return cv2.resize(grayFrame, targetSize)

def getScreenRGB(self):
    """Return the current RGB screen resized to the configured size.

    The frame comes from ``self.ale.getScreenRGB()`` and is resized with
    ``cv2.resize`` to ``(self.screen_width, self.screen_height)``.
    """
    rgbFrame = self.ale.getScreenRGB()
    targetSize = (self.screen_width, self.screen_height)
    return cv2.resize(rgbFrame, targetSize)

def getAgentLoc(self):
    """Locate the agent on the resized RGB screen by its colour.

    A pixel is counted as part of the agent only when all three of its
    channels equal (200, 72, 72). Previously a pixel matching a single
    channel was counted too, and full matches were counted three times,
    which skewed the centroid.

    Returns:
        ``(mean_x, mean_y)``: the mean column and mean row of the matching
        pixels as floats. When no pixel matches, both values are NaN.
    """
    img = self.getScreenRGB()
    man = np.array([200, 72, 72])
    # Compare channel-wise, then require a match on every channel.
    isAgent = np.all(img == man, axis=-1)
    rows, cols = np.nonzero(isAgent)
    if rows.size == 0:
        # No agent-coloured pixel on screen; avoid a division by zero.
        return (float('nan'), float('nan'))
    mean_y = float(rows.mean())
    mean_x = float(cols.mean())
    return (mean_x, mean_y)

def distanceReward(self, lastGoal, goal):
    """Return how much closer the agent is to ``goal`` than to ``lastGoal``.

    The result is the Euclidean distance from the agent to the centre of
    ``lastGoal`` minus its distance to the centre of ``goal``, using
    ``self.goalCenterLoc`` and ``self.getAgentLoc()``. A ``lastGoal`` of -1
    means there is no previous goal, and 0.0 is returned.
    """
    if lastGoal == -1:
        return 0.0
    targetX, targetY = self.goalCenterLoc[goal][0], self.goalCenterLoc[goal][1]
    agentX, agentY = self.getAgentLoc()
    prevCenter = self.goalCenterLoc[lastGoal]

    dxGoal = targetX - agentX
    dyGoal = targetY - agentY
    toGoal = np.sqrt(dxGoal * dxGoal + dyGoal * dyGoal)

    dxPrev = prevCenter[0] - agentX
    dyPrev = prevCenter[1] - agentY
    toPrev = np.sqrt(dxPrev * dxPrev + dyPrev * dyPrev)
    return toPrev - toGoal

# add color channel for input of network
def getState(self):
screen = self.ale.getScreenGrayscale()
Expand All @@ -107,6 +156,12 @@ def isTerminal(self):
return self.ale.game_over() or self.life_lost
return self.ale.game_over()

def isGameOver(self):
    """Return the emulator's ``game_over()`` result, ignoring lost lives."""
    gameOver = self.ale.game_over()
    return gameOver

def isLifeLost(self):
    """Return the current value of the ``life_lost`` flag."""
    lifeLost = self.life_lost
    return lifeLost

def reset(self):
self.ale.reset_game()
self.life_lost = False
Expand All @@ -121,7 +176,13 @@ def goalReached(self, goal):
if goalScreen[x][y] != stateScreen[x][y]:
count = count + 1
# 30 is total number of pixels of agent
if float(count) / ((goalPosition[1][0] - goalPosition[0][0]) * 30) > 0.15:
if float(count) / 30 > 0.5:
self.reachedGoal[goal] = 1
return True
return False


def goalNotReachedBefore(self, goal):
    """Return True unless ``self.reachedGoal[goal]`` has been set to 1."""
    alreadyReached = self.reachedGoal[goal] == 1
    return not alreadyReached

Binary file added environment.pyc
Binary file not shown.
Binary file added goalReaced.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
7 changes: 5 additions & 2 deletions hdqn.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,8 @@ def saveWeight(self, episodeNumber):
self.metaNet.save('metaNet_' + str(episodeNumber) + '.h5')

def loadWeight(self):
    """Load saved controller and meta networks from the 'weight/' directory.

    Each target network is loaded from the same file as its online
    network. The models are rebound on ``self`` with the module-level
    ``load_model`` function; the earlier
    ``self.controllerNet.load_model(...)`` / ``self.metaNet.load_model(...)``
    calls are removed because they would raise before any model was
    loaded.
    """
    path = 'weight/'
    controllerFile = path + 'controllerNet.h5'
    metaFile = path + 'metaNet.h5'
    self.controllerNet = load_model(controllerFile)
    self.targetControllerNet = load_model(controllerFile)
    self.metaNet = load_model(metaFile)
    self.targetMetaNet = load_model(metaFile)
Binary file added hdqn.pyc
Binary file not shown.
Binary file modified libale_c.so
Binary file not shown.
Loading