play.py
# Script for showcasing a trained agent by playing a few live episodes.
# To play:
# python play.py --weights LunarLander-v2_ReinforceAgent_episode_46100_acc_r_181.h5
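# If --weights is omitted, the last file (sorted by name) in the weights/
# directory is loaded instead.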
import torch
import gym
import argparse
import os
from reinforce_agent import ReinforceAgent
from actor_critic_agent import ActorCriticAgent
# Init parser
parser = argparse.ArgumentParser(description='Optional args: --weights=filename and --cuda')
parser.add_argument('--weights', type=str,
                    help='filename of the weights to load')
parser.add_argument('--cuda', action='store_true',
                    help='use the GPU instead of the CPU')
args = parser.parse_args()
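# choose the device for the agent: GPU if --cuda was passed and one is
# available, otherwise the CPU
device = 'cuda' if args.cuda and torch.cuda.is_available() else 'cpu'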
# hyperparams
env_name = 'LunarLander-v2'
episodes = 5
max_steps = 250  # per-episode step cap
# environment setup
print(f'Showing a trained agent in the {env_name} environment.')
env = gym.make(env_name, render_mode='human')
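# for LunarLander-v2 the observation has 8 features and there are 4 discrete actions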
obs_shape = env.observation_space.shape[0]
action_shape = env.action_space.n
# load weights
if args.weights:
    weights_filename = args.weights
else:
    # no file given: fall back to the last file (sorted by name) in weights/;
    # os.listdir returns entries in arbitrary order, so sort explicitly
    weights_filename = sorted(os.listdir('weights'))[-1]
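# note: the filename is passed to load_params as-is, which is assumed to
# resolve it (e.g. relative to the weights/ directory)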
# init agent (lr=0: the policy is only queried here, never updated)
# agent = ActorCriticAgent(n_features=obs_shape, n_actions=action_shape, device=device, lr=0)
agent = ReinforceAgent(n_features=obs_shape, n_actions=action_shape, device=device, lr=0)
agent.load_params(weights_filename)
# showcase loop
for episode in range(episodes):
    acc_reward = 0.0
    # get a trajectory from the trained policy
    obs, _ = env.reset()
    for step in range(max_steps):
        # obs[None, :] adds a batch dimension, giving shape (1, obs_shape)
        action, _ = agent.get_action(obs[None, :])
        obs, reward, done, truncated, info = env.step(action)
        acc_reward += reward
        if done or truncated:
            break
    print(f'accumulated reward of episode {episode}: {acc_reward}')
env.close()
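
# ---------------------------------------------------------------------------
# For reference, the agent interface this script relies on (assumed
# signatures inferred from the calls above; the real implementations live in
# reinforce_agent.py and actor_critic_agent.py):
#
#   agent = ReinforceAgent(n_features=..., n_actions=..., device=..., lr=...)
#   action, log_prob = agent.get_action(obs_batch)  # obs_batch: (1, n_features)
#   agent.load_params(weights_filename)             # restore saved parameters
# ---------------------------------------------------------------------------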