QL.m
classdef QL
%%
properties
S % state set (vector of all the states)
A % action set (vector of all the actions)
pol % greedy policy vector (= pi(s))
gamma % discount factor
alpha % learning rate
Q % action-value set (matrix of Q(s,a))
epsilon % epsilon-greedy exploration rate, in [0,1]
histAct % probability distribution over the action space
end
%%
methods
function obj = QL(S, A, gamma, alpha, epsilon)
%
% class constructor
%
obj.S = S;
obj.A = A;
obj.pol = ones(length(obj.S),1); % initialized to a valid action index (zeros would fail as indices in egAction)
obj.histAct = (1/length(obj.A))*ones(length(obj.A),1); % uniform distribution by default
obj.Q = zeros(length(obj.S),length(obj.A));
obj.gamma = gamma;
obj.alpha = alpha;
obj.epsilon = epsilon;
end
function obj = learning(obj, s, s_next, a, R)
%
% This method implements the tabular Q-Learning algorithm
% s and s_next are the indices of the current and next states
%
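% Update rule (descriptive note, matching the code below):
%   Q(s,a) <- Q(s,a) + alpha * ( R + gamma * max_a' Q(s_next,a') - Q(s,a) )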
delta = R + obj.gamma * max(obj.Q(s_next,:)) - obj.Q(s,a);
obj.Q(s,a) = obj.Q(s,a) + obj.alpha*delta; % updating Q
[~,obj.pol(s)] = max(obj.Q(s,:)); % updating greedy policy
end
function obj = dpLearning(obj, s, s_next, a, R)
%
% This method implements the CC learning algorithm: the max in the
% Q-learning target is replaced by the mean over next-state action values
% s and s_next are the indices of the current and next states
%
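% Update rule (uniform expectation over next actions):
%   Q(s,a) <- Q(s,a) + alpha * ( R + gamma * mean_a' Q(s_next,a') - Q(s,a) )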
delta = R + obj.gamma * mean(obj.Q(s_next,:)) - obj.Q(s,a);
obj.Q(s,a) = obj.Q(s,a) + obj.alpha*delta; % updating Q
[~,obj.pol(s)] = max(obj.Q(s,:)); % updating greedy policy
end
function obj = dp(obj, s, s_next, a, R)
%
% This method implements the DP algorithm: the target bootstraps on
% the expectation of Q(s_next,:) under the action distribution histAct
% s and s_next are the indices of the current and next states
%
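% Update rule (expectation weighted by histAct):
%   Q(s,a) <- Q(s,a) + alpha * ( R + gamma * Q(s_next,:)*histAct - Q(s,a) )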
delta = R + obj.gamma * (obj.Q(s_next,:)*obj.histAct) - obj.Q(s,a);
obj.Q(s,a) = obj.Q(s,a) + obj.alpha*delta; % updating Q
[~,obj.pol(s)] = max(obj.Q(s,:)); % updating greedy policy
end
function a = egAction(obj, s)
%
% This method implements the epsilon-greedy policy
%
r = rand;
if r < obj.epsilon
a = randi(length(obj.A)); % select a random action, regardless of the greedy policy
else
a = obj.pol(s); % greedy action selection
end
end
%% Setter functions:
function obj = set.alpha(obj, value)
if ~(value >= 0)
error('alpha must be nonnegative')
else
obj.alpha = value;
end
end
function obj = set.gamma(obj, value)
if ~(value >= 0 && value <= 1)
error('gamma must be in [0,1]')
else
obj.gamma = value;
end
end
function obj = set.epsilon(obj, value)
if ~(value >= 0 && value <= 1)
error('epsilon must be in [0,1]')
else
obj.epsilon = value;
end
end
end
end
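
A minimal usage sketch (not part of QL.m): it assumes a hypothetical environment function stepEnv(s, a) returning the next-state index and a reward, plus illustrative sizes nS and nA; only the QL constructor, egAction, and learning calls come from the class above.

% exampleDriver.m -- hypothetical training-loop sketch
nS = 16; nA = 4; % illustrative state/action counts (assumptions, not from QL.m)
agent = QL(1:nS, 1:nA, 0.95, 0.1, 0.1); % gamma = 0.95, alpha = 0.1, epsilon = 0.1
s = 1; % start from the first state index
for k = 1:10000
a = agent.egAction(s); % epsilon-greedy action selection
[s_next, R] = stepEnv(s, a); % hypothetical environment step
agent = agent.learning(s, s_next, a, R); % tabular Q-learning update
s = s_next;
end
disp(agent.pol') % greedy policy after training

Note that QL is a value class (it does not inherit from handle), so each call to learning returns the updated object, hence the reassignment agent = agent.learning(...).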