-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathMDP.m
More file actions
99 lines (80 loc) · 2.74 KB
/
MDP.m
File metadata and controls
99 lines (80 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
function Policy = MDP(params, noise, S, A)
% MDP  Compute the optimal policy for the inverted pendulum.
%   Policy = MDP(params, noise, S, A) returns a 3-by-N matrix: rows 1-2
%   hold every discretized state [theta; thetaDot] and row 3 holds the
%   best action for that state (chosen by VStar).
%
%   params - struct with fields discount, dt, depthLimit (and others used
%            by the helpers)
%   noise  - noise model forwarded to transitionProbabilities
%   S      - 2-by-k grid: S(1,:) theta values, S(2,:) thetaDot values
%   A      - 1-by-m row vector of candidate actions
%
%   Complexity: O(A * S^2) per lookahead level.

% Generate all possible state vectors, e.g.
% [theta1 theta1 theta2 theta2; thetaDot1 thetaDot2 thetaDot1 thetaDot2]
[Thetas, ThetaDots] = meshgrid(S(1,:), S(2,:));
vS = [reshape(Thetas, 1, numel(Thetas)); reshape(ThetaDots, 1, numel(ThetaDots))];
discount = params.discount;
dt = params.dt;
% size(vS,2) is the state count; length() on a 2-by-N matrix would
% return 2 when N < 2.
totalStates = size(vS, 2);
% Preallocate instead of growing the arrays on every loop iteration.
Policy = zeros(2, totalStates);
bestActions = zeros(1, totalStates);
for i = 1:totalStates
    % Progress indicator: no trailing semicolon so MATLAB echoes it.
    PercentageCompleted = i/totalStates * 100
    Policy(:,i) = vS(:,i);
    bestActions(:,i) = VStar(discount, params, noise, S, vS, A, dt, vS(:,i));
end
Policy = [Policy; bestActions];
end
function a = VStar(discount, params, noise, S, vS, A, dt, s)
% VStar  Return the action with the highest expected discounted reward
%   starting from state s, evaluating every action against every
%   reachable discretized successor state (Bellman backup via QStar).
%
%   s is a 2-by-1 state [theta; thetaDot]; vS is the 2-by-N matrix of all
%   discretized states; A is a 1-by-m row of actions.

numActions = size(A, 2);
R = zeros(2, numActions);   % row 1: action value, row 2: expected reward
depth = 0;                  % root of the bounded-depth lookahead
for i = 1:numActions
    % Commit to action a
    a = A(1, i);
    R(1,i) = a;
    % Deterministic one-step prediction under action a.
    sPredicted = simulateOneStep(s(1,1), s(2,1), dt, a);
    % Joint (theta, thetaDot) transition probability over the grid.
    T = transitionProbabilities(S, sPredicted, params, noise);
    T = T(1,:) .* T(2,:);
    for j = 1:size(vS, 2)
        psPrime = T(1,j);
        % Skip negligible-probability successors to save work.
        if psPrime < 0.001
            continue;
        end
        sPrime = vS(:,j);
        % Bellman equation: immediate reward plus discounted future value.
        R(2,i) = R(2,i) + psPrime * (getReward(params, sPrime) + ...
            discount * QStar(depth + 1, discount, params, noise, S, vS, A, dt, sPrime));
    end
end
% Argmax over expected rewards. The original loop `for i = 2:length(R)`
% indexed out of bounds with a single action (length of a 2-by-1 matrix
% is 2); max() returns the first maximal index, matching the old
% strict-> tie-break.
[~, maxIndex] = max(R(2,:));
a = R(1, maxIndex);
end
function r = QStar(depth, discount, params, noise, S, vS, A, dt, s)
% QStar  Sum of expected discounted rewards over all actions from state s,
%   recursing on successor states until params.depthLimit is reached.
%
%   depth - current recursion depth; the expansion stops once it reaches
%           params.depthLimit, returning 0 for the truncated subtree.
%   s     - 2-by-1 state [theta; thetaDot]; vS - 2-by-N discretized states.

r = 0;
% Truncate the lookahead at the configured depth limit.
if depth >= params.depthLimit
    return;
end
for i = 1:size(A, 2)
    a = A(1, i);
    % Deterministic one-step prediction under action a.
    sPredicted = simulateOneStep(s(1,1), s(2,1), dt, a);
    T = transitionProbabilities(S, sPredicted, params, noise);
    T = T(1,:) .* T(2,:);
    % size(vS,2) rather than length(vS): length() of a 2-by-N matrix
    % returns 2 when N < 2, silently truncating the successor loop.
    for j = 1:size(vS, 2)
        psPrime = T(1,j);
        % Skip negligible-probability successors.
        if psPrime < 0.001
            continue;
        end
        sPrime = vS(:,j);
        % Bellman equation: immediate reward plus discounted future value.
        r = r + psPrime * (getReward(params, sPrime) + ...
            discount * QStar(depth + 1, discount, params, noise, S, vS, A, dt, sPrime));
    end
end
end