-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathMDP.m
More file actions
99 lines (80 loc) · 2.74 KB
/
MDP.m
File metadata and controls
99 lines (80 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
function Policy = MDP(params, noise, S, A)
% MDP  Compute the optimal policy for the inverted pendulum.
%   Policy = MDP(params, noise, S, A) returns a 3-by-N matrix: rows 1-2
%   hold every discretized state [theta; thetaDot] and row 3 holds the
%   best action for that state (chosen by VStar).
%
%   params - struct with fields discount, dt, depthLimit (and others used
%            by the helpers)
%   noise  - noise model forwarded to transitionProbabilities
%   S      - 2-by-k grid: S(1,:) theta values, S(2,:) thetaDot values
%   A      - 1-by-m row vector of candidate actions
%
%   Complexity: O(A * S^2) per lookahead level.

% Generate all possible state vectors, e.g.
% [theta1 theta1 theta2 theta2; thetaDot1 thetaDot2 thetaDot1 thetaDot2]
[Thetas, ThetaDots] = meshgrid(S(1,:), S(2,:));
vS = [reshape(Thetas, 1, numel(Thetas)); reshape(ThetaDots, 1, numel(ThetaDots))];
discount = params.discount;
dt = params.dt;
% size(vS,2) is the state count; length() on a 2-by-N matrix would
% return 2 when N < 2.
totalStates = size(vS, 2);
% Preallocate instead of growing the arrays on every loop iteration.
Policy = zeros(2, totalStates);
bestActions = zeros(1, totalStates);
for i = 1:totalStates
    % Progress indicator: no trailing semicolon so MATLAB echoes it.
    PercentageCompleted = i/totalStates * 100
    Policy(:,i) = vS(:,i);
    bestActions(:,i) = VStar(discount, params, noise, S, vS, A, dt, vS(:,i));
end
Policy = [Policy; bestActions];
end
function a = VStar(discount, params, noise, S, vS, A, dt, s)
% VStar  Return the action with the highest expected discounted reward
%   starting from state s, evaluating every action against every
%   reachable discretized successor state (Bellman backup via QStar).
%
%   s is a 2-by-1 state [theta; thetaDot]; vS is the 2-by-N matrix of all
%   discretized states; A is a 1-by-m row of actions.

numActions = size(A, 2);
R = zeros(2, numActions);   % row 1: action value, row 2: expected reward
depth = 0;                  % root of the bounded-depth lookahead
for i = 1:numActions
    % Commit to action a
    a = A(1, i);
    R(1,i) = a;
    % Deterministic one-step prediction under action a.
    sPredicted = simulateOneStep(s(1,1), s(2,1), dt, a);
    % Joint (theta, thetaDot) transition probability over the grid.
    T = transitionProbabilities(S, sPredicted, params, noise);
    T = T(1,:) .* T(2,:);
    for j = 1:size(vS, 2)
        psPrime = T(1,j);
        % Skip negligible-probability successors to save work.
        if psPrime < 0.001
            continue;
        end
        sPrime = vS(:,j);
        % Bellman equation: immediate reward plus discounted future value.
        R(2,i) = R(2,i) + psPrime * (getReward(params, sPrime) + ...
            discount * QStar(depth + 1, discount, params, noise, S, vS, A, dt, sPrime));
    end
end
% Argmax over expected rewards. The original loop `for i = 2:length(R)`
% indexed out of bounds with a single action (length of a 2-by-1 matrix
% is 2); max() returns the first maximal index, matching the old
% strict-> tie-break.
[~, maxIndex] = max(R(2,:));
a = R(1, maxIndex);
end
function r = QStar(depth, discount, params, noise, S, vS, A, dt, s)
% QStar  Sum of expected discounted rewards over all actions from state s,
%   recursing on successor states until params.depthLimit is reached.
%
%   depth - current recursion depth; the expansion stops once it reaches
%           params.depthLimit, returning 0 for the truncated subtree.
%   s     - 2-by-1 state [theta; thetaDot]; vS - 2-by-N discretized states.

r = 0;
% Truncate the lookahead at the configured depth limit.
if depth >= params.depthLimit
    return;
end
for i = 1:size(A, 2)
    a = A(1, i);
    % Deterministic one-step prediction under action a.
    sPredicted = simulateOneStep(s(1,1), s(2,1), dt, a);
    T = transitionProbabilities(S, sPredicted, params, noise);
    T = T(1,:) .* T(2,:);
    % size(vS,2) rather than length(vS): length() of a 2-by-N matrix
    % returns 2 when N < 2, silently truncating the successor loop.
    for j = 1:size(vS, 2)
        psPrime = T(1,j);
        % Skip negligible-probability successors.
        if psPrime < 0.001
            continue;
        end
        sPrime = vS(:,j);
        % Bellman equation: immediate reward plus discounted future value.
        r = r + psPrime * (getReward(params, sPrime) + ...
            discount * QStar(depth + 1, discount, params, noise, S, vS, A, dt, sPrime));
    end
end
end