-
Notifications
You must be signed in to change notification settings - Fork 0
/
cartpole_learn.py
158 lines (130 loc) · 4.65 KB
/
cartpole_learn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
'''
This is the main file for implementing your cartpole control logic.
There is only one TODO, but it includes writing some setup code at
the global scope to compute the LQR control gains, K and then
applying those gains within the policyfn() method.
There are many other things going on in the code lower in this file
and in the accompanying files, which you can
mostly ignore.
'''
import numpy as np
import scipy.linalg
from plant import gTrig_np
from cartpole import default_params
from cartpole import CartpoleDraw
#np.random.seed(31337)
np.set_printoptions(linewidth=500)
np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
# An LQR helper function that you can call in your solution.
# Note: We expect this code to return with an error for invalid
# A, B, Q, and R values such as the defaults we entered in the starter code.
# You must derive the proper matrices before this will start working.
def lqr( A, B, Q, R ):
    '''
    Compute the continuous-time infinite-horizon LQR gain matrix.

    Solves the continuous algebraic Riccati equation for the system
    xdot = A x + B u with cost integral(x'Qx + u'Ru) and returns the
    feedback gain K = R^-1 B' X, so the optimal control is u = -K x.

    @param A state matrix, shape (n, n)
    @param B input matrix, shape (n, m)
    @param Q state cost matrix, shape (n, n), positive semi-definite
    @param R input cost matrix, shape (m, m), positive definite
    @return gain matrix K, shape (m, n)
    @raise LinAlgError / ValueError for matrices with no stabilizing solution
    '''
    X = scipy.linalg.solve_continuous_are( A, B, Q, R )
    # BUG FIX: the original `np.linalg.inv(R) * np.dot(B.T, X)` used `*`,
    # which is an ELEMENTWISE product on ndarrays — it only gave the right
    # answer because R happened to be 1x1.  Solving R K = B' X is both the
    # correct matrix operation for multi-input systems and numerically
    # preferable to forming an explicit inverse.
    return scipy.linalg.solve( R, B.T.dot(X) )
# Physical constants for the cartpole (SI-style units) that you can use
# in your solution; these parameterize the linearized dynamics below.
g = 9.82   # gravitational acceleration
m = 0.5    # pendulum mass
M = 0.5    # cart mass
l = 0.5    # pendulum length
b = 0.1    # cart friction coefficient

# Linearization of the cartpole dynamics about the upright equilibrium,
# for the state ordering [x, xdot, thetadot, theta].
# Shared denominators of the cart-acceleration (xddot) and pole
# angular-acceleration (thetaddot) equations.
xddot_den = 4 * (M + m) - 3 * m
tddot_den = l * (4 * (M + m) - 3 * m)

# Nonzero entries of the state matrix.
a11 = -4 * b / xddot_den            # friction's effect on xddot
a13 = 3 * m * g / xddot_den         # gravity's effect on xddot via theta
a21 = -6 * b / tddot_den            # friction's effect on thetaddot
a23 = 6 * (M + m) * g / tddot_den   # gravity's effect on thetaddot

A = np.array([[0, 1,   0, 0  ],
              [0, a11, 0, a13],
              [0, a21, 0, a23],
              [0, 0,   1, 0  ]])

# Control-input matrix: force on the cart enters xddot and thetaddot.
b1 = 4 / xddot_den
b2 = -6 / tddot_den
B = np.array([[0.],
              [b1],
              [b2],
              [0.]])

# Cost weights: penalize only the pole angle in the state, and penalize
# control effort heavily.
Q = np.diag([0, 0, 0, 3])
R = np.array([[600]])

print( "A holds:\n", A)
print( "B holds:\n", B)
print( "Q holds:\n", Q)
print( "R holds:\n", R)
# With valid A, B, Q, R matrices filled in above, compute the LQR
# feedback gains used by policyfn.
k = lqr( A, B, Q, R )
print( "k holds:\n", k)
def policyfn( x ):
    '''
    Full-state LQR feedback policy: map the (trig-expanded) plant state
    to a scalar control wrapped in a 1-element array.

    NOTE: this shifts the angle entry x[3] by -pi IN PLACE, so the
    caller's array is modified — apply_controller reads the shifted
    value afterwards when computing its error metric.
    '''
    # Re-center the pole angle so the upright equilibrium maps to zero.
    x[3] = x[3] - np.pi
    # u = K x, using the module-level gains k computed above.
    return np.array([k.dot(x)])
def apply_controller(plant,params,H,policy=None):
    '''
    Starts the plant and applies the current policy to the plant for a
    duration specified by H (in seconds), printing the accumulated
    pole-tip tracking error at the end.

    @param plant is a class that controls our robot (simulation)
    @param params is a dictionary with some useful values
           (only params['angle_dims'] is read here)
    @param H Horizon for applying controller (in seconds)
    @param policy is a function pointer to the code that implements your
           control solution. It will be called as u = policy( state )
    '''
    # start robot
    x_t, t = plant.get_plant_state()
    if plant.noise is not None:
        # Randomize the initial state: correlated noise via the Cholesky
        # factor of the plant's noise covariance.
        # (Removed dead local `Sx_t`, which was assigned but never read.)
        L_noise = np.linalg.cholesky(plant.noise)
        x_t = x_t + np.random.randn(x_t.shape[0]).dot(L_noise)
    sum_of_error = 0
    H_steps = int(np.ceil(H/plant.dt))
    for i in range(H_steps):
        # convert input angle dimensions to complex representation
        x_t_ = gTrig_np(x_t[None,:], params['angle_dims']).flatten()
        # Get command from policy (this should be fast, or at least account
        # for delays in processing).
        # NOTE(review): the policy used here (policyfn) mutates x_t_[3] in
        # place, shifting it by -pi; the sin/cos below observe that shift.
        u_t = policy(x_t_)
        # send command to robot:
        plant.apply_control(u_t)
        plant.step()
        x_t, t = plant.get_plant_state()

        # Accumulate the distance from the pole tip to the goal position
        # (the tip directly above the track origin).
        pole_len = plant.params['l']   # renamed from `l`: avoid shadowing the builtin / module global
        st = np.sin(x_t_[3])
        ct = np.cos(x_t_[3])
        goal = np.array([0, pole_len])
        end = np.array([x_t[0] + pole_len*st, -pole_len*ct])
        dist = np.hypot(goal[0]-end[0], goal[1]-end[1])
        sum_of_error = sum_of_error + dist

        if plant.noise is not None:
            # randomize state (same correlated noise as the initial state)
            x_t = x_t + np.random.randn(x_t.shape[0]).dot(L_noise)
        if plant.done:
            break
    print("Error this episode was: ",sum_of_error)
    # stop robot
    plant.stop()
def main():
    '''
    Build the simulated cartpole and its visualization, then repeatedly
    run the LQR controller from a freshly reset state.
    '''
    n_episodes = 5   # number of controller episodes to execute
    horizon = 10     # duration of each episode, in seconds

    learner_params = default_params()
    plant_params = learner_params['params']['plant']
    plant = learner_params['plant_class'](**plant_params)

    # Attach and start the visualizer before running the controller.
    draw_cp = CartpoleDraw(plant)
    draw_cp.start()
    input("Press ENTER...")

    # Execute the policy on the (simulated) robot, once per episode.
    for _ in range(n_episodes):
        plant.reset_state()
        apply_controller(plant, learner_params['params'], horizon, policyfn)
if __name__ == '__main__':
main()