# cart-pole-modelV1.py

from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
import time, math
from typing import Tuple
import gym

# Create the CartPole environment (old Gym API: step() returns a 4-tuple).
env = gym.make('CartPole-v1')

# Trivial baseline policy: always push the cart to the right (action 1).
policy = lambda obs: 1
# Roll out the baseline policy for a few short, rendered episodes.
for _ in range(5):
    obs = env.reset()
    for _ in range(80):
        action = policy(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        time.sleep(0.05)
env.close()
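
# The Q-learning agent below uses only the pole angle (obs[2]) and pole angular
# velocity (obs[3]); both are discretized into a small grid of bins so the
# Q-table stays a manageable size.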
# Simple hand-written policy: push in the direction the pole tip is moving.
policy = lambda _, __, ___, tip_velocity: int(tip_velocity > 0)

# Discretization grid: 6 bins for the pole angle, 12 bins for its angular velocity.
n_bins = (6, 12)
lower_bounds = [env.observation_space.low[2], -math.radians(50)]
upper_bounds = [env.observation_space.high[2], math.radians(50)]
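# Note: the env reports the angular-velocity bounds as infinite, so a fixed
# +/-50 deg/s range is used for binning instead.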

def discretizer(_, __, angle, pole_velocity) -> Tuple[int, ...]:
    """Convert a continuous state into a discrete (angle bin, velocity bin) pair."""
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int, est.transform([[angle, pole_velocity]])[0]))
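# For example, a state near the upright equilibrium (angle ~ 0, velocity ~ 0)
# maps to a bin pair near the centre of the (6, 12) grid.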

# Q-table indexed by (angle bin, velocity bin, action); shape (6, 12, 2).
Q_table = np.zeros(n_bins + (env.action_space.n,))

def policy(state: tuple):
    """Greedy action selection from the Q-table (exploration is handled in the training loop)."""
    return np.argmax(Q_table[state])

def new_Q_value(reward: float, new_state: tuple, discount_factor=1) -> float:
    """Temporal-difference target for updating the Q-value of a state-action pair."""
    future_optimal_value = np.max(Q_table[new_state])
    learned_value = reward + discount_factor * future_optimal_value
    return learned_value
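# The training loop below applies the standard tabular Q-learning update:
#   Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + discount * max_a' Q(s', a'))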

# Adaptive learning rate: decays with the episode number, floored at min_rate.
def learning_rate(n: int, min_rate=0.01) -> float:
    """Decaying learning rate."""
    return max(min_rate, min(1.0, 1.0 - math.log10((n + 1) / 25)))

def exploration_rate(n: int, min_rate=0.1) -> float:
    """Decaying exploration rate (epsilon), floored at min_rate."""
    return max(min_rate, min(1.0, 1.0 - math.log10((n + 1) / 25)))
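# Both schedules stay at 1.0 for roughly the first 24 episodes, then decay
# logarithmically until they reach their respective floors after a few hundred
# episodes.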

# Training loop
n_episodes = 10000
for e in range(n_episodes):
    # Discretize the initial state into buckets
    current_state, done = discretizer(*env.reset()), False

    while not done:
        # Greedy action from the Q-table (exploit)
        action = policy(current_state)

        # With probability epsilon, take a random action instead (explore)
        if np.random.random() < exploration_rate(e):
            action = env.action_space.sample()

        # Step the environment
        obs, reward, done, _ = env.step(action)
        new_state = discretizer(*obs)

        # Q-learning update
        lr = learning_rate(e)
        learnt_value = new_Q_value(reward, new_state)
        old_value = Q_table[current_state][action]
        Q_table[current_state][action] = (1 - lr) * old_value + lr * learnt_value

        current_state = new_state

        # Render the cartpole environment
        env.render()
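
# Close the render window once training finishes.
env.close()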