# env.py
import gym
import numpy as np
from gym.spaces import Box
from ray.rllib.env.multi_agent_env import MultiAgentEnv

class Minority_Game(MultiAgentEnv):
    def __init__(self, configs):
        # Store the environment's configs
        self.configs = configs
        # History of the binary aggregated outcomes D(t), each derived from
        # A = \sum_{i=1}^{N} a_i(t) (see step())
        self.history = []
        # Each agent takes an action in [-1, 1]
        self.action_space = gym.spaces.Box(
            low=-1,
            high=1,
            shape=(1,)
        )
        # All agents receive the same observation: the past m aggregated outcomes
        self.observation_space = gym.spaces.Dict(
            {
                "history": Box(
                    0,
                    1,
                    shape=(self.configs.get('memory'),)
                ),
            }
        )
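    # For example (a sketch, not from the original file: assume memory m = 4):
    # each agent would observe {"history": array([0., 1., 1., 0.], dtype=float32)},
    # i.e. the last four aggregated outcomes D(t).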
    def reset(self):
        """
        Resets the environment.
        """
        # Reset the timestep counter
        self.timestep = 0
        # Reset the history to a vector of 0's
        self.history = [0] * self.configs.get('memory')
        # Retrieve the observations of the reset environment. All agents receive
        # the same observation; as the MultiAgentEnv API expects, observations are
        # keyed by agent id (integer ids 0..N-1 are assumed here)
        obs = np.array(self.history[-self.configs.get('memory'):], dtype=np.float32)
        observations = {
            agent_id: {"history": obs}
            for agent_id in range(self.configs.get('n_agents'))
        }
        return observations
    def step(self, actions):
        """
        Takes one transition step in the environment.
        :args actions: dictionary containing the action decided by each agent
        :output observations: dictionary containing the next observation for each agent
        :output rewards: dictionary containing the reward for each agent
        :output done: dictionary whose "__all__" key reflects whether the episode is finished
        :output info: dictionary containing any additional episode information
        """
        # Increment the timestep counter
        self.timestep += 1
        # Aggregate the individual actions, A(t) = \sum_{i=1}^{N} a_i(t);
        # actions is a dict keyed by agent id, so sum over its values
        A = np.sum(list(actions.values()))
        N = self.configs.get('n_agents')
        # Convert the aggregated action into a binary value,
        # D(t) = \frac{1}{2}[Sgn(2A - N) + 1] \in {0, 1}
        D = 0.5 * (np.sign(2 * A - N) + 1)
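        # For example (a worked case, not from the original: assume N = 5 agents
        # whose actions sum to A = 4): 2A - N = 3, Sgn(3) = 1, so
        # D = 0.5 * (1 + 1) = 1.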
        # Store the aggregated outcome in the history
        self.history.append(int(D))
        # The last m history bits form the universal observation shared by all agents
        obs = np.array(self.history[-self.configs.get('memory'):], dtype=np.float32)
        observations = {agent_id: {"history": obs} for agent_id in actions}
        # Compute the true reward of each agent: agents on the minority side of A gain
        rewards = {
            agent_id: float(-np.sign(A) * np.asarray(action).item())
            for agent_id, action in actions.items()
        }
        # The episode is finished once the timestep reaches the maximum number of steps
        max_steps = self.configs.get('max_steps')
        if max_steps is not None and max_steps != 'None':
            done = {"__all__": self.timestep == max_steps}
        else:
            done = {"__all__": False}
        # Return extra info
        info = {
            'A': A,
            'actions': actions,
            'D': D,
            'timestep': self.timestep,
        }
        return observations, rewards, done, info
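# A minimal usage sketch (an assumption, not part of the original file): roll the
# environment forward with random actions until the episode ends. The config keys
# 'n_agents', 'memory', and 'max_steps' follow the ones read above, and integer
# agent ids 0..N-1 are assumed.
if __name__ == "__main__":
    configs = {'n_agents': 5, 'memory': 4, 'max_steps': 10}
    env = Minority_Game(configs)
    observations = env.reset()
    done = {"__all__": False}
    while not done["__all__"]:
        # Sample one random action per agent from the shared action space
        actions = {
            agent_id: env.action_space.sample()
            for agent_id in range(configs['n_agents'])
        }
        observations, rewards, done, info = env.step(actions)
        print(info['timestep'], info['A'], info['D'])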