"""Agent classes: an abstract base class, a random agent, and an optimizing
(strategy-table) agent."""
from abc import ABC, abstractmethod

import numpy as np


class Agent(ABC):
    """Abstract base class for agents."""

    def __init__(self, configs):
        self.configs = configs
        super().__init__()

    @abstractmethod
    def compute_action(self, observation):
        """Return the agent's action for the current observation."""

    def update_valuation(self, observation, info):
        """Optional hook for updating internal valuations after each timestep."""


class Random_Agent(Agent):
    """An agent that selects an action uniformly at random from the valid
    actions at every timestep."""

    def compute_action(self, observation):
        """
        :param observation: the observation presented to the agent at the current timestep
        :return: the agent's action for the current observation
        """
        return np.random.choice(self.configs['actions'])
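
# A minimal usage sketch for Random_Agent. The config values below are
# illustrative assumptions; 'actions' is the only key the class reads:
#
#   agent = Random_Agent({'actions': [-1, 1]})
#   action = agent.compute_action(observation=None)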


class Optimizing_Agent(Agent):
    """An agent that plays the opposite of the prediction made by its
    best-scoring strategy."""

    def __init__(self, configs):
        # Store the configs via the base class
        super().__init__(configs)
        # Endow the agent with S strategies, each a lookup table of size 2^m,
        # where m is the memory length
        self.strategies = np.random.choice(
            [0, 1],
            size=(self.configs.get('n_strategies'), np.power(2, self.configs.get('memory')))
        )
        # Rolling buffer of the last tau counterfactual rewards per strategy
        self.time_horizon = np.zeros((self.configs.get('tau'), self.configs.get('n_strategies')))
        # Initialize the valuation of each strategy to 0
        self.valuation = np.sum(self.time_horizon, axis=0)
        # Cumulative hypothetical scores per strategy
        self.hypothetical_scores = np.zeros(self.configs.get('n_strategies'))
        # Most recent counterfactual rewards per strategy
        self.counterfactual_rewards = np.zeros(self.configs.get('n_strategies'))

    def compute_action(self, observation):
        # Identify the strategies with the highest valuation
        best_strategies = np.where(self.valuation == np.max(self.valuation))[0]
        # Break ties uniformly at random among the best strategies
        if best_strategies.shape[0] > 1:
            strategy = np.random.choice(best_strategies)
        else:
            strategy = best_strategies[0]
        # Encode the binary history as an integer index into the strategy table
        history = int("".join(str(x) for x in observation), 2)
        prediction = self.strategies[strategy][history]
        # The agent takes the opposite of the prediction made by the best strategy
        return -1 if prediction == 1 else 1
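
    # Worked example of the history encoding used above and in
    # update_valuation below: with memory m = 3 and observation [1, 0, 1],
    # "".join(...) yields "101", so history = int("101", 2) = 5 and the
    # chosen strategy's prediction is self.strategies[strategy][5].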

    def update_valuation(self, observation, info):
        # Convert the binary history to an integer for table lookup
        history = int("".join(str(x) for x in observation), 2)
        # A strategy whose implied action opposes the sign of the aggregate
        # action A gets +1; -1 otherwise
        counterfactual_rewards = np.zeros(self.configs.get('n_strategies'))
        # Compute the counterfactual reward for each strategy
        for strategy_id, strategy in enumerate(self.strategies):
            action = -1 if strategy[history] == 1 else 1
            counterfactual_rewards[strategy_id] = -np.sign(info.get('A')) * action
        # Update the cumulative hypothetical score of each strategy
        self.hypothetical_scores += counterfactual_rewards
        # Store the most recent counterfactual rewards
        self.counterfactual_rewards = counterfactual_rewards
        # After a "reasonable" number of warm-up iterations, maintain a rolling
        # window of cumulative strategy scores
        if info.get('timestep') > self.configs.get('THMG_horizon'):
            # Write into the rolling time-horizon buffer
            self.time_horizon[info.get('timestep') % self.configs.get('tau')] = counterfactual_rewards
            # The valuation is the sum over the rolling window
            self.valuation = np.sum(self.time_horizon, axis=0)
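

# A minimal end-to-end sketch of how these agents might be driven. The loop
# below is a stand-in assumption, not part of this module: it supplies a
# binary history of length 'memory' as the observation and an info dict with
# the aggregate action 'A' and the current 'timestep', which is the interface
# update_valuation expects. All config values are illustrative.
if __name__ == "__main__":
    configs = {
        'actions': [-1, 1],    # assumed minority-game actions
        'n_strategies': 4,     # S strategies per agent
        'memory': 3,           # m: lookup tables of size 2^m
        'tau': 10,             # rolling-window length
        'THMG_horizon': 20,    # warm-up before the rolling window starts
    }
    # An odd number of agents guarantees a strict minority (A is never 0)
    agents = ([Optimizing_Agent(configs) for _ in range(5)]
              + [Random_Agent(configs) for _ in range(4)])

    history = list(np.random.choice([0, 1], size=configs['memory']))
    for timestep in range(100):
        actions = np.array([agent.compute_action(history) for agent in agents])
        info = {'A': actions.sum(), 'timestep': timestep}
        for agent in agents:
            agent.update_valuation(history, info)
        # Slide the m-bit history window: record 1 if the aggregate was
        # negative (i.e. the +1 side was the minority), else 0
        history = history[1:] + [int(info['A'] < 0)]
    print("final aggregate action:", info['A'])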