-
Notifications
You must be signed in to change notification settings - Fork 0
/
UCB.py
executable file
·54 lines (38 loc) · 1.24 KB
/
UCB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import numpy as np
import matplotlib.pyplot as plt
num_trails = 10000
bandit_prob = [0.2 , 0.5 , 0.8]
class Bandit():
def __init__(self , p):
self.p = p
self.p_est = 0.
self.N = 1.
def pull(self):
return np.random.random()<self.p
def update(self , x):
self.N+=1
self.p_est = ((self.N - 1)*self.p_est + x) /self.N
def experiment():
rewards = np.zeros(num_trails)
bandits = [Bandit(g) for g in bandit_prob]
n = 0
def ucb(mean , n , nj) :
return mean + np.sqrt(2*np.log(n)/nj)
for b in range(len(bandits)):
x = bandits[b].pull()
bandits[b].update(x)
n+=1
for i in range(num_trails):
j = np.argmax([ucb(b.p_est , n , b.N) for b in bandits])
x = bandits[j].pull()
n+=1
rewards[i] = x
bandits[j].update(x)
cumulative_rewards = np.cumsum(rewards)
win_rates = cumulative_rewards / (np.arange(num_trails) + 1)
plt.ylim([0, 1])
plt.plot(win_rates)
plt.plot(np.ones(num_trails) * np.max(bandit_prob))
plt.show()
if __name__ == "__main__":
experiment()