TD3.py
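"""
Training entry point for the TD3 agent on the energy-storage scheduling environment (ESSEnv).

The script fills a replay buffer by exploration, trains AgentTD3 for num_episode episodes
while logging critic/actor losses and episode rewards, optionally saves and reloads the
actor network, runs a single test episode, and compares the resulting operation cost
against the Pyomo-based optimization baseline from tools.optimization_base_result.
"""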
import pickle

import pandas as pd
import torch

from agent import AgentTD3
from random_generator_battery import ESSEnv
from tools import Arguments, get_episode_return, test_one_episode, ReplayBuffer, optimization_base_result

def update_buffer(_trajectory):
    """Convert one exploration trajectory to tensors and push it into the replay buffer.

    Uses the module-level `gamma` and `buffer` defined in the __main__ block below.
    """
    ten_state = torch.as_tensor([item[0] for item in _trajectory], dtype=torch.float32)
    ary_other = torch.as_tensor([item[1] for item in _trajectory])
    # ary_other columns: (reward, done, *action); the reward column is stored unchanged
    ary_other[:, 1] = (1.0 - ary_other[:, 1]) * gamma  # ten_mask = (1.0 - ary_done) * gamma

    buffer.extend_buffer(ten_state, ary_other)

    _steps = ten_state.shape[0]
    _r_exp = ary_other[:, 0].mean()  # mean reward of the collected transitions
    return _steps, _r_exp
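
# The (reward, mask, action) packing above follows the ElegantRL-style buffer convention:
# folding the done flag and the discount into mask = (1 - done) * gamma means the critic
# target can presumably be formed inside agent.update_net() as
#   q_target = reward + mask * Q_target(next_state, next_action)
# without carrying gamma and done separately.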

if __name__ == '__main__':
    args = Arguments()
    reward_record = {'episode': [], 'steps': [], 'mean_episode_reward': [], 'unbalance': []}
    loss_record = {'episode': [], 'steps': [], 'critic_loss': [], 'actor_loss': [], 'entropy_loss': []}
    args.visible_gpu = '1'

    if bool(args.random_seed_list):
        for seed in args.random_seed_list:
            args.random_seed = seed
            args.agent = AgentTD3()
            agent_name = f'{args.agent.__class__.__name__}'
            args.agent.cri_target = True
            args.init_before_training(if_main=True)

            '''init agent and environment'''
            agent = args.agent
            env = args.env
            agent.init(args.net_dim, env.state_space.shape[0], env.action_space.shape[0],
                       args.learning_rate, args.if_per_or_gae)

            '''init replay buffer'''
            buffer = ReplayBuffer(max_len=args.max_memo, state_dim=env.state_space.shape[0],
                                  action_dim=env.action_space.shape[0])

            '''start training'''
            cwd = args.cwd
            gamma = args.gamma
            batch_size = args.batch_size  # how much data is used for each network update
            target_step = args.target_step  # how many environment steps to collect per exploration rollout
            repeat_times = args.repeat_times  # how many update rounds are run per batch of collected data
            soft_update_tau = args.soft_update_tau
            agent.state = env.reset()

            '''collect data, then train and update the networks'''
            num_episode = args.num_episode
            # Toggle the individual stages of the pipeline here if needed:
            # args.train = False
            # args.save_network = False
            # args.test_network = False
            # args.save_test_data = False
            # args.compare_with_pyomo = False
            if args.train:
                # warm-up: keep exploring until the buffer holds at least 10000 transitions
                collect_data = True
                while collect_data:
                    print(f'buffer:{buffer.now_len}')
                    with torch.no_grad():
                        trajectory = agent.explore_env(env, target_step)
                        steps, r_exp = update_buffer(trajectory)
                    buffer.update_now_len()
                    if buffer.now_len >= 10000:
                        collect_data = False

                for i_episode in range(num_episode):
                    critic_loss, actor_loss = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)
                    loss_record['critic_loss'].append(critic_loss)
                    loss_record['actor_loss'].append(actor_loss)
                    with torch.no_grad():
                        episode_reward, episode_unbalance = get_episode_return(env, agent.act, agent.device)
                        reward_record['mean_episode_reward'].append(episode_reward)
                        reward_record['unbalance'].append(episode_unbalance)
                    print(f'current episode is {i_episode}, reward: {episode_reward}, '
                          f'unbalance: {episode_unbalance}, buffer_length: {buffer.now_len}')
                    if i_episode % 10 == 0:
                        # collect another target_step transitions every 10 episodes
                        with torch.no_grad():
                            trajectory = agent.explore_env(env, target_step)
                            steps, r_exp = update_buffer(trajectory)

            loss_record_path = f'{args.cwd}/loss_data.pkl'
            reward_record_path = f'{args.cwd}/reward_data.pkl'
            # currently only the actor corresponding to the last seed is stored
            with open(loss_record_path, 'wb') as tf:
                pickle.dump(loss_record, tf)
            with open(reward_record_path, 'wb') as tf:
                pickle.dump(reward_record, tf)

            act_save_path = f'{args.cwd}/actor.pth'
            if args.save_network:
                torch.save(agent.act.state_dict(), act_save_path)
                print('actor parameters have been saved')

            if args.test_network:
                args.cwd = agent_name
                agent.act.load_state_dict(torch.load(act_save_path))
                print('parameters have been reloaded for testing')
                record = test_one_episode(env, agent.act, agent.device)
                eval_data = pd.DataFrame(record['information'])
                eval_data.columns = ['time_step', 'price', 'netload', 'action', 'real_action', 'soc', 'battery',
                                     'gen1', 'gen2', 'gen3', 'unbalance', 'operation_cost']

            if args.save_test_data:
                test_data_save_path = f'{args.cwd}/test_data.pkl'
                with open(test_data_save_path, 'wb') as tf:
                    pickle.dump(record, tf)

            '''compare with the pyomo optimization results'''
            if args.compare_with_pyomo:
                month = record['init_info'][0][0]
                day = record['init_info'][0][1]
                initial_soc = record['init_info'][0][3]
                print(initial_soc)
                base_result = optimization_base_result(env, month, day, initial_soc)

            if args.plot_on:
                from plotDRL import PlotArgs, make_dir, plot_evaluation_information, plot_optimization_result

                plot_args = PlotArgs()
                plot_args.feature_change = ''
                args.cwd = agent_name
                plot_dir = make_dir(args.cwd, plot_args.feature_change)
                plot_optimization_result(base_result, plot_dir)
                plot_evaluation_information(args.cwd + '/' + 'test_data.pkl', plot_dir)

            '''compare the operation cost obtained from pyomo and from the DRL agent'''
            ratio = sum(eval_data['operation_cost']) / sum(base_result['step_cost'])
            print(sum(eval_data['operation_cost']))
            print(sum(base_result['step_cost']))
            print(ratio)
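
# Typical usage (an assumption based on the repository layout, with agent.py, tools.py,
# plotDRL.py and random_generator_battery.py on the Python path):
#   python TD3.py
# Hyperparameters such as num_episode, target_step, batch_size and the random seed list
# come from tools.Arguments; training, testing, saving and the pyomo comparison are
# switched on and off through the corresponding args flags used above.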