import torch
from elegantrl.agents.AgentBase import AgentBase
from elegantrl.agents.net import Actor, Critic
from elegantrl.agents.AgentDDPG import AgentDDPG
class AgentMADDPG(AgentBase):
    """
    Bases: ``AgentBase``

    Multi-Agent DDPG algorithm. "Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments". R. Lowe et al. 2017.

    :param net_dim[int]: the dimension of the networks (the width of the neural networks)
    :param state_dim[int]: the dimension of the state (the length of the state vector)
    :param action_dim[int]: the dimension of the action (the length of the continuous action vector)
    :param learning_rate[float]: the learning rate of the optimizer
    :param gamma[float]: the discount factor of future rewards
    :param n_agents[int]: the number of agents
    :param if_use_per[bool]: use PER (Prioritized Experience Replay) for sparse reward
    :param env_num[int]: the env number of VectorEnv. env_num == 1 means don't use VectorEnv
    :param agent_id[int]: if visible_gpu is '1,9,3,4', agent_id=1 means (1,9,3,4)[agent_id] == 9
    """

    def __init__(self):
        super().__init__()
        self.ClassAct = Actor
        self.ClassCri = Critic
        self.if_use_cri_target = True
        self.if_use_act_target = True

    def init(
        self,
        net_dim,
        state_dim,
        action_dim,
        learning_rate=1e-4,
        gamma=0.95,
        n_agents=1,
        if_use_per=False,
        env_num=1,
        agent_id=0,
    ):
        self.agents = [AgentDDPG() for _ in range(n_agents)]
        self.explore_env = self.explore_one_env
        self.if_off_policy = True
        self.n_agents = n_agents
        for i in range(self.n_agents):
            # Forward the arguments instead of hard-coding the defaults, so every
            # sub-agent uses the requested hyper-parameters.
            self.agents[i].init(
                net_dim,
                state_dim,
                action_dim,
                learning_rate=learning_rate,
                n_agents=self.n_agents,
                if_use_per=if_use_per,
                env_num=env_num,
                agent_id=agent_id,
            )
        self.n_states = state_dim
        self.n_actions = action_dim
        self.batch_size = net_dim  # placeholder; overwritten in update_net()
        self.gamma = gamma
        self.update_tau = 0
        self.device = torch.device(
            f"cuda:{agent_id}"
            if (torch.cuda.is_available() and (agent_id >= 0))
            else "cpu"
        )
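
    # Construction sketch (the dimensions below are illustrative assumptions,
    # not defaults of this module):
    #   agent = AgentMADDPG()
    #   agent.init(net_dim=256, state_dim=24, action_dim=2, n_agents=3, agent_id=0)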

    def update_agent(self, rewards, dones, actions, observations, next_obs, index):
        """
        Update the networks of a single agent; called by ``update_net``.

        :param rewards: reward list of the sampled buffer
        :param dones: done list of the sampled buffer
        :param actions: action list of the sampled buffer
        :param observations: observation list of the sampled buffer
        :param next_obs: next_observation list of the sampled buffer
        :param index: ID of the agent
        """
        curr_agent = self.agents[index]

        # ---- Centralized critic update ----
        all_target_actions = []
        for i in range(self.n_agents):
            if i == index:
                all_target_actions.append(curr_agent.act_target(next_obs[:, index]))
            else:
                action = self.agents[i].act_target(next_obs[:, i])
                all_target_actions.append(action)
        action_target_all = (
            torch.cat(all_target_actions, dim=1)
            .to(self.device)
            .reshape(actions.shape[0], actions.shape[1] * actions.shape[2])
        )
        # Note: the `dones` argument is not used here, so terminal transitions
        # are not masked out of the TD target.
        target_value = rewards[:, index] + self.gamma * curr_agent.cri_target(
            next_obs.reshape(next_obs.shape[0], next_obs.shape[1] * next_obs.shape[2]),
            action_target_all,
        ).detach().squeeze(dim=1)
        actual_value = curr_agent.cri(
            observations.reshape(
                observations.shape[0], observations.shape[1] * observations.shape[2]
            ),
            actions.reshape(actions.shape[0], actions.shape[1] * actions.shape[2]),
        ).squeeze(dim=1)
        vf_loss = curr_agent.loss_td(actual_value, target_value)  # target detached above

        # ---- Decentralized actor update ----
        curr_pol_out = curr_agent.act(observations[:, index])
        all_pol_acs = []
        for i in range(self.n_agents):
            if i == index:
                all_pol_acs.append(curr_pol_out)  # differentiable action of this agent
            else:
                all_pol_acs.append(actions[:, i])  # buffered actions of the other agents
        pol_loss = -torch.mean(
            curr_agent.cri(
                observations.reshape(
                    observations.shape[0], observations.shape[1] * observations.shape[2]
                ),
                torch.cat(all_pol_acs, dim=1)
                .to(self.device)
                .reshape(actions.shape[0], actions.shape[1] * actions.shape[2]),
            )
        )

        curr_agent.act_optim.zero_grad()
        pol_loss.backward()
        curr_agent.act_optim.step()

        curr_agent.cri_optim.zero_grad()
        vf_loss.backward()
        curr_agent.cri_optim.step()

    def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
        """
        Update the neural networks by sampling batch data from the ``ReplayBuffer``.

        :param buffer: the ReplayBuffer instance that stores the trajectories.
        :param batch_size: the size of batch data for Stochastic Gradient Descent (SGD).
        :param repeat_times: the reuse times of each trajectory (currently unused).
        :param soft_update_tau: the soft update parameter.
        """
        buffer.update_now_len()
        self.batch_size = batch_size
        self.update_tau = soft_update_tau
        rewards, dones, actions, observations, next_obs = buffer.sample_batch(
            self.batch_size
        )
        for index in range(self.n_agents):
            self.update_agent(rewards, dones, actions, observations, next_obs, index)
        # Soft-update every target network: theta_target <- tau * theta + (1 - tau) * theta_target
        for agent in self.agents:
            self.soft_update(agent.cri_target, agent.cri, self.update_tau)
            self.soft_update(agent.act_target, agent.act, self.update_tau)
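
    # A hedged single-iteration call (the buffer object and the hyper-parameter
    # values are assumptions, not fixed by this module):
    #   agent.update_net(buffer, batch_size=256, repeat_times=1, soft_update_tau=2 ** -8)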

    def explore_one_env(self, env, target_step) -> list:
        """
        Explore the environment for ``target_step`` steps.

        :param env: the Environment instance to be explored.
        :param target_step: the number of steps to explore.
        """
        traj_temp = []
        k = 0
        for _ in range(target_step):
            k += 1
            actions = []
            for i in range(self.n_agents):
                action = self.agents[i].select_actions(self.states[i])
                actions.append(action)
            next_s, reward, done, _ = env.step(actions)
            traj_temp.append((self.states, reward, done, actions))
            global_done = all(done)  # True when every agent is done
            if global_done or k > 100:  # reset after an episode or 100 steps
                state = env.reset()
                k = 0
            else:
                state = next_s
            self.states = state  # keep the current observations for the next step
        return traj_temp
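
    # Note: `self.states` must already hold each agent's current observation
    # (shape (n_agents, state_dim)) before exploring, e.g. set from env.reset()
    # by the training loop. A hedged call:
    #   traj = agent.explore_one_env(env, target_step=128)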

    def select_actions(self, states):
        """
        Select continuous actions for exploration.

        :param states: states.shape == (n_agents, batch_size, state_dim)
        :return: actions.shape == (n_agents, batch_size, action_dim), -1 < action < +1
        """
        actions = []
        for i in range(self.n_agents):
            action = self.agents[i].select_actions(states[i])
            actions.append(action)
        return actions

    def save_or_load_agent(self, cwd, if_save):
        """
        Save or load the training files for the Agent.

        :param cwd: Current Working Directory. ElegantRL saves the training files in CWD.
        :param if_save: True: save the files. False: load the files.
        """
        for i in range(self.n_agents):
            self.agents[i].save_or_load_agent(cwd + "/" + str(i), if_save)
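

# --- Usage sketch (hedged) ---------------------------------------------------
# A minimal outline of a training loop, assuming `agent` was constructed and
# `agent.init(...)` called as sketched after `init` above. `MultiAgentEnv` and
# the buffer's `extend`/`sample_batch` wiring are illustrative assumptions,
# not part of this module:
#
#   env = MultiAgentEnv()                        # hypothetical multi-agent env
#   agent.states = env.reset()                   # one observation per agent
#   for _ in range(1000):
#       traj = agent.explore_one_env(env, target_step=128)
#       buffer.extend(traj)                      # hypothetical buffer API
#       agent.update_net(buffer, batch_size=256, repeat_times=1,
#                        soft_update_tau=2 ** -8)
#   agent.save_or_load_agent(cwd="./AgentMADDPG", if_save=True)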