import torch
from elegantrl.agents.AgentBase import AgentBase
from elegantrl.agents.net import Actor, Critic
from elegantrl.agents.AgentDDPG import AgentDDPG
class AgentMADDPG(AgentBase):
    """
    Bases: ``AgentBase``

    Multi-Agent DDPG algorithm. "Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments". R. Lowe et al. 2017.

    :param net_dim[int]: the dimension of the networks (the width of the neural networks)
    :param state_dim[int]: the dimension of the state (the length of the state vector)
    :param action_dim[int]: the dimension of the action (the length of the continuous action vector)
    :param learning_rate[float]: the learning rate of the optimizer
    :param gamma[float]: the discount factor of future rewards
    :param n_agents[int]: the number of agents
    :param if_use_per[bool]: use PER (Prioritized Experience Replay) for sparse reward
    :param env_num[int]: the env number of VectorEnv. env_num == 1 means don't use VectorEnv
    :param agent_id[int]: if visible_gpu is '1,9,3,4', agent_id=1 means (1,9,3,4)[agent_id] == 9
    """

    def __init__(self):
        super().__init__()
        self.ClassAct = Actor
        self.ClassCri = Critic
        self.if_use_cri_target = True
        self.if_use_act_target = True

    def init(
        self,
        net_dim,
        state_dim,
        action_dim,
        learning_rate=1e-4,
        gamma=0.95,
        n_agents=1,
        if_use_per=False,
        env_num=1,
        agent_id=0,
    ):
        self.agents = [AgentDDPG() for _ in range(n_agents)]
        self.explore_env = self.explore_one_env
        self.if_off_policy = True
        self.n_agents = n_agents
        for i in range(self.n_agents):
            # Forward the arguments instead of hard-coding the defaults, so every
            # sub-agent uses the requested hyper-parameters.
            self.agents[i].init(
                net_dim,
                state_dim,
                action_dim,
                learning_rate=learning_rate,
                n_agents=self.n_agents,
                if_use_per=if_use_per,
                env_num=env_num,
                agent_id=agent_id,
            )
        self.n_states = state_dim
        self.n_actions = action_dim
        self.batch_size = net_dim  # placeholder; overwritten in update_net()
        self.gamma = gamma
        self.update_tau = 0
        self.device = torch.device(
            f"cuda:{agent_id}"
            if (torch.cuda.is_available() and (agent_id >= 0))
            else "cpu"
        )
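
    # Construction sketch (the dimensions below are illustrative assumptions,
    # not defaults of this module):
    #   agent = AgentMADDPG()
    #   agent.init(net_dim=256, state_dim=24, action_dim=2, n_agents=3, agent_id=0)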

    def update_agent(self, rewards, dones, actions, observations, next_obs, index):
        """
        Update the networks of a single agent; called by ``update_net``.

        :param rewards: reward list of the sampled buffer
        :param dones: done list of the sampled buffer
        :param actions: action list of the sampled buffer
        :param observations: observation list of the sampled buffer
        :param next_obs: next_observation list of the sampled buffer
        :param index: ID of the agent
        """
        curr_agent = self.agents[index]

        # ---- Centralized critic update ----
        all_target_actions = []
        for i in range(self.n_agents):
            if i == index:
                all_target_actions.append(curr_agent.act_target(next_obs[:, index]))
            else:
                action = self.agents[i].act_target(next_obs[:, i])
                all_target_actions.append(action)
        action_target_all = (
            torch.cat(all_target_actions, dim=1)
            .to(self.device)
            .reshape(actions.shape[0], actions.shape[1] * actions.shape[2])
        )
        # Note: the `dones` argument is not used here, so terminal transitions
        # are not masked out of the TD target.
        target_value = rewards[:, index] + self.gamma * curr_agent.cri_target(
            next_obs.reshape(next_obs.shape[0], next_obs.shape[1] * next_obs.shape[2]),
            action_target_all,
        ).detach().squeeze(dim=1)
        actual_value = curr_agent.cri(
            observations.reshape(
                observations.shape[0], observations.shape[1] * observations.shape[2]
            ),
            actions.reshape(actions.shape[0], actions.shape[1] * actions.shape[2]),
        ).squeeze(dim=1)
        vf_loss = curr_agent.loss_td(actual_value, target_value)  # target detached above

        # ---- Decentralized actor update ----
        curr_pol_out = curr_agent.act(observations[:, index])
        all_pol_acs = []
        for i in range(self.n_agents):
            if i == index:
                all_pol_acs.append(curr_pol_out)  # differentiable action of this agent
            else:
                all_pol_acs.append(actions[:, i])  # buffered actions of the other agents
        pol_loss = -torch.mean(
            curr_agent.cri(
                observations.reshape(
                    observations.shape[0], observations.shape[1] * observations.shape[2]
                ),
                torch.cat(all_pol_acs, dim=1)
                .to(self.device)
                .reshape(actions.shape[0], actions.shape[1] * actions.shape[2]),
            )
        )

        curr_agent.act_optim.zero_grad()
        pol_loss.backward()
        curr_agent.act_optim.step()

        curr_agent.cri_optim.zero_grad()
        vf_loss.backward()
        curr_agent.cri_optim.step()

    def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
        """
        Update the neural networks by sampling batch data from the ``ReplayBuffer``.

        :param buffer: the ReplayBuffer instance that stores the trajectories.
        :param batch_size: the size of batch data for Stochastic Gradient Descent (SGD).
        :param repeat_times: the reuse times of each trajectory (currently unused).
        :param soft_update_tau: the soft update parameter.
        """
        buffer.update_now_len()
        self.batch_size = batch_size
        self.update_tau = soft_update_tau
        rewards, dones, actions, observations, next_obs = buffer.sample_batch(
            self.batch_size
        )
        for index in range(self.n_agents):
            self.update_agent(rewards, dones, actions, observations, next_obs, index)
        # Soft-update every target network: theta_target <- tau * theta + (1 - tau) * theta_target
        for agent in self.agents:
            self.soft_update(agent.cri_target, agent.cri, self.update_tau)
            self.soft_update(agent.act_target, agent.act, self.update_tau)
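
    # A hedged single-iteration call (the buffer object and the hyper-parameter
    # values are assumptions, not fixed by this module):
    #   agent.update_net(buffer, batch_size=256, repeat_times=1, soft_update_tau=2 ** -8)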

    def explore_one_env(self, env, target_step) -> list:
        """
        Explore the environment for ``target_step`` steps.

        :param env: the Environment instance to be explored.
        :param target_step: the number of steps to explore.
        """
        traj_temp = []
        k = 0
        for _ in range(target_step):
            k += 1
            actions = []
            for i in range(self.n_agents):
                action = self.agents[i].select_actions(self.states[i])
                actions.append(action)
            next_s, reward, done, _ = env.step(actions)
            traj_temp.append((self.states, reward, done, actions))
            global_done = all(done)  # True when every agent is done
            if global_done or k > 100:  # reset after an episode or 100 steps
                state = env.reset()
                k = 0
            else:
                state = next_s
            self.states = state  # keep the current observations for the next step
        return traj_temp
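
    # Note: `self.states` must already hold each agent's current observation
    # (shape (n_agents, state_dim)) before exploring, e.g. set from env.reset()
    # by the training loop. A hedged call:
    #   traj = agent.explore_one_env(env, target_step=128)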

    def select_actions(self, states):
        """
        Select continuous actions for exploration.

        :param states: states.shape == (n_agents, batch_size, state_dim)
        :return: actions.shape == (n_agents, batch_size, action_dim), -1 < action < +1
        """
        actions = []
        for i in range(self.n_agents):
            action = self.agents[i].select_actions(states[i])
            actions.append(action)
        return actions

    def save_or_load_agent(self, cwd, if_save):
        """
        Save or load the training files for the Agent.

        :param cwd: Current Working Directory. ElegantRL saves the training files in CWD.
        :param if_save: True: save the files. False: load the files.
        """
        for i in range(self.n_agents):
            self.agents[i].save_or_load_agent(cwd + "/" + str(i), if_save)
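

# --- Usage sketch (hedged) ---------------------------------------------------
# A minimal outline of a training loop, assuming `agent` was constructed and
# `agent.init(...)` called as sketched after `init` above. `MultiAgentEnv` and
# the buffer's `extend`/`sample_batch` wiring are illustrative assumptions,
# not part of this module:
#
#   env = MultiAgentEnv()                        # hypothetical multi-agent env
#   agent.states = env.reset()                   # one observation per agent
#   for _ in range(1000):
#       traj = agent.explore_one_env(env, target_step=128)
#       buffer.extend(traj)                      # hypothetical buffer API
#       agent.update_net(buffer, batch_size=256, repeat_times=1,
#                        soft_update_tau=2 ** -8)
#   agent.save_or_load_agent(cwd="./AgentMADDPG", if_save=True)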