Source code for csle_agents.agents.pomcp.pomcp_agent

from typing import Union, List, Dict, Optional
import math
import time
import gymnasium as gym
import os
import numpy as np
import gym_csle_stopping_game.constants.constants as env_constants
from csle_common.dao.emulation_config.emulation_env_config import EmulationEnvConfig
from csle_common.dao.simulation_config.simulation_env_config import SimulationEnvConfig
from csle_common.dao.training.experiment_config import ExperimentConfig
from csle_common.dao.training.experiment_execution import ExperimentExecution
from csle_common.dao.training.experiment_result import ExperimentResult
from csle_common.dao.training.agent_type import AgentType
from csle_common.util.experiment_util import ExperimentUtil
from csle_common.logging.log import Logger
from csle_common.metastore.metastore_facade import MetastoreFacade
from csle_common.dao.jobs.training_job_config import TrainingJobConfig
from csle_common.util.general_util import GeneralUtil
from csle_common.dao.simulation_config.base_env import BaseEnv
from csle_agents.agents.base.base_agent import BaseAgent
import csle_agents.constants.constants as agents_constants
from csle_agents.agents.pomcp.pomcp import POMCP


[docs]class POMCPAgent(BaseAgent):
    """
    POMCP Agent
    """

    def __init__(self, simulation_env_config: SimulationEnvConfig,
                 emulation_env_config: Union[None, EmulationEnvConfig],
                 experiment_config: ExperimentConfig, env: Optional[BaseEnv] = None,
                 training_job: Optional[TrainingJobConfig] = None, save_to_metastore: bool = True) -> None:
        """
        Initializes the POMCP Agent

        :param simulation_env_config: the simulation env config
        :param emulation_env_config: the emulation env config
        :param experiment_config: the experiment config
        :param env: (optional) the gym environment to use for simulation
        :param training_job: (optional) a training job configuration
        :param save_to_metastore: boolean flag that can be set to avoid saving results and progress to the metastore
        """
        super().__init__(simulation_env_config=simulation_env_config, emulation_env_config=emulation_env_config,
                         experiment_config=experiment_config)
        assert experiment_config.agent_type == AgentType.POMCP
        self.env = env
        self.training_job = training_job
        self.save_to_metastore = save_to_metastore

[docs]    def train(self) -> ExperimentExecution:
        """
        Performs the policy training for the given random seeds using POMCP

        :return: the training metrics and the trained policies
        """
        pid = os.getpid()

        # Initialize metrics
        exp_result = ExperimentResult()
        exp_result.plot_metrics.append(agents_constants.COMMON.AVERAGE_RETURN)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_RETURN)
        exp_result.plot_metrics.append(env_constants.ENV_METRICS.TIME_HORIZON)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_TIME_HORIZON)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNTIME)

        descr = f"{self.experiment_config.title}. \n Training of policies with the POMCP search algorithm using " \
                f"simulation:{self.simulation_env_config.name}"
        for seed in self.experiment_config.random_seeds:
            exp_result.all_metrics[seed] = {}
            exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_TIME_HORIZON] = []
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.TIME_HORIZON] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNTIME] = []

        eval_env_config = self.experiment_config.hparams[agents_constants.POMCP.EVAL_ENV_CONFIG].value
        initial_particles = self.experiment_config.hparams[agents_constants.POMCP.INITIAL_PARTICLES].value
        rollout_policy = self.experiment_config.hparams[agents_constants.POMCP.ROLLOUT_POLICY].value
        value_function = self.experiment_config.hparams[agents_constants.POMCP.VALUE_FUNCTION].value
        self.experiment_config.hparams[agents_constants.POMCP.EVAL_ENV_CONFIG].value = -1
        self.experiment_config.hparams[agents_constants.POMCP.INITIAL_PARTICLES].value = -1
        self.experiment_config.hparams[agents_constants.POMCP.ROLLOUT_POLICY].value = -1
        self.experiment_config.hparams[agents_constants.POMCP.VALUE_FUNCTION].value = -1
        # Initialize training job
        if self.training_job is None:
            emulation_name = ""
            if self.emulation_env_config is not None:
                emulation_name = self.emulation_env_config.name
            self.training_job = TrainingJobConfig(
                simulation_env_name=self.simulation_env_config.name, experiment_config=self.experiment_config,
                progress_percentage=0, pid=pid, experiment_result=exp_result,
                emulation_env_name=emulation_name, simulation_traces=[],
                num_cached_traces=agents_constants.COMMON.NUM_CACHED_SIMULATION_TRACES,
                log_file_path=Logger.__call__().get_log_file_path(), descr=descr,
                physical_host_ip=GeneralUtil.get_host_ip())
            if self.save_to_metastore:
                training_job_id = MetastoreFacade.save_training_job(training_job=self.training_job)
                self.training_job.id = training_job_id
        else:
            self.training_job.pid = pid
            self.training_job.progress_percentage = 0
            self.training_job.experiment_result = exp_result
            if self.save_to_metastore:
                MetastoreFacade.update_training_job(training_job=self.training_job, id=self.training_job.id)

        # Initialize execution result
        ts = time.time()
        emulation_name = ""
        if self.emulation_env_config is not None:
            emulation_name = self.emulation_env_config.name
        simulation_name = self.simulation_env_config.name
        self.exp_execution = ExperimentExecution(
            result=exp_result, config=self.experiment_config, timestamp=ts, emulation_name=emulation_name,
            simulation_name=simulation_name, descr=descr, log_file_path=self.training_job.log_file_path)
        if self.save_to_metastore:
            exp_execution_id = MetastoreFacade.save_experiment_execution(self.exp_execution)
            self.exp_execution.id = exp_execution_id

        self.experiment_config.hparams[agents_constants.POMCP.EVAL_ENV_CONFIG].value = eval_env_config
        self.experiment_config.hparams[agents_constants.POMCP.INITIAL_PARTICLES].value = initial_particles
        self.experiment_config.hparams[agents_constants.POMCP.ROLLOUT_POLICY].value = rollout_policy
        self.experiment_config.hparams[agents_constants.POMCP.VALUE_FUNCTION].value = value_function

        for seed in self.experiment_config.random_seeds:
            ExperimentUtil.set_seed(seed)
            exp_result = self.pomcp(exp_result=exp_result, seed=seed, training_job=self.training_job,
                                    random_seeds=self.experiment_config.random_seeds)

        # Calculate average and std metrics
        exp_result.avg_metrics = {}
        exp_result.std_metrics = {}
        for metric in exp_result.all_metrics[self.experiment_config.random_seeds[0]].keys():
            value_vectors = []
            for seed in self.experiment_config.random_seeds:
                value_vectors.append(exp_result.all_metrics[seed][metric])

            avg_metrics = []
            std_metrics = []
            for i in range(len(value_vectors[0])):
                if type(value_vectors[0][0]) is int or type(value_vectors[0][0]) is float \
                        or type(value_vectors[0][0]) is np.int64 or type(value_vectors[0][0]) is np.float64:
                    seed_values = []
                    for seed_idx in range(len(self.experiment_config.random_seeds)):
                        seed_values.append(value_vectors[seed_idx][i])
                    avg = ExperimentUtil.mean_confidence_interval(
                        data=seed_values,
                        confidence=self.experiment_config.hparams[agents_constants.COMMON.CONFIDENCE_INTERVAL].value)[0]
                    if not math.isnan(avg):
                        avg_metrics.append(avg)
                    ci = ExperimentUtil.mean_confidence_interval(
                        data=seed_values,
                        confidence=self.experiment_config.hparams[agents_constants.COMMON.CONFIDENCE_INTERVAL].value)[1]
                    if not math.isnan(ci):
                        std_metrics.append(ci)
                    else:
                        std_metrics.append(-1)
                else:
                    avg_metrics.append(-1)
                    std_metrics.append(-1)
                exp_result.avg_metrics[metric] = avg_metrics
                exp_result.std_metrics[metric] = std_metrics

        ts = time.time()
        self.exp_execution.timestamp = ts
        self.exp_execution.result = exp_result
        if self.save_to_metastore:
            eval_env_config = self.experiment_config.hparams[agents_constants.POMCP.EVAL_ENV_CONFIG].value
            initial_particles = self.experiment_config.hparams[agents_constants.POMCP.INITIAL_PARTICLES].value
            rollout_policy = self.experiment_config.hparams[agents_constants.POMCP.ROLLOUT_POLICY].value
            value_function = self.experiment_config.hparams[agents_constants.POMCP.VALUE_FUNCTION].value
            MetastoreFacade.update_experiment_execution(experiment_execution=self.exp_execution,
                                                        id=self.exp_execution.id)
            self.experiment_config.hparams[agents_constants.POMCP.EVAL_ENV_CONFIG].value = eval_env_config
            self.experiment_config.hparams[agents_constants.POMCP.INITIAL_PARTICLES].value = initial_particles
            self.experiment_config.hparams[agents_constants.POMCP.ROLLOUT_POLICY].value = rollout_policy
            self.experiment_config.hparams[agents_constants.POMCP.VALUE_FUNCTION].value = value_function
        return self.exp_execution

[docs]    def hparam_names(self) -> List[str]:
        """
        :return: a list with the hyperparameter names
        """
        return [agents_constants.POMCP.OBJECTIVE_TYPE, agents_constants.POMCP.ROLLOUT_POLICY,
                agents_constants.POMCP.VALUE_FUNCTION, agents_constants.POMCP.N, agents_constants.POMCP.REINVIGORATION,
                agents_constants.POMCP.A, agents_constants.POMCP.GAMMA,
                agents_constants.POMCP.INITIAL_PARTICLES, agents_constants.POMCP.PLANNING_TIME,
                agents_constants.POMCP.LOG_STEP_FREQUENCY, agents_constants.POMCP.VERBOSE,
                agents_constants.POMCP.DEFAULT_NODE_VALUE, agents_constants.POMCP.MAX_NEGATIVE_SAMPLES,
                agents_constants.POMCP.MAX_PARTICLES, agents_constants.POMCP.C,
                agents_constants.POMCP.MAX_PLANNING_DEPTH, agents_constants.POMCP.PARALLEL_ROLLOUT,
                agents_constants.POMCP.NUM_PARALLEL_PROCESSES, agents_constants.POMCP.NUM_EVALS_PER_PROCESS,
                agents_constants.POMCP.PRIOR_WEIGHT, agents_constants.POMCP.PRUNE_ACTION_SPACE,
                agents_constants.POMCP.PRUNE_SIZE, agents_constants.POMCP.EVAL_ENV_NAME,
                agents_constants.POMCP.EVAL_ENV_CONFIG,
                agents_constants.COMMON.EVAL_BATCH_SIZE, agents_constants.COMMON.CONFIDENCE_INTERVAL,
                agents_constants.COMMON.RUNNING_AVERAGE, agents_constants.COMMON.MAX_ENV_STEPS]

[docs]    def pomcp(self, exp_result: ExperimentResult, seed: int,
              training_job: TrainingJobConfig, random_seeds: List[int]) -> ExperimentResult:
        """
        Runs the POMCP algorithm

        :param exp_result: the experiment result object to store the result
        :param seed: the seed
        :param training_job: the training job config
        :param random_seeds: list of seeds
        :return: the updated experiment result and the trained policy
        """
        start: float = time.time()
        rollout_policy = self.experiment_config.hparams[agents_constants.POMCP.ROLLOUT_POLICY].value
        use_rollout_policy = self.experiment_config.hparams[agents_constants.POMCP.USE_ROLLOUT_POLICY].value
        value_function = self.experiment_config.hparams[agents_constants.POMCP.VALUE_FUNCTION].value
        log_steps_frequency = self.experiment_config.hparams[agents_constants.POMCP.LOG_STEP_FREQUENCY].value
        verbose = self.experiment_config.hparams[agents_constants.POMCP.VERBOSE].value
        default_node_value = self.experiment_config.hparams[agents_constants.POMCP.DEFAULT_NODE_VALUE].value
        prior_weight = self.experiment_config.hparams[agents_constants.POMCP.PRIOR_WEIGHT].value
        prior_confidence = self.experiment_config.hparams[agents_constants.POMCP.PRIOR_CONFIDENCE].value
        max_env_steps = self.experiment_config.hparams[agents_constants.COMMON.MAX_ENV_STEPS].value
        N = self.experiment_config.hparams[agents_constants.POMCP.N].value
        A = self.experiment_config.hparams[agents_constants.POMCP.A].value
        acquisition_function_type = \
            self.experiment_config.hparams[agents_constants.POMCP.ACQUISITION_FUNCTION_TYPE].value
        reinvigoration = self.experiment_config.hparams[agents_constants.POMCP.REINVIGORATION].value
        gamma = self.experiment_config.hparams[agents_constants.POMCP.GAMMA].value
        initial_particles = self.experiment_config.hparams[agents_constants.POMCP.INITIAL_PARTICLES].value
        planning_time = self.experiment_config.hparams[agents_constants.POMCP.PLANNING_TIME].value
        max_particles = self.experiment_config.hparams[agents_constants.POMCP.MAX_PARTICLES].value
        c = self.experiment_config.hparams[agents_constants.POMCP.C].value
        c2 = self.experiment_config.hparams[agents_constants.POMCP.C2].value
        max_rollout_depth = self.experiment_config.hparams[agents_constants.POMCP.MAX_ROLLOUT_DEPTH].value
        max_planning_depth = self.experiment_config.hparams[agents_constants.POMCP.MAX_PLANNING_DEPTH].value
        prune_action_space = self.experiment_config.hparams[agents_constants.POMCP.PRUNE_ACTION_SPACE].value
        prune_size = self.experiment_config.hparams[agents_constants.POMCP.PRUNE_SIZE].value
        reinvigorated_particles_ratio = \
            self.experiment_config.hparams[agents_constants.POMCP.REINVIGORATED_PARTICLES_RATIO].value
        config = self.simulation_env_config.simulation_env_input_config
        eval_env_name = self.experiment_config.hparams[agents_constants.POMCP.EVAL_ENV_NAME].value
        eval_env_config = self.experiment_config.hparams[agents_constants.POMCP.EVAL_ENV_CONFIG].value
        eval_env: BaseEnv = gym.make(eval_env_name, config=eval_env_config)
        self.experiment_config.hparams[agents_constants.POMCP.EVAL_ENV_CONFIG].value = -1
        self.experiment_config.hparams[agents_constants.POMCP.INITIAL_PARTICLES].value = -1
        self.experiment_config.hparams[agents_constants.POMCP.ROLLOUT_POLICY].value = -1
        self.experiment_config.hparams[agents_constants.POMCP.VALUE_FUNCTION].value = -1

        # Run N episodes
        returns = []
        for i in range(N):
            done = False
            action_sequence = []
            train_env: BaseEnv = gym.make(self.simulation_env_config.gym_env_name, config=config)
            _, info = eval_env.reset()
            s = info[agents_constants.COMMON.STATE]
            train_env.reset()
            pomcp = POMCP(A=A, gamma=gamma, env=train_env, c=c, initial_particles=initial_particles,
                          planning_time=planning_time, max_particles=max_particles, rollout_policy=rollout_policy,
                          value_function=value_function, reinvigoration=reinvigoration, verbose=verbose,
                          default_node_value=default_node_value, prior_weight=prior_weight,
                          acquisition_function_type=acquisition_function_type, c2=c2,
                          use_rollout_policy=use_rollout_policy, prior_confidence=prior_confidence,
                          reinvigorated_particles_ratio=reinvigorated_particles_ratio,
                          prune_action_space=prune_action_space, prune_size=prune_size)
            R = 0
            t = 1
            if t % log_steps_frequency == 0:
                Logger.__call__().get_logger().info(f"[POMCP] t: {t}, s: {s}")

            # Run episode
            while not done and t <= max_env_steps:
                rollout_depth = max_rollout_depth
                planning_depth = max_planning_depth
                pomcp.solve(max_rollout_depth=rollout_depth, max_planning_depth=planning_depth, t=t)
                action = pomcp.get_action()
                o, r, done, _, info = eval_env.step(action)
                action_sequence.append(action)
                s_prime = info[agents_constants.COMMON.STATE]
                obs_id = info[agents_constants.COMMON.OBSERVATION]
                pomcp.update_tree_with_new_samples(action_sequence=action_sequence, observation=obs_id, t=t)
                R += r
                t += 1
                if t % log_steps_frequency == 0:
                    Logger.__call__().get_logger().info(f"[POMCP] t: {t}, a: {action}, r: {r}, o: {obs_id}, "
                                                        f"s_prime: {s_prime}, action sequence: {action_sequence}, "
                                                        f"R: {R}")

            if i % self.experiment_config.log_every == 0:
                # Logging
                exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_RETURN].append(R)
                running_avg_J = ExperimentUtil.running_average(
                    exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_RETURN],
                    self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value)
                exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_RETURN].append(running_avg_J)
                progress = round((i + 1) / N, 2)
                time_elapsed_minutes = round((time.time() - start) / 60, 3)
                Logger.__call__().get_logger().info(
                    f"[POMCP] episode: {i}, J:{R}, "
                    f"J_avg_{self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value}:"
                    f"{running_avg_J}, "
                    f"progress: {round(progress * 100, 2)}%, "
                    f"runtime: {time_elapsed_minutes} min")

                # Update training job
                total_iterations = len(random_seeds) * N
                iterations_done = (random_seeds.index(seed)) * N + i
                progress = round(iterations_done / total_iterations, 2)
                training_job.progress_percentage = progress
                training_job.experiment_result = exp_result
                if eval_env is not None and len(eval_env.get_traces()) > 0:
                    training_job.simulation_traces.append(eval_env.get_traces()[-1])
                if len(training_job.simulation_traces) > training_job.num_cached_traces:
                    training_job.simulation_traces = training_job.simulation_traces[1:]
                if self.save_to_metastore:
                    MetastoreFacade.update_training_job(training_job=training_job, id=training_job.id)

                # Update execution
                ts = time.time()
                self.exp_execution.timestamp = ts
                self.exp_execution.result = exp_result
                if self.save_to_metastore:
                    MetastoreFacade.update_experiment_execution(experiment_execution=self.exp_execution,
                                                                id=self.exp_execution.id)
            returns.append(R)
            Logger.__call__().get_logger().info(f"avg return: {np.mean(returns)}")

        if eval_env is not None:
            # Save latest trace
            if self.save_to_metastore:
                MetastoreFacade.save_simulation_trace(eval_env.get_traces()[-1])
            eval_env.reset_traces()
        return exp_result

[docs]    @staticmethod
    def update_metrics(metrics: Dict[str, List[Union[float, int]]], info: Dict[str, Union[float, int]]) \
            -> Dict[str, List[Union[float, int]]]:
        """
        Update a dict with aggregated metrics using new information from the environment

        :param metrics: the dict with the aggregated metrics
        :param info: the new information
        :return: the updated dict
        """
        for k, v in info.items():
            if k in metrics:
                metrics[k].append(round(v, 3))
            else:
                metrics[k] = [v]
        return metrics

[docs]    @staticmethod
    def compute_avg_metrics(metrics: Dict[str, List[Union[float, int]]]) -> Dict[str, Union[float, int]]:
        """
        Computes the average metrics of a dict with aggregated metrics

        :param metrics: the dict with the aggregated metrics
        :return: the average metrics
        """
        avg_metrics = {}
        for k, v in metrics.items():
            avg = round(sum(v) / len(v), 2)
            avg_metrics[k] = avg
        return avg_metrics