Source code for csle_agents.agents.t_fp.t_fp_agent

from typing import Union, List, Dict, Tuple, Optional, Any
import time
import gymnasium as gym
import os
import math
import numpy as np
from csle_common.dao.emulation_config.emulation_env_config import EmulationEnvConfig
from csle_common.dao.simulation_config.simulation_env_config import SimulationEnvConfig
from csle_common.dao.training.experiment_config import ExperimentConfig
from csle_common.dao.training.experiment_execution import ExperimentExecution
from csle_common.dao.training.experiment_result import ExperimentResult
from csle_common.dao.training.agent_type import AgentType
from csle_common.util.experiment_util import ExperimentUtil
from csle_common.dao.training.player_type import PlayerType
from csle_common.logging.log import Logger
from csle_common.metastore.metastore_facade import MetastoreFacade
from csle_common.dao.jobs.training_job_config import TrainingJobConfig
from csle_common.dao.training.multi_threshold_stopping_policy import MultiThresholdStoppingPolicy
from csle_common.dao.training.mixed_multi_threshold_stopping_policy import MixedMultiThresholdStoppingPolicy
from csle_common.dao.training.policy import Policy
import csle_common.constants.constants as constants
from csle_common.util.general_util import GeneralUtil
from csle_common.dao.simulation_config.base_env import BaseEnv
from csle_agents.agents.base.base_agent import BaseAgent
from csle_agents.agents.t_spsa.t_spsa_agent import TSPSAAgent
import csle_agents.constants.constants as agents_constants
import gym_csle_stopping_game.constants.constants as env_constants


[docs]class TFPAgent(BaseAgent):
    """
    RL Agent implementing the T-FP algorithm from (Hammar & Stadler '23 - Learning Near-Optimal Intrusion Responses
    Against Dynamic Attackers)
    """

    def __init__(self, defender_simulation_env_config: SimulationEnvConfig,
                 attacker_simulation_env_config: SimulationEnvConfig,
                 emulation_env_config: Union[None, EmulationEnvConfig], experiment_config: ExperimentConfig,
                 training_job: Optional[TrainingJobConfig] = None):
        """
        Initializes the T-FP agent

        :param attacker_simulation_env_config: the simulation env config of the attacker
        :param defender_simulation_env_config: the simulation env config of the defender
        :param emulation_env_config: the emulation env config
        :param experiment_config: the experiment config
        :param training_job: (optional) reuse an existing training job configuration
        """
        super().__init__(simulation_env_config=defender_simulation_env_config,
                         emulation_env_config=emulation_env_config,
                         experiment_config=experiment_config)
        assert experiment_config.agent_type == AgentType.T_FP
        self.root_output_dir = str(self.experiment_config.output_dir)
        self.defender_experiment_config = self.get_defender_experiment_config()
        self.attacker_experiment_config = self.get_attacker_experiment_config()
        self.attacker_simulation_env_config = attacker_simulation_env_config
        self.defender_simulation_env_config = defender_simulation_env_config
        self.training_job = training_job

[docs]    def train(self) -> ExperimentExecution:
        """
        Performs the policy training for the given random seeds using T-FP

        :return: the training metrics and the trained policies
        """
        pid = os.getpid()

        # Initialize result metrics
        exp_result = ExperimentResult()

        # Define which metrics to plot in the UI
        exp_result.plot_metrics.append(agents_constants.COMMON.EXPLOITABILITY)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_EXPLOITABILITY)
        exp_result.plot_metrics.append(agents_constants.COMMON.AVERAGE_ATTACKER_RETURN)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_ATTACKER_RETURN)
        exp_result.plot_metrics.append(agents_constants.T_FP.RUNNING_AVERAGE_BEST_RESPONSE_ATTACKER_RETURN)
        exp_result.plot_metrics.append(agents_constants.COMMON.AVERAGE_DEFENDER_RETURN)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_DEFENDER_RETURN)
        exp_result.plot_metrics.append(agents_constants.T_FP.RUNNING_AVERAGE_BEST_RESPONSE_DEFENDER_RETURN)
        exp_result.plot_metrics.append(env_constants.ENV_METRICS.INTRUSION_LENGTH)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_INTRUSION_LENGTH)
        exp_result.plot_metrics.append(env_constants.ENV_METRICS.INTRUSION_START)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_INTRUSION_START)
        exp_result.plot_metrics.append(env_constants.ENV_METRICS.TIME_HORIZON)
        exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_TIME_HORIZON)
        exp_result.plot_metrics.append(env_constants.ENV_METRICS.AVERAGE_UPPER_BOUND_RETURN)
        exp_result.plot_metrics.append(env_constants.ENV_METRICS.AVERAGE_DEFENDER_BASELINE_STOP_ON_FIRST_ALERT_RETURN)
        for l in range(1, self.experiment_config.hparams[constants.T_SPSA.L].value + 1):
            exp_result.plot_metrics.append(env_constants.ENV_METRICS.STOP + f"_{l}")
            exp_result.plot_metrics.append(env_constants.ENV_METRICS.STOP + f"_running_average_{l}")

        descr = f"Approximating a Nash equilibrium with the T-FP algorithm using " \
                f"simulations: {self.defender_simulation_env_config.name} " \
                f"and {self.attacker_simulation_env_config.name}"
        for seed in self.experiment_config.random_seeds:
            exp_result.all_metrics[seed] = {}
            exp_result.all_metrics[seed][constants.T_SPSA.THETAS] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_DEFENDER_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_DEFENDER_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_ATTACKER_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_ATTACKER_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.T_FP.AVERAGE_BEST_RESPONSE_DEFENDER_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.T_FP.RUNNING_AVERAGE_BEST_RESPONSE_DEFENDER_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.T_FP.AVERAGE_BEST_RESPONSE_ATTACKER_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.T_FP.RUNNING_AVERAGE_BEST_RESPONSE_ATTACKER_RETURN] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.EXPLOITABILITY] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_EXPLOITABILITY] = []
            exp_result.all_metrics[seed][agents_constants.T_FP.DEFENDER_THRESHOLDS] = []
            exp_result.all_metrics[seed][agents_constants.T_FP.ATTACKER_THRESHOLDS] = []
            for l in range(1, self.experiment_config.hparams[constants.T_SPSA.L].value + 1):
                exp_result.all_metrics[seed][constants.T_SPSA.STOP_DISTRIBUTION_DEFENDER + f"_l={l}"] = []
            for s in self.simulation_env_config.state_space_config.states:
                for l in range(1, self.experiment_config.hparams[constants.T_SPSA.L].value + 1):
                    exp_result.all_metrics[seed][constants.T_SPSA.STOP_DISTRIBUTION_ATTACKER
                                                 + f"_l={l}_s={s.id}"] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_INTRUSION_START] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_TIME_HORIZON] = []
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_INTRUSION_LENGTH] = []
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.INTRUSION_START] = []
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.INTRUSION_LENGTH] = []
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.TIME_HORIZON] = []
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.AVERAGE_UPPER_BOUND_RETURN] = []
            exp_result.all_metrics[seed][
                env_constants.ENV_METRICS.AVERAGE_DEFENDER_BASELINE_STOP_ON_FIRST_ALERT_RETURN] = []
            for l in range(1, self.experiment_config.hparams[constants.T_SPSA.L].value + 1):
                exp_result.all_metrics[seed][env_constants.ENV_METRICS.STOP + f"_{l}"] = []
                exp_result.all_metrics[seed][env_constants.ENV_METRICS.STOP + f"_running_average_{l}"] = []

        if self.training_job is None:
            emulation_name = ""
            if self.emulation_env_config is not None:
                emulation_name = self.emulation_env_config.name
            self.training_job = TrainingJobConfig(
                simulation_env_name=self.simulation_env_config.name, experiment_config=self.experiment_config,
                experiment_result=exp_result, progress_percentage=0, pid=pid,
                emulation_env_name=emulation_name, simulation_traces=[],
                num_cached_traces=agents_constants.COMMON.NUM_CACHED_SIMULATION_TRACES,
                log_file_path=Logger.__call__().get_log_file_path(), descr=descr,
                physical_host_ip=GeneralUtil.get_host_ip())
            training_job_id = MetastoreFacade.save_training_job(training_job=self.training_job)
            self.training_job.id = training_job_id
        else:
            self.training_job.pid = pid
            self.training_job.progress_percentage = 0
            self.training_job.experiment_result = exp_result
            MetastoreFacade.update_training_job(training_job=self.training_job, id=self.training_job.id)
        config = self.simulation_env_config.simulation_env_input_config
        env: BaseEnv = gym.make(self.simulation_env_config.gym_env_name, config=config)
        for seed in self.experiment_config.random_seeds:
            ExperimentUtil.set_seed(seed)
            exp_result = self.t_fp(exp_result=exp_result, seed=seed, env=env, training_job=self.training_job,
                                   random_seeds=self.experiment_config.random_seeds)
            self.training_job = MetastoreFacade.get_training_job_config(id=self.training_job.id)

        # Calculate average and std metrics
        exp_result.avg_metrics = {}
        exp_result.std_metrics = {}
        for metric in exp_result.all_metrics[self.experiment_config.random_seeds[0]].keys():
            confidence = 0.95
            value_vectors = []
            for seed in self.experiment_config.random_seeds:
                value_vectors.append(exp_result.all_metrics[seed][metric])

            avg_metrics = []
            std_metrics = []
            for i in range(len(value_vectors[0])):
                seed_values = []
                for seed_idx in range(len(self.experiment_config.random_seeds)):
                    seed_values.append(value_vectors[seed_idx][i])
                try:
                    avg_metrics.append(
                        ExperimentUtil.mean_confidence_interval(data=seed_values, confidence=confidence)[0])
                    std_metrics.append(
                        ExperimentUtil.mean_confidence_interval(data=seed_values, confidence=confidence)[1])
                except Exception:
                    pass
            exp_result.avg_metrics[metric] = avg_metrics
            exp_result.std_metrics[metric] = std_metrics

        ts = time.time()
        emulation_name = ""
        if self.emulation_env_config is not None:
            emulation_name = self.emulation_env_config.name
        simulation_name = self.simulation_env_config.name
        exp_execution = ExperimentExecution(result=exp_result, config=self.experiment_config, timestamp=ts,
                                            emulation_name=emulation_name, simulation_name=simulation_name,
                                            descr=descr, log_file_path=self.training_job.log_file_path)
        traces = env.unwrapped.get_traces()
        if len(traces) > 0:
            MetastoreFacade.save_simulation_trace(traces[-1])
        MetastoreFacade.remove_training_job(self.training_job)
        return exp_execution

[docs]    def t_fp(self, exp_result: ExperimentResult, seed: int, env: BaseEnv,
             training_job: TrainingJobConfig, random_seeds: List[int]):
        """
        Runs the T-FP algorithm (Hammar, Stadler 2023)

        :param exp_result: the experiment result
        :param seed: the seed for the experiment
        :param env: environment for evaluation
        :param training_job: the training job for the evaluation
        :param random_seeds: the random seeds for the evaluation
        :return: the experiment result
        """

        # Initialize policies
        defender_policy = MixedMultiThresholdStoppingPolicy(
            defender_Theta=np.zeros((self.experiment_config.hparams[constants.T_SPSA.L].value, 2, 1)).tolist(),
            attacker_Theta=[],
            simulation_name=self.defender_simulation_env_config.name,
            states=self.defender_simulation_env_config.state_space_config.states,
            player_type=PlayerType.DEFENDER,
            L=self.defender_experiment_config.hparams[constants.T_SPSA.L].value,
            actions=self.defender_simulation_env_config.joint_action_space_config.action_spaces[
                self.defender_experiment_config.player_idx].actions,
            experiment_config=self.defender_experiment_config, avg_R=-1,
            agent_type=AgentType.T_FP)
        attacker_policy = MixedMultiThresholdStoppingPolicy(
            attacker_Theta=np.zeros((2, self.experiment_config.hparams[constants.T_SPSA.L].value, 2, 1)).tolist(),
            defender_Theta=[],
            simulation_name=self.attacker_simulation_env_config.name,
            states=self.attacker_simulation_env_config.state_space_config.states,
            player_type=PlayerType.ATTACKER,
            L=self.attacker_experiment_config.hparams[constants.T_SPSA.L].value,
            actions=self.attacker_simulation_env_config.joint_action_space_config.action_spaces[
                self.attacker_experiment_config.player_idx].actions,
            experiment_config=self.attacker_experiment_config, avg_R=-1,
            agent_type=AgentType.T_FP, opponent_strategy=defender_policy)

        initial_attacker_thresholds: List[List[List[float]]] = []
        initial_defender_thresholds: List[List[float]] = []
        initial_attacker_thresholds.append(
            [[0.0] * self.attacker_experiment_config.hparams[constants.T_SPSA.L].value,
             [1.0] * self.attacker_experiment_config.hparams[constants.T_SPSA.L].value
             ])
        initial_defender_thresholds.append([0.0] * self.attacker_experiment_config.hparams[constants.T_SPSA.L].value)

        attacker_policy._update_Theta_attacker(new_thresholds=initial_attacker_thresholds)
        defender_policy._update_Theta_defender(new_thresholds=initial_defender_thresholds)
        attacker_policy.opponent_strategy = defender_policy

        for i in range(self.experiment_config.hparams[agents_constants.T_FP.N_2].value):

            # Compute best responses
            br_seed = np.random.randint(0, 100)
            attacker_thresholds, attacker_val = self.attacker_best_response(
                seed=br_seed, defender_strategy=defender_policy, attacker_strategy=attacker_policy)
            defender_thresholds, defender_val = self.defender_best_response(
                seed=br_seed, attacker_strategy=attacker_policy)

            attacker_metrics = self.evaluate_attacker_policy(
                attacker_thresholds=attacker_thresholds, defender_strategy=defender_policy,
                attacker_strategy=attacker_policy)
            defender_metrics = self.evaluate_defender_policy(
                defender_thresholds=defender_thresholds, attacker_strategy=attacker_policy)
            strategy_profile_metrics = self.evaluate_strategy_profile(
                defender_strategy=defender_policy, attacker_strategy=attacker_policy)

            attacker_val = round(attacker_metrics[env_constants.ENV_METRICS.RETURN], 3)
            defender_val = round(defender_metrics[env_constants.ENV_METRICS.RETURN], 3)
            val = round(strategy_profile_metrics[env_constants.ENV_METRICS.RETURN], 3)

            attacker_policy._update_Theta_attacker(new_thresholds=[attacker_thresholds])
            defender_policy._update_Theta_defender(new_thresholds=[defender_thresholds])
            val_attacker_exp = attacker_val
            val_defender_exp = defender_val

            attacker_policy.opponent_strategy = defender_policy

            # Log rewards
            exp_result.all_metrics[seed][agents_constants.T_FP.AVERAGE_BEST_RESPONSE_ATTACKER_RETURN].append(
                val_attacker_exp)
            exp_result.all_metrics[seed][agents_constants.T_FP.AVERAGE_BEST_RESPONSE_DEFENDER_RETURN].append(
                val_defender_exp)
            exp_result.all_metrics[seed][agents_constants.T_FP.RUNNING_AVERAGE_BEST_RESPONSE_ATTACKER_RETURN].append(
                ExperimentUtil.running_average(
                    exp_result.all_metrics[seed][agents_constants.T_FP.AVERAGE_BEST_RESPONSE_ATTACKER_RETURN],
                    self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value))
            exp_result.all_metrics[seed][agents_constants.T_FP.RUNNING_AVERAGE_BEST_RESPONSE_DEFENDER_RETURN].append(
                ExperimentUtil.running_average(
                    exp_result.all_metrics[seed][agents_constants.T_FP.AVERAGE_BEST_RESPONSE_DEFENDER_RETURN],
                    self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value))

            exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_ATTACKER_RETURN].append(val)
            exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_DEFENDER_RETURN].append(-val)
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_ATTACKER_RETURN].append(
                ExperimentUtil.running_average(
                    exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_ATTACKER_RETURN],
                    self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value))
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_DEFENDER_RETURN].append(
                ExperimentUtil.running_average(
                    exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_DEFENDER_RETURN],
                    self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value))

            # Log thresholds
            exp_result.all_metrics[seed][agents_constants.T_FP.ATTACKER_THRESHOLDS].append(
                attacker_policy.attacker_Theta)
            exp_result.all_metrics[seed][agents_constants.T_FP.DEFENDER_THRESHOLDS].append(
                defender_policy.defender_Theta)

            # Log stop distributions
            for k, v in attacker_policy.stop_distributions().items():
                exp_result.all_metrics[seed][k].append(v)
            for k, v in defender_policy.stop_distributions().items():
                exp_result.all_metrics[seed][k].append(v)

            # Log intrusion lengths
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.INTRUSION_LENGTH].append(
                round(strategy_profile_metrics[env_constants.ENV_METRICS.INTRUSION_LENGTH], 3))
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_INTRUSION_LENGTH].append(
                ExperimentUtil.running_average(
                    exp_result.all_metrics[seed][env_constants.ENV_METRICS.INTRUSION_LENGTH],
                    self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value))

            # Log stopping times
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.INTRUSION_START].append(
                round(strategy_profile_metrics[env_constants.ENV_METRICS.INTRUSION_START], 3))
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_INTRUSION_START].append(
                ExperimentUtil.running_average(
                    exp_result.all_metrics[seed][env_constants.ENV_METRICS.INTRUSION_START],
                    self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value))
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.TIME_HORIZON].append(
                round(strategy_profile_metrics[env_constants.ENV_METRICS.TIME_HORIZON], 3))
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_TIME_HORIZON].append(
                ExperimentUtil.running_average(
                    exp_result.all_metrics[seed][env_constants.ENV_METRICS.TIME_HORIZON],
                    self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value))
            for l in range(1, self.experiment_config.hparams[constants.T_SPSA.L].value + 1):
                exp_result.plot_metrics.append(env_constants.ENV_METRICS.STOP + f"_{l}")
                exp_result.all_metrics[seed][env_constants.ENV_METRICS.STOP + f"_{l}"].append(
                    round(strategy_profile_metrics[env_constants.ENV_METRICS.STOP + f"_{l}"], 3))
                exp_result.all_metrics[seed][env_constants.ENV_METRICS.STOP + f"_running_average_{l}"].append(
                    ExperimentUtil.running_average(
                        exp_result.all_metrics[seed][env_constants.ENV_METRICS.STOP + f"_{l}"],
                        self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value))

            # Log baseline returns
            exp_result.all_metrics[seed][env_constants.ENV_METRICS.AVERAGE_UPPER_BOUND_RETURN].append(
                round(strategy_profile_metrics[env_constants.ENV_METRICS.AVERAGE_UPPER_BOUND_RETURN], 3))
            exp_result.all_metrics[seed][
                env_constants.ENV_METRICS.AVERAGE_DEFENDER_BASELINE_STOP_ON_FIRST_ALERT_RETURN].append(
                round(
                    strategy_profile_metrics[
                        env_constants.ENV_METRICS.AVERAGE_DEFENDER_BASELINE_STOP_ON_FIRST_ALERT_RETURN], 3))

            # Compute and log exploitability
            exp = TFPAgent.exploitability(attacker_val=val_attacker_exp, defender_val=val_defender_exp)
            exp_result.all_metrics[seed][agents_constants.COMMON.EXPLOITABILITY].append(exp)
            running_avg_exp = ExperimentUtil.running_average(
                exp_result.all_metrics[seed][agents_constants.COMMON.EXPLOITABILITY],
                self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value)
            exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_EXPLOITABILITY].append(running_avg_exp)

            # Logging the progress
            if i % self.experiment_config.log_every == 0:
                Logger.__call__().get_logger().info(
                    f"[T-FP] i: {i}, Exp: {exp}, "
                    f"Exp_avg_{self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value}: "
                    f"{running_avg_exp}, game_val: {val} "
                    f"opt_val:{exp_result.all_metrics[seed][env_constants.ENV_METRICS.AVERAGE_UPPER_BOUND_RETURN][-1]},"
                    f" Defender val:{defender_val}, Attacker val:{attacker_val}, "
                    f"defender BR thresholds:{defender_thresholds},"
                    f" attacker BR thresholds: {attacker_thresholds},"
                    f" defender stop dists: {defender_policy.stop_distributions()}, "
                    f"attacker stop dists: {attacker_policy.stop_distributions()}")

                # Update training job
                total_iterations = len(random_seeds) * self.experiment_config.hparams[agents_constants.T_FP.N_2].value
                iterations_done = ((random_seeds.index(seed)) *
                                   self.experiment_config.hparams[agents_constants.T_FP.N_2].value + i)
                progress = round(iterations_done / total_iterations, 2)
                training_job.progress_percentage = progress
                MetastoreFacade.update_training_job(training_job=training_job, id=training_job.id)
        return exp_result

[docs]    def evaluate_defender_policy(self, defender_thresholds: List[float],
                                 attacker_strategy: MixedMultiThresholdStoppingPolicy) -> Dict[str, Union[float, int]]:
        """
        Monte-Carlo evaluation of the game value of a given defender policy against the average attacker strategy

        :param defender_thresholds: the defender strategy to evaluate
        :param attacker_strategy: the average attacker strategy
        :return: the average reward
        """
        defender_policy = MultiThresholdStoppingPolicy(
            theta=defender_thresholds, simulation_name=self.simulation_env_config.name,
            states=self.simulation_env_config.state_space_config.states,
            player_type=PlayerType.DEFENDER, L=self.defender_experiment_config.hparams[constants.T_SPSA.L].value,
            actions=self.defender_simulation_env_config.joint_action_space_config.action_spaces[
                self.defender_experiment_config.player_idx].actions, experiment_config=self.defender_experiment_config,
            avg_R=-1, agent_type=AgentType.NONE)
        self.defender_simulation_env_config.simulation_env_input_config.attacker_strategy = attacker_strategy
        env = gym.make(self.defender_simulation_env_config.gym_env_name,
                       config=self.defender_simulation_env_config.simulation_env_input_config)
        return self._eval_env(
            env=env, policy=defender_policy,
            num_iterations=self.experiment_config.hparams[
                agents_constants.T_FP.BEST_RESPONSE_EVALUATION_ITERATIONS].value)

[docs]    def evaluate_strategy_profile(self, defender_strategy: MixedMultiThresholdStoppingPolicy,
                                  attacker_strategy: MixedMultiThresholdStoppingPolicy) -> Dict[str, Union[float, int]]:
        """
        Monte-Carlo evaluation of the game value following a given strategy profile

        :param defender_strategy: the average defender strategy
        :param attacker_strategy: the average attacker strategy
        :return: the average reward
        """
        self.attacker_simulation_env_config.simulation_env_input_config.defender_strategy = defender_strategy
        env: BaseEnv = gym.make(self.attacker_simulation_env_config.gym_env_name,
                                config=self.attacker_simulation_env_config.simulation_env_input_config)
        env.unwrapped.set_model(attacker_strategy)
        attacker_strategy.opponent_strategy = env.unwrapped.static_defender_strategy
        return self._eval_env(
            env=env, policy=attacker_strategy,
            num_iterations=self.experiment_config.hparams[
                agents_constants.T_FP.EQUILIBRIUM_STRATEGIES_EVALUATION_ITERATIONS].value)

[docs]    def evaluate_attacker_policy(self, attacker_thresholds: List[List[float]],
                                 defender_strategy: MixedMultiThresholdStoppingPolicy,
                                 attacker_strategy: MixedMultiThresholdStoppingPolicy) -> Dict[str, Union[float, int]]:
        """
        Monte-Carlo evaluation of the game value of a given attacker policy against the average defender strategy

        :param defender_thresholds: the defender strategy to evaluate
        :param defender_strategy: the average defender strategy
        :param attacker_strategy: the average attacker strategy
        :return: the average reward
        """
        theta = [item for sublist in attacker_thresholds for item in sublist]
        attacker_policy = MultiThresholdStoppingPolicy(
            theta=theta, simulation_name=self.simulation_env_config.name,
            states=self.simulation_env_config.state_space_config.states,
            player_type=PlayerType.ATTACKER, L=self.attacker_experiment_config.hparams[constants.T_SPSA.L].value,
            actions=self.attacker_simulation_env_config.joint_action_space_config.action_spaces[
                self.attacker_experiment_config.player_idx].actions, experiment_config=self.attacker_experiment_config,
            avg_R=-1, agent_type=AgentType.NONE)
        self.attacker_simulation_env_config.simulation_env_input_config.defender_strategy = defender_strategy
        env: BaseEnv = gym.make(self.attacker_simulation_env_config.gym_env_name,
                                config=self.attacker_simulation_env_config.simulation_env_input_config)
        env.unwrapped.set_model(attacker_strategy)
        attacker_policy.opponent_strategy = env.unwrapped.static_defender_strategy
        return self._eval_env(
            env=env, policy=attacker_policy,
            num_iterations=self.experiment_config.hparams[
                agents_constants.T_FP.BEST_RESPONSE_EVALUATION_ITERATIONS].value)

[docs]    def defender_best_response(self, seed: int, attacker_strategy: MixedMultiThresholdStoppingPolicy) \
            -> Tuple[List[float], float]:
        """
        Learns a best response for the defender against a given attacker strategy

        :param seed: the random seed
        :param attacker_strategy: the attacker strategy
        :return: the learned thresholds and the value
        """
        self.defender_experiment_config.random_seeds = [seed]
        self.defender_experiment_config.output_dir = str(self.root_output_dir)
        self.defender_simulation_env_config.simulation_env_input_config.attacker_strategy = attacker_strategy
        env = gym.make(self.defender_simulation_env_config.gym_env_name,
                       config=self.defender_simulation_env_config.simulation_env_input_config)
        agent = TSPSAAgent(emulation_env_config=self.emulation_env_config,
                           simulation_env_config=self.defender_simulation_env_config,
                           experiment_config=self.defender_experiment_config, env=env, save_to_metastore=False)
        Logger.__call__().get_logger().info(f"[T-FP] Starting training of the defender's best response "
                                            f"against attacker strategy: {attacker_strategy}")
        experiment_execution = agent.train()
        policy: MultiThresholdStoppingPolicy = experiment_execution.result.policies[seed]
        thresholds = policy.thresholds()
        val = experiment_execution.result.avg_metrics[agents_constants.COMMON.RUNNING_AVERAGE_RETURN][-1]
        return thresholds, val

    def _eval_env(self, env: BaseEnv, policy: Policy, num_iterations: int) -> Dict[str, Union[float, int]]:
        """
        Evaluates a given policy

        :param env: the environment to use for evaluation
        :param policy: the policy to evaluate
        :param num_iterations: number of iterations to evaluate
        :return: the average reward
        """
        metrics: Dict[str, Any] = {}
        for j in range(num_iterations):
            done = False
            o, _ = env.reset()
            J = 0
            t = 1
            while not done and t <= self.experiment_config.hparams[agents_constants.COMMON.MAX_ENV_STEPS].value:
                a = policy.action(o=o)
                o, r, done, _, info = env.step(a)
                J += r
                t += 1
            metrics = TSPSAAgent.update_metrics(metrics=metrics, info=info)
        avg_metrics = TSPSAAgent.compute_avg_metrics(metrics=metrics)
        return avg_metrics

[docs]    @staticmethod
    def update_metrics(metrics: Dict[str, List[Union[float, int]]], info: Dict[str, Union[float, int]]) \
            -> Dict[str, List[Union[float, int]]]:
        """
        Update a dict with aggregated metrics using new information from the environment

        :param metrics: the dict with the aggregated metrics
        :param info: the new information
        :return: the updated dict
        """
        for k, v in info.items():
            if k in metrics:
                metrics[k].append(round(v, 3))
            else:
                metrics[k] = [v]
        return metrics

[docs]    @staticmethod
    def compute_avg_metrics(metrics: Dict[str, List[Union[float, int]]]) -> Dict[str, Union[float, int]]:
        """
        Computes the average metrics of a dict with aggregated metrics

        :param metrics: the dict with the aggregated metrics
        :return: the average metrics
        """
        avg_metrics = {}
        for k, v in metrics.items():
            avg = round(sum(v) / len(v), 2)
            avg_metrics[k] = avg
        return avg_metrics

[docs]    def attacker_best_response(self, seed: int, defender_strategy: MixedMultiThresholdStoppingPolicy,
                               attacker_strategy: MixedMultiThresholdStoppingPolicy) -> Tuple[List[List[float]], float]:
        """
        Learns a threshold best response strategy for the attacker against a given defender strategy

        :param seed: the random seed
        :param defender_strategy: the defender strategy
        :param attacker_strategy: the attacker strategy
        :return: the learned threshold strategy and its estimated value
        """
        self.attacker_experiment_config.random_seeds = [seed]
        self.attacker_experiment_config.output_dir = str(self.root_output_dir)
        self.attacker_simulation_env_config.simulation_env_input_config.defender_strategy = defender_strategy
        env: BaseEnv = gym.make(self.attacker_simulation_env_config.gym_env_name,
                                config=self.attacker_simulation_env_config.simulation_env_input_config)
        env.unwrapped.set_model(attacker_strategy)
        agent = TSPSAAgent(emulation_env_config=self.emulation_env_config,
                           simulation_env_config=self.attacker_simulation_env_config,
                           experiment_config=self.attacker_experiment_config,
                           env=env, save_to_metastore=False)
        Logger.__call__().get_logger().info(f"[T-FP] Starting training of the attacker's best response "
                                            f"against defender strategy: {defender_strategy}")
        experiment_execution = agent.train()
        policy: MultiThresholdStoppingPolicy = experiment_execution.result.policies[seed]
        thresholds = policy.thresholds()
        val = experiment_execution.result.avg_metrics[agents_constants.COMMON.RUNNING_AVERAGE_RETURN][-1]
        attacker_thresholds = [
            thresholds[0:self.attacker_experiment_config.hparams[constants.T_SPSA.L].value],
            thresholds[self.attacker_experiment_config.hparams[constants.T_SPSA.L].value:]
        ]
        return attacker_thresholds, val

[docs]    def hparam_names(self) -> List[str]:
        """
        :return: a list with the hyperparameter names
        """
        return [constants.T_SPSA.a, constants.T_SPSA.c, constants.T_SPSA.LAMBDA,
                constants.T_SPSA.A, constants.T_SPSA.EPSILON, constants.T_SPSA.N,
                constants.T_SPSA.L, agents_constants.T_FP.THETA1_ATTACKER, agents_constants.T_FP.THETA1_DEFENDER,
                agents_constants.COMMON.EVAL_BATCH_SIZE,
                agents_constants.T_FP.N_2, constants.T_SPSA.GRADIENT_BATCH_SIZE,
                agents_constants.COMMON.CONFIDENCE_INTERVAL, agents_constants.COMMON.RUNNING_AVERAGE,
                agents_constants.T_FP.BEST_RESPONSE_EVALUATION_ITERATIONS,
                agents_constants.T_FP.EQUILIBRIUM_STRATEGIES_EVALUATION_ITERATIONS,
                constants.T_SPSA.POLICY_TYPE, constants.T_SPSA.OBJECTIVE_TYPE]

[docs]    @staticmethod
    def exploitability(attacker_val: float, defender_val: float) -> float:
        """
        Computes the exploitability metric given the value of the attacker when following a best response
        against the current defender strategy and the value of the defender when following a best response against
        the current attacker strategy.

        :param attacker_val: the value of the attacker when following a best response against
                             the current defender strategy
        :param defender_val: the value of the defender when following a best response against the
                             current attacker strategy
        :return: the exploitability
        """
        return round(math.fabs(attacker_val + defender_val), 2)

[docs]    def get_defender_experiment_config(self) -> ExperimentConfig:
        """
        :return: the experiment configuration for learning a best response of the defender
        """
        hparams = {
            constants.T_SPSA.N: self.experiment_config.hparams[constants.T_SPSA.N],
            constants.T_SPSA.c: self.experiment_config.hparams[constants.T_SPSA.c],
            constants.T_SPSA.a: self.experiment_config.hparams[constants.T_SPSA.a],
            constants.T_SPSA.A: self.experiment_config.hparams[constants.T_SPSA.A],
            constants.T_SPSA.POLICY_TYPE: self.experiment_config.hparams[constants.T_SPSA.POLICY_TYPE],
            constants.T_SPSA.OBJECTIVE_TYPE: self.experiment_config.hparams[constants.T_SPSA.OBJECTIVE_TYPE],
            constants.T_SPSA.LAMBDA: self.experiment_config.hparams[constants.T_SPSA.LAMBDA],
            constants.T_SPSA.EPSILON: self.experiment_config.hparams[constants.T_SPSA.EPSILON],
            constants.T_SPSA.L: self.experiment_config.hparams[constants.T_SPSA.L],
            agents_constants.COMMON.EVAL_BATCH_SIZE: self.experiment_config.hparams[
                agents_constants.COMMON.EVAL_BATCH_SIZE],
            agents_constants.COMMON.CONFIDENCE_INTERVAL: self.experiment_config.hparams[
                agents_constants.COMMON.CONFIDENCE_INTERVAL],
            agents_constants.COMMON.MAX_ENV_STEPS: self.experiment_config.hparams[
                agents_constants.COMMON.MAX_ENV_STEPS],
            constants.T_SPSA.GRADIENT_BATCH_SIZE: self.experiment_config.hparams[
                constants.T_SPSA.GRADIENT_BATCH_SIZE],
            agents_constants.COMMON.RUNNING_AVERAGE: self.experiment_config.hparams[
                agents_constants.COMMON.RUNNING_AVERAGE],
            agents_constants.COMMON.GAMMA: self.experiment_config.hparams[agents_constants.COMMON.GAMMA]
        }
        if agents_constants.T_FP.THETA1_DEFENDER in self.experiment_config.hparams:
            hparams[constants.T_SPSA.THETA1] = \
                self.experiment_config.hparams[agents_constants.T_FP.THETA1_DEFENDER]
        return ExperimentConfig(
            output_dir=str(self.root_output_dir),
            title="Learning a best response of the defender as part of T-FP",
            random_seeds=[], agent_type=AgentType.T_SPSA,
            log_every=self.experiment_config.br_log_every,
            hparams=hparams,
            player_type=PlayerType.DEFENDER, player_idx=0
        )

[docs]    def get_attacker_experiment_config(self) -> ExperimentConfig:
        """
        :return: the experiment configuration for learning a best response of the attacker
        """
        hparams = {
            constants.T_SPSA.N: self.experiment_config.hparams[constants.T_SPSA.N],
            constants.T_SPSA.c: self.experiment_config.hparams[constants.T_SPSA.c],
            constants.T_SPSA.a: self.experiment_config.hparams[constants.T_SPSA.a],
            constants.T_SPSA.A: self.experiment_config.hparams[constants.T_SPSA.A],
            constants.T_SPSA.POLICY_TYPE: self.experiment_config.hparams[constants.T_SPSA.POLICY_TYPE],
            constants.T_SPSA.OBJECTIVE_TYPE: self.experiment_config.hparams[constants.T_SPSA.OBJECTIVE_TYPE],
            constants.T_SPSA.LAMBDA: self.experiment_config.hparams[constants.T_SPSA.LAMBDA],
            constants.T_SPSA.EPSILON: self.experiment_config.hparams[constants.T_SPSA.EPSILON],
            constants.T_SPSA.L: self.experiment_config.hparams[constants.T_SPSA.L],
            agents_constants.COMMON.EVAL_BATCH_SIZE: self.experiment_config.hparams[
                agents_constants.COMMON.EVAL_BATCH_SIZE],
            agents_constants.COMMON.CONFIDENCE_INTERVAL: self.experiment_config.hparams[
                agents_constants.COMMON.CONFIDENCE_INTERVAL],
            agents_constants.COMMON.MAX_ENV_STEPS: self.experiment_config.hparams[
                agents_constants.COMMON.MAX_ENV_STEPS],
            constants.T_SPSA.GRADIENT_BATCH_SIZE: self.experiment_config.hparams[
                constants.T_SPSA.GRADIENT_BATCH_SIZE],
            agents_constants.COMMON.RUNNING_AVERAGE: self.experiment_config.hparams[
                agents_constants.COMMON.RUNNING_AVERAGE],
            agents_constants.COMMON.GAMMA: self.experiment_config.hparams[
                agents_constants.COMMON.GAMMA]
        }
        if agents_constants.T_FP.THETA1_ATTACKER in self.experiment_config.hparams:
            hparams[constants.T_SPSA.THETA1] = \
                self.experiment_config.hparams[agents_constants.T_FP.THETA1_ATTACKER]
        return ExperimentConfig(
            output_dir=str(self.root_output_dir),
            title="Learning a best response of the attacker as part of T-FP",
            random_seeds=[], agent_type=AgentType.T_SPSA,
            log_every=self.experiment_config.br_log_every,
            hparams=hparams,
            player_type=PlayerType.ATTACKER, player_idx=1
        )

[docs]    @staticmethod
    def round_vec(vec) -> List[float]:
        """
        Rounds a vector to 3 decimals

        :param vec: the vector to round
        :return: the rounded vector
        """
        return list(map(lambda x: round(x, 3), vec))

[docs]    @staticmethod
    def running_average(x: List[float], N: int) -> List[float]:
        """
        Calculates the running average of the last N elements of vector x

        :param x: the vector
        :param N: the number of elements to use for average calculation
        :return: the running average vector
        """
        if len(x) >= N:
            y = np.copy(x)
            y[N - 1:] = np.convolve(x, np.ones((N,)) / N, mode='valid')
        else:
            N = len(x)
            y = np.copy(x)
            y[N - 1:] = np.convolve(x, np.ones((N,)) / N, mode='valid')
        return list(y.tolist())