Source code for csle_agents.agents.shapley_iteration.shapley_iteration_agent

from typing import List, Optional, Tuple, Any
import math
import time
import os
import numpy as np
import numpy.typing as npt
import pulp
from csle_common.dao.simulation_config.simulation_env_config import SimulationEnvConfig
from csle_common.dao.training.experiment_config import ExperimentConfig
from csle_common.dao.training.experiment_result import ExperimentResult
from csle_common.dao.training.agent_type import AgentType
from csle_common.util.experiment_util import ExperimentUtil
from csle_common.logging.log import Logger
from csle_common.metastore.metastore_facade import MetastoreFacade
from csle_common.dao.jobs.training_job_config import TrainingJobConfig
from csle_common.dao.training.experiment_execution import ExperimentExecution
from csle_common.dao.training.tabular_policy import TabularPolicy
from csle_common.util.general_util import GeneralUtil
from csle_agents.agents.base.base_agent import BaseAgent
import csle_agents.constants.constants as agents_constants


[docs]class ShapleyIterationAgent(BaseAgent):
    """
    Shapley Iteration Agent
    """

    def __init__(self, simulation_env_config: SimulationEnvConfig, experiment_config: ExperimentConfig,
                 training_job: Optional[TrainingJobConfig] = None, save_to_metastore: bool = True):
        """
        Initializes the value iteration agent

        :param simulation_env_config: configuration of the simulation environment
        :param experiment_config: the experiment configuration
        :param training_job: an existing training job to use (optional)
        :param save_to_metastore: boolean flag whether to save the execution to the metastore
        """
        super().__init__(simulation_env_config=simulation_env_config, emulation_env_config=None,
                         experiment_config=experiment_config)
        assert experiment_config.agent_type == AgentType.SHAPLEY_ITERATION
        self.training_job = training_job
        self.save_to_metastore = save_to_metastore

[docs]    def train(self) -> ExperimentExecution:
        """
        Runs the value iteration algorithm to compute V*

        :return: the results
        """
        pid = os.getpid()

        # Initialize metrics
        exp_result = ExperimentResult()
        exp_result.plot_metrics.append(agents_constants.SHAPLEY_ITERATION.DELTA)

        descr = f"Computation of Nash equilibrium with the Shapley Iteration algorithm using " \
                f"simulation:{self.simulation_env_config.name}"

        for seed in self.experiment_config.random_seeds:
            exp_result.all_metrics[seed] = {}
            exp_result.all_metrics[seed][agents_constants.SHAPLEY_ITERATION.DELTA] = []

        # Initialize training job
        if self.training_job is None:
            self.training_job = TrainingJobConfig(
                simulation_env_name=self.simulation_env_config.name, experiment_config=self.experiment_config,
                progress_percentage=0, pid=pid, experiment_result=exp_result,
                emulation_env_name=None, simulation_traces=[],
                num_cached_traces=0,
                log_file_path=Logger.__call__().get_log_file_path(), descr=descr,
                physical_host_ip=GeneralUtil.get_host_ip())
            if self.save_to_metastore:
                training_job_id = MetastoreFacade.save_training_job(training_job=self.training_job)
                self.training_job.id = training_job_id
        else:
            self.training_job.pid = pid
            self.training_job.progress_percentage = 0
            self.training_job.experiment_result = exp_result
            if self.save_to_metastore:
                MetastoreFacade.update_training_job(training_job=self.training_job, id=self.training_job.id)

        # Initialize execution result
        ts = time.time()
        emulation_name = None
        if self.emulation_env_config is not None:
            emulation_name = self.emulation_env_config.name
        simulation_name = self.simulation_env_config.name
        self.exp_execution = ExperimentExecution(result=exp_result, config=self.experiment_config, timestamp=ts,
                                                 emulation_name=emulation_name, simulation_name=simulation_name,
                                                 descr=descr, log_file_path=self.training_job.log_file_path)
        if self.save_to_metastore:
            exp_execution_id = MetastoreFacade.save_experiment_execution(self.exp_execution)
            self.exp_execution.id = exp_execution_id

        for seed in self.experiment_config.random_seeds:
            ExperimentUtil.set_seed(seed)
            exp_result = self.shapley_iteration(exp_result=exp_result, seed=seed)

        # Calculate average and std metrics
        exp_result.avg_metrics = {}
        exp_result.std_metrics = {}
        for metric in exp_result.all_metrics[self.experiment_config.random_seeds[0]].keys():
            value_vectors = []
            for seed in self.experiment_config.random_seeds:
                value_vectors.append(exp_result.all_metrics[seed][metric])

            avg_metrics = []
            std_metrics = []
            for i in range(len(value_vectors[0])):
                if type(value_vectors[0][0]) is int or type(value_vectors[0][0]) is float \
                        or type(value_vectors[0][0]) is np.int64 or type(value_vectors[0][0]) is np.float64:
                    seed_values = []
                    for seed_idx in range(len(self.experiment_config.random_seeds)):
                        seed_values.append(value_vectors[seed_idx][i])
                    avg = ExperimentUtil.mean_confidence_interval(
                        data=seed_values,
                        confidence=self.experiment_config.hparams[agents_constants.COMMON.CONFIDENCE_INTERVAL].value)[0]
                    if not math.isnan(avg):
                        avg_metrics.append(avg)
                    ci = ExperimentUtil.mean_confidence_interval(
                        data=seed_values,
                        confidence=self.experiment_config.hparams[agents_constants.COMMON.CONFIDENCE_INTERVAL].value)[1]
                    if not math.isnan(ci):
                        std_metrics.append(ci)
                    else:
                        std_metrics.append(-1)
                else:
                    avg_metrics.append(-1)
                    std_metrics.append(-1)
                exp_result.avg_metrics[metric] = avg_metrics
                exp_result.std_metrics[metric] = std_metrics

                ts = time.time()
        self.exp_execution.timestamp = ts
        self.exp_execution.result = exp_result
        self.training_job.experiment_result = exp_result
        if self.save_to_metastore:
            MetastoreFacade.update_experiment_execution(experiment_execution=self.exp_execution,
                                                        id=self.exp_execution.id)
            MetastoreFacade.update_training_job(training_job=self.training_job, id=self.training_job.id)
        return self.exp_execution

[docs]    def hparam_names(self) -> List[str]:
        """
        :return: a list with the hyperparameter names
        """
        return [agents_constants.COMMON.EVAL_BATCH_SIZE, agents_constants.COMMON.CONFIDENCE_INTERVAL,
                agents_constants.COMMON.RUNNING_AVERAGE, agents_constants.COMMON.GAMMA,
                agents_constants.SHAPLEY_ITERATION.TRANSITION_TENSOR,
                agents_constants.SHAPLEY_ITERATION.REWARD_TENSOR,
                agents_constants.SHAPLEY_ITERATION.STATE_SPACE,
                agents_constants.SHAPLEY_ITERATION.ACTION_SPACE_PLAYER_1,
                agents_constants.SHAPLEY_ITERATION.ACTION_SPACE_PLAYER_2, agents_constants.SHAPLEY_ITERATION.DELTA,
                agents_constants.SHAPLEY_ITERATION.N]

[docs]    def shapley_iteration(self, exp_result: ExperimentResult, seed: int) -> ExperimentResult:
        """
        Runs the Shapley iteration algorithm

        :param exp_result: the experiment result object
        :param seed: the random seed
        :return: the updated experiment result
        """
        discount_factor = self.experiment_config.hparams[agents_constants.COMMON.GAMMA].value
        T = self.experiment_config.hparams[agents_constants.SHAPLEY_ITERATION.TRANSITION_TENSOR].value
        R = self.experiment_config.hparams[agents_constants.SHAPLEY_ITERATION.REWARD_TENSOR].value
        A1 = self.experiment_config.hparams[agents_constants.SHAPLEY_ITERATION.ACTION_SPACE_PLAYER_1].value
        A2 = self.experiment_config.hparams[agents_constants.SHAPLEY_ITERATION.ACTION_SPACE_PLAYER_2].value
        S = self.experiment_config.hparams[agents_constants.SHAPLEY_ITERATION.STATE_SPACE].value
        N = self.experiment_config.hparams[agents_constants.SHAPLEY_ITERATION.N].value
        delta = self.experiment_config.hparams[agents_constants.SHAPLEY_ITERATION.DELTA].value
        Logger.__call__().get_logger().info("Starting the shapley iteration algorithm")
        V, maximin_strategies, minimax_strategies, auxillary_games, deltas = self.si(
            S=np.array(S), A1=np.array(A1), A2=np.array(A2), R=np.array(R), T=np.array(T), gamma=discount_factor,
            max_iterations=N, delta_threshold=delta)
        exp_result.all_metrics[seed][agents_constants.VI.DELTA] = deltas
        tabular_policy_p1 = TabularPolicy(
            player_type=self.experiment_config.player_type,
            actions=self.simulation_env_config.joint_action_space_config.action_spaces[
                self.experiment_config.player_idx].actions, agent_type=self.experiment_config.agent_type,
            value_function=list(V), lookup_table=list(maximin_strategies),
            simulation_name=self.simulation_env_config.name, avg_R=V[0])
        tabular_policy_p2 = TabularPolicy(
            player_type=self.experiment_config.player_type,
            actions=self.simulation_env_config.joint_action_space_config.action_spaces[
                self.experiment_config.player_idx].actions, agent_type=self.experiment_config.agent_type,
            value_function=list(V), lookup_table=list(minimax_strategies),
            simulation_name=self.simulation_env_config.name, avg_R=V[0])
        exp_result.policies[seed] = tabular_policy_p1
        exp_result.policies[seed + 1] = tabular_policy_p2
        return exp_result

[docs]    def auxillary_game(self, V: npt.NDArray[Any], gamma: float, S: npt.NDArray[Any], s: int,
                       A1: npt.NDArray[Any], A2: npt.NDArray[Any], R: npt.NDArray[Any],
                       T: npt.NDArray[Any]) -> npt.NDArray[Any]:
        """
        Creates an auxillary matrix game based on the value function V

        :param V: the value function
        :param gamma: the discount factor
        :param S: the set of states
        :param s: the state s
        :param A1: the set of actions of player 1
        :param A2: the set of actions of player 2
        :param R: the reward tensor
        :param T: the transition tensor
        :return: the matrix auxillary game
        """
        A = np.zeros((len(A1), len(A2)))
        for a1 in A1:
            for a2 in A2:
                immediate_reward = R[a1][a2][s]
                expected_future_reward = 0.0
                for s_prime in S:
                    expected_future_reward += T[a1][a2][s][s_prime] * V[s_prime]
                expected_future_reward = expected_future_reward * gamma
                A[a1][a2] = immediate_reward + expected_future_reward
        return A

[docs]    def compute_matrix_game_value(self, A: npt.NDArray[Any], A1: npt.NDArray[Any], A2: npt.NDArray[Any],
                                  maximizer: bool = True):
        """

        :param A: the matrix game
        :param A1: the set of actions of player 1
        :param A2: the set of acitons of player 2
        :param maximizer: a boolean flag indicating whether the maximin or minimax strategy should be computed
        :return: (val(A), maximin/minimax)
        """
        if maximizer:
            problem = pulp.LpProblem("AuxillaryGame", pulp.LpMaximize)
            Ai = A1
        else:
            problem = pulp.LpProblem("AuxillaryGame", pulp.LpMinimize)
            Ai = A2

        # Decision variables, strategy-weights
        s = []
        for ai in Ai:
            si = pulp.LpVariable("s_" + str(ai), lowBound=0, upBound=1, cat=pulp.LpContinuous)
            s.append(si)

        # Auxillary decision variable, value of the game v
        v = pulp.LpVariable("v", lowBound=None, upBound=None, cat=pulp.LpContinuous)

        # The objective function
        problem += v, "Value of the game"

        # The constraints
        if maximizer:
            for j in range(A.shape[1]):
                sum = 0
                for i in range(A.shape[0]):
                    sum += s[i] * A[i][j]
                problem += sum >= v, "SecurityValueConstraint_" + str(j)
        else:
            for i in range(A.shape[0]):
                sum = 0
                for j in range(A.shape[1]):
                    sum += s[j] * A[i][j]
                problem += sum <= v, "SecurityValueConstraint_" + str(i)

        strategy_weights_sum = 0
        for si in s:
            strategy_weights_sum += si
        problem += strategy_weights_sum == 1, "probabilities sum"

        # Solve
        problem.solve(pulp.PULP_CBC_CMD(msg=0))

        # Obtain solution
        optimal_strategy = list(map(lambda x: x.varValue, s))
        value = v.varValue
        return value, optimal_strategy

[docs]    def si(self, S: npt.NDArray[Any], A1: npt.NDArray[Any], A2: npt.NDArray[Any], R: npt.NDArray[Any],
           T: npt.NDArray[Any], gamma: float = 1, max_iterations: int = 500, delta_threshold: float = 0.1) \
            -> Tuple[npt.NDArray[Any], npt.NDArray[Any], npt.NDArray[Any], npt.NDArray[Any], List[float]]:
        """
        Shapley Iteration (L. Shapley 1953)

        :param S: the set of states of the SG
        :param A1: the set of actions of player 1 in the SG
        :param A2: the set of actions of player 2 in the SG
        :param R: the reward tensor in the SG
        :param T: the transition tensor in the SG
        :param gamma: the discount factor
        :param max_iterations: the maximum number of iterations
        :param delta_threshold: the stopping threshold
        :return: the value function, the set of maximin strategies for all stage games,
        the set of minimax strategies for all stage games, and the stage games themselves
        """
        deltas = []
        num_states = len(S)
        V = np.zeros(num_states)
        for i in range(max_iterations):
            delta = 0.0
            auxillary_games = []
            for s in S:
                A = self.auxillary_game(V=V, gamma=gamma, S=S, s=s, A1=A1, A2=A2, R=R, T=T)
                auxillary_games.append(A)

            for s in S:
                value, _ = self.compute_matrix_game_value(A=auxillary_games[s], A1=A1, A2=A2, maximizer=True)
                delta += abs(V[s] - value)
                V[s] = value

            deltas.append(delta)

            if i % self.experiment_config.log_every == 0 and i > 0:
                Logger.__call__().get_logger().info(f"[Shapley iteration] i:{i}, delta: {delta}, V: {V}")

            if delta <= delta_threshold:
                break

        maximin_strategies = []
        minimax_strategies = []
        auxillary_games = []
        for s in S:
            A = self.auxillary_game(V=V, gamma=gamma, S=S, s=s, A1=A1, A2=A2, R=R, T=T)
            v1, maximin_strategy = self.compute_matrix_game_value(A=A, A1=A1, A2=A2, maximizer=True)
            v2, minimax_strategy = self.compute_matrix_game_value(A=A, A1=A1, A2=A2, maximizer=False)
            maximin_strategies.append(maximin_strategy)
            minimax_strategies.append(minimax_strategy)
            auxillary_games.append(A)

        return V, np.array(maximin_strategies), np.array(minimax_strategies), np.array(auxillary_games), deltas