"""
MIT License
Copyright (c) 2019 CleanRL developers https://github.com/vwxyzjn/cleanrl
"""
from typing import Union, List, Optional, Callable, Tuple, Any
import random
import time
import gymnasium as gym
from gymnasium.wrappers.common import RecordEpisodeStatistics
from gymnasium.spaces.discrete import Discrete
import os
import numpy as np
import torch
import torch.optim as optim
import torch.nn.utils.clip_grad as clip_grad
from torch.distributions.categorical import Categorical
import csle_common.constants.constants as constants
from csle_common.dao.emulation_config.emulation_env_config import EmulationEnvConfig
from csle_common.dao.simulation_config.simulation_env_config import SimulationEnvConfig
from csle_common.dao.training.experiment_config import ExperimentConfig
from csle_common.dao.training.experiment_execution import ExperimentExecution
from csle_common.dao.training.experiment_result import ExperimentResult
from csle_common.dao.training.agent_type import AgentType
from csle_common.util.experiment_util import ExperimentUtil
from csle_common.logging.log import Logger
from csle_common.metastore.metastore_facade import MetastoreFacade
from csle_common.dao.jobs.training_job_config import TrainingJobConfig
from csle_common.util.general_util import GeneralUtil
from csle_common.dao.simulation_config.base_env import BaseEnv
from csle_common.dao.training.ppo_policy import PPOPolicy
from csle_agents.agents.base.base_agent import BaseAgent
from csle_common.models.ppo_network import PPONetwork
import csle_agents.constants.constants as agents_constants


class PPGCleanAgent(BaseAgent):
"""
A Phasic Policy Gradient agent using the implementation from CleanRL
"""
def __init__(self, simulation_env_config: SimulationEnvConfig,
emulation_env_config: Union[None, EmulationEnvConfig], experiment_config: ExperimentConfig,
training_job: Optional[TrainingJobConfig] = None, save_to_metastore: bool = True) -> None:
"""
Initializes the agent, and sets the hyperparameters as attributes of the class representing the agent.
:param simulation_env_config: the simulation environment configuration
:param emulation_env_config: the emulation environment configuration
:param experiment_config: the experiment configuration
:param training_job: the training job
:param save_to_metastore: boolean flag indicating whether the results should be saved to the metastore or not
"""
super(PPGCleanAgent, self).__init__(simulation_env_config=simulation_env_config,
emulation_env_config=emulation_env_config,
experiment_config=experiment_config)
assert experiment_config.agent_type == AgentType.PPG_CLEAN
self.training_job = training_job
self.save_to_metastore = save_to_metastore
config = self.simulation_env_config.simulation_env_input_config
self.orig_env: BaseEnv = gym.make(self.simulation_env_config.gym_env_name, config=config)

    def train(self) -> ExperimentExecution:
"""
Runs the training process
:return: the results
"""
pid = os.getpid()
# Setup experiment metrics
exp_result = ExperimentResult()
exp_result.plot_metrics.append(agents_constants.COMMON.AVERAGE_RETURN)
exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_RETURN)
exp_result.plot_metrics.append(agents_constants.COMMON.RUNNING_AVERAGE_TIME_HORIZON)
exp_result.plot_metrics.append(agents_constants.COMMON.AVERAGE_TIME_HORIZON)
exp_result.plot_metrics.append(agents_constants.COMMON.AVERAGE_UPPER_BOUND_RETURN)
exp_result.plot_metrics.append(agents_constants.COMMON.AVERAGE_RANDOM_RETURN)
exp_result.plot_metrics.append(agents_constants.COMMON.AVERAGE_HEURISTIC_RETURN)
exp_result.plot_metrics.append(agents_constants.COMMON.RUNTIME)
descr = f"Training of policies with Clean-PPG using " \
f"simulation:{self.simulation_env_config.name}"
# Setup training job
if self.training_job is None:
emulation_name = ""
if self.emulation_env_config is not None:
emulation_name = self.emulation_env_config.name
self.training_job = TrainingJobConfig(
simulation_env_name=self.simulation_env_config.name, experiment_config=self.experiment_config,
progress_percentage=0, pid=pid, experiment_result=exp_result,
emulation_env_name=emulation_name, simulation_traces=[],
num_cached_traces=agents_constants.COMMON.NUM_CACHED_SIMULATION_TRACES,
log_file_path=Logger.__call__().get_log_file_path(), descr=descr,
physical_host_ip=GeneralUtil.get_host_ip())
training_job_id = -1
if self.save_to_metastore:
training_job_id = MetastoreFacade.save_training_job(training_job=self.training_job)
self.training_job.id = training_job_id
else:
self.training_job.pid = pid
self.training_job.progress_percentage = 0
self.training_job.experiment_result = exp_result
if self.save_to_metastore:
MetastoreFacade.update_training_job(training_job=self.training_job, id=self.training_job.id)
# Setup experiment execution
ts = time.time()
emulation_name = ""
if self.emulation_env_config is not None:
emulation_name = self.emulation_env_config.name
simulation_name = self.simulation_env_config.name
self.exp_execution = ExperimentExecution(
result=exp_result, config=self.experiment_config, timestamp=ts,
emulation_name=emulation_name, simulation_name=simulation_name, descr=descr,
log_file_path=self.training_job.log_file_path)
exp_execution_id = -1
if self.save_to_metastore:
exp_execution_id = MetastoreFacade.save_experiment_execution(self.exp_execution)
self.exp_execution.id = exp_execution_id
# Training runs, one per seed
for seed in self.experiment_config.random_seeds:
# Train
exp_result, env, model = self.run_ppg(exp_result=exp_result, seed=seed)
# Save policy
ts = time.time()
save_path = f"{self.experiment_config.output_dir}/ppo_policy_seed_{seed}_{ts}.zip"
model.save(save_path)
policy = PPOPolicy(
model=model, simulation_name=self.simulation_env_config.name, save_path=save_path,
states=self.simulation_env_config.state_space_config.states,
actions=self.simulation_env_config.joint_action_space_config.action_spaces[
self.experiment_config.player_idx].actions, player_type=self.experiment_config.player_type,
experiment_config=self.experiment_config,
avg_R=exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_RETURN][-1])
exp_result.policies[seed] = policy
# Save policy metadata
if self.save_to_metastore:
MetastoreFacade.save_ppo_policy(ppo_policy=policy)
os.chmod(save_path, 0o777)
# Save trace
traces = env.get_traces()
if len(traces) > 0 and self.save_to_metastore:
MetastoreFacade.save_simulation_trace(traces[-1])
env.reset_traces()
# Calculate average and std metrics
exp_result.avg_metrics = {}
exp_result.std_metrics = {}
for metric in exp_result.all_metrics[self.experiment_config.random_seeds[0]].keys():
value_vectors = []
for seed in self.experiment_config.random_seeds:
value_vectors.append(exp_result.all_metrics[seed][metric])
avg_metrics = []
std_metrics = []
for i in range(len(value_vectors[0])):
seed_values = []
for seed_idx in range(len(self.experiment_config.random_seeds)):
seed_values.append(value_vectors[seed_idx][i])
avg_metrics.append(ExperimentUtil.mean_confidence_interval(
data=seed_values,
confidence=self.experiment_config.hparams[agents_constants.COMMON.CONFIDENCE_INTERVAL].value)[0])
std_metrics.append(ExperimentUtil.mean_confidence_interval(
data=seed_values,
confidence=self.experiment_config.hparams[agents_constants.COMMON.CONFIDENCE_INTERVAL].value)[1])
exp_result.avg_metrics[metric] = avg_metrics
exp_result.std_metrics[metric] = std_metrics
return self.exp_execution

    def run_ppg(self, exp_result: ExperimentResult, seed: int) -> Tuple[ExperimentResult, BaseEnv, PPONetwork]:
"""
Runs PPG with a given seed
:param exp_result: the object to save the experiment results
:param seed: the random seed
:return: the updated experiment results, the environment, and the trained model
"""
Logger.__call__().get_logger().info(f"[CleanPPG] Start training; seed: {seed}")
clip_coef = self.experiment_config.hparams[agents_constants.PPG_CLEAN.CLIP_COEF].value
adv_norm_fullbatch = self.experiment_config.hparams[agents_constants.PPG_CLEAN.ADV_NORM_FULLBATCH].value
clip_vloss = self.experiment_config.hparams[agents_constants.PPG_CLEAN.CLIP_VLOSS].value
ent_coef = self.experiment_config.hparams[agents_constants.PPG_CLEAN.ENT_COEF].value
max_grad_norm = self.experiment_config.hparams[agents_constants.PPG_CLEAN.MAX_GRAD_NORM].value
target_kl = self.experiment_config.hparams[agents_constants.PPG_CLEAN.TARGET_KL].value
vf_coef = self.experiment_config.hparams[agents_constants.PPG_CLEAN.VF_COEF].value
learning_rate = self.experiment_config.hparams[agents_constants.PPG_CLEAN.LEARNING_RATE].value
num_steps = self.experiment_config.hparams[agents_constants.PPG_CLEAN.NUM_STEPS].value
aux_batch_rollouts = self.experiment_config.hparams[agents_constants.PPG_CLEAN.AUX_BATCH_ROLLOUTS].value
n_iteration = self.experiment_config.hparams[agents_constants.PPG_CLEAN.N_ITERATION].value
anneal_lr = self.experiment_config.hparams[agents_constants.PPG_CLEAN.ANNEAL_LR].value
gamma = self.experiment_config.hparams[agents_constants.COMMON.GAMMA].value
gae_lambda = self.experiment_config.hparams[agents_constants.PPG_CLEAN.GAE_LAMBDA].value
e_policy = self.experiment_config.hparams[agents_constants.PPG_CLEAN.E_POLICY].value
beta_clone = self.experiment_config.hparams[agents_constants.PPG_CLEAN.BETA_CLONE].value
n_aux_grad_accum = self.experiment_config.hparams[agents_constants.PPG_CLEAN.NUM_AUX_GRAD_ACCUM].value
num_aux_rollouts = self.experiment_config.hparams[agents_constants.PPG_CLEAN.NUM_AUX_ROLLOUTS].value
e_auxiliary = self.experiment_config.hparams[agents_constants.PPG_CLEAN.E_AUXILIARY].value
num_minibatches = self.experiment_config.hparams[agents_constants.PPG_CLEAN.NUM_MINIBATCHES].value
total_timesteps = self.experiment_config.hparams[agents_constants.PPG_CLEAN.TOTAL_STEPS].value
envs = gym.vector.SyncVectorEnv([self.make_env() for _ in range(1)])
# Setup training metrics
exp_result.all_metrics[seed] = {}
exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_RETURN] = []
exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_RETURN] = []
exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_TIME_HORIZON] = []
exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_TIME_HORIZON] = []
exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_UPPER_BOUND_RETURN] = []
exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_RANDOM_RETURN] = []
exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_HEURISTIC_RETURN] = []
exp_result.all_metrics[seed][agents_constants.COMMON.RUNTIME] = []
ExperimentUtil.set_seed(seed)
cuda = False
# Create neural network
device = torch.device(agents_constants.PPO_CLEAN.CUDA if torch.cuda.is_available() and cuda else
self.experiment_config.hparams[constants.NEURAL_NETWORKS.DEVICE].value)
num_hidden_layers = self.experiment_config.hparams[constants.NEURAL_NETWORKS.NUM_HIDDEN_LAYERS].value
hidden_layer_dim = self.experiment_config.hparams[constants.NEURAL_NETWORKS.NUM_NEURONS_PER_HIDDEN_LAYER].value
input_dim = np.array(envs.single_observation_space.shape).prod()
env: BaseEnv = self.orig_env
action_space: Discrete = env.action_space
action_dim = int(action_space.n)
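        # Actor-critic network; in addition to the policy and value heads it provides an auxiliary
        # value head (see get_pi_value_and_aux_value below), which is required for the PPG auxiliary phase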
agent = PPONetwork(input_dim=input_dim, output_dim_critic=1, output_dim_action=action_dim,
num_hidden_layers=num_hidden_layers, hidden_layer_dim=hidden_layer_dim).to(device)
optimizer = optim.Adam(agent.parameters(), lr=learning_rate, eps=1e-8)
# seeding
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
# Storage setup
obs = torch.zeros((num_steps, 1) + envs.single_observation_space.shape).to(device) # type: ignore
actions = torch.zeros((num_steps, 1) + envs.single_action_space.shape).to(device) # type: ignore
logprobs = torch.zeros((num_steps, 1)).to(device) # type: ignore
rewards = torch.zeros((num_steps, 1)).to(device) # type: ignore
horizons = []
info_returns = []
dones = torch.zeros((num_steps, 1)).to(device) # type: ignore
values = torch.zeros((num_steps, 1)).to(device) # type: ignore
aux_obs = torch.zeros((num_steps, aux_batch_rollouts) + envs.single_observation_space.shape, # type: ignore
dtype=torch.uint8) # type: ignore
aux_returns = torch.zeros((num_steps, aux_batch_rollouts))
# Training loop
global_step = 0
start_time = time.time()
next_obs = torch.Tensor(envs.reset()[0]).to(device)
next_done = torch.zeros(1).to(device)
batch_size = max(1, int(num_steps))
num_iterations = max(1, total_timesteps // batch_size)
minibatch_size = max(2, batch_size // num_minibatches)
num_phases = max(num_iterations // batch_size, 1)
aux_batch_rollouts = max(1, n_iteration)
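        # PPG (Cobbe et al., 2020) alternates between a policy phase, consisting of n_iteration
        # PPO-style updates of the actor-critic network, and an auxiliary phase that distills
        # value information into the policy via the auxiliary value head, constrained by a KL
        # term towards the pre-auxiliary policy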
for phase in range(1, num_phases + 1):
# POLICY PHASE
for update in range(1, n_iteration + 1):
# Annealing the rate if instructed to do so.
if anneal_lr:
frac = 1.0 - (update - 1.0) / num_iterations
lrnow = frac * learning_rate
optimizer.param_groups[0]["lr"] = lrnow
for step in range(0, num_steps):
                    global_step += 1  # one environment step per iteration since a single environment is used
obs[step] = next_obs
dones[step] = next_done
# action logic
with torch.no_grad():
action, logprob, _, value = agent.get_action_and_value(next_obs)
values[step] = value.flatten()
actions[step] = action
logprobs[step] = logprob
                    # step the vectorized environment and record episode statistics;
                    # gymnasium returns (obs, reward, terminated, truncated, info)
                    next_obs, reward, done, _, infos = envs.step(action.cpu().numpy())  # type: ignore
                    if done[0] and "final_info" in infos:
                        horizons.append(infos["final_info"][0][agents_constants.ENV_METRICS.TIME_HORIZON])
                        info_returns.append(infos["final_info"][0][agents_constants.ENV_METRICS.RETURN])
                    rewards[step] = torch.tensor(reward).to(device).view(-1)
                    next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
# bootstrap value if not done
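                # Generalized Advantage Estimation (GAE), computed backwards over the rollout:
                # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
                # A_t = delta_t + gamma * gae_lambda * (1 - done_{t+1}) * A_{t+1}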
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(num_steps)):
if t == num_steps - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
returns = advantages + values
# flatten the batch
b_obs = obs.reshape((-1,) + envs.single_observation_space.shape) # type: ignore
b_logprobs = logprobs.reshape(-1)
b_actions = actions.reshape((-1,) + envs.single_action_space.shape) # type: ignore
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
# PPG code does full batch advantage normalization
if adv_norm_fullbatch:
b_advantages = (b_advantages - b_advantages.mean()) / (b_advantages.std() + 1e-8)
# Optimizing the policy and value network
b_inds = np.arange(batch_size)
clipfracs = []
for epoch in range(e_policy):
np.random.shuffle(b_inds)
for start in range(0, batch_size, minibatch_size):
end = start + minibatch_size
mb_inds = b_inds[start:end]
_, newlogprob, entropy, newvalue = agent.get_action_and_value( # type: ignore
b_obs[mb_inds], b_actions.long()[mb_inds]) # type: ignore
logratio = newlogprob - b_logprobs[mb_inds] # type: ignore
ratio = logratio.exp()
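                        # Diagnostics: (ratio - 1) - log(ratio) is an unbiased estimator of
                        # KL(pi_old || pi_new); clipfracs tracks how often the ratio is clipped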
with torch.no_grad():
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > clip_coef).float().mean().item()]
mb_advantages = b_advantages[mb_inds] # type: ignore
# Policy loss
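                        # Clipped surrogate objective, minimized as
                        # E[max(-A * r, -A * clip(r, 1 - clip_coef, 1 + clip_coef))] with r = pi_new / pi_old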
pg_loss1 = -mb_advantages * ratio
pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
pg_loss = torch.max(pg_loss1, pg_loss2).mean()
# Value loss
newvalue = newvalue.view(-1)
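                        # Optionally clip the value update around the old value estimates,
                        # mirroring the policy-ratio clipping above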
if clip_vloss:
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2 # type: ignore
v_clipped = b_values[mb_inds] + torch.clamp( # type: ignore
newvalue - b_values[mb_inds], -clip_coef, clip_coef) # type: ignore
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2 # type: ignore
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) # type: ignore
v_loss = 0.5 * v_loss_max.mean() # type: ignore
else:
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean() # type: ignore
entropy_loss = entropy.mean()
loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef
optimizer.zero_grad()
loss.backward()
clip_grad.clip_grad_norm_(agent.parameters(), max_grad_norm)
optimizer.step()
if target_kl is not None and approx_kl > target_kl:
break
# PPG Storage - Rollouts are saved without flattening for sampling full rollouts later:
aux_obs = obs.cpu().clone().to(torch.uint8)
aux_returns = returns.cpu().clone()
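                # Note: rollout observations are cached as uint8 to save memory (following CleanRL's
                # Procgen setup) and cast back to float32 in the auxiliary phase; for continuous-valued
                # observations this cast is lossy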
# AUXILIARY PHASE
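            # The auxiliary phase minimizes aux_value_loss + beta_clone * KL(pi_old || pi_new) on the
            # cached rollouts, plus a regular value loss on the true value head, so that value
            # information is distilled into the policy network without moving the policy away from
            # its behavior at the end of the policy phase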
aux_inds = np.arange(aux_batch_rollouts)
# Build the old policy on the aux buffer before distilling to the network
aux_pi = torch.zeros((num_steps, aux_batch_rollouts, envs.single_action_space.n)) # type: ignore
for i, start in enumerate(range(0, aux_batch_rollouts, num_aux_rollouts)):
end = start + num_aux_rollouts
m_aux_obs = aux_obs[start:end].to(torch.float32).to(device)
if len(m_aux_obs) > 0:
with torch.no_grad():
pi_logits = agent.get_pi(m_aux_obs).logits.cpu().clone()
aux_pi[start:end] = pi_logits
del m_aux_obs
for auxiliary_update in range(1, e_auxiliary + 1):
np.random.shuffle(aux_inds)
for i, start in enumerate(range(0, aux_batch_rollouts, num_aux_rollouts)):
end = start + num_aux_rollouts
try:
m_aux_obs = aux_obs[start:end].to(torch.float32).to(device)
if len(m_aux_obs) == 0:
continue
m_aux_returns = aux_returns[start:end].to(torch.float32).to(device)
new_pi, new_values, new_aux_values = agent.get_pi_value_and_aux_value(m_aux_obs)
new_values = new_values.view(-1)
new_aux_values = new_aux_values.view(-1)
old_pi_logits = aux_pi[start:end].to(device)
old_pi = Categorical(logits=old_pi_logits)
kl_loss = torch.distributions.kl_divergence(old_pi, new_pi).mean()
real_value_loss = 0.5 * ((new_values - m_aux_returns) ** 2).mean()
aux_value_loss = 0.5 * ((new_aux_values - m_aux_returns) ** 2).mean()
joint_loss = aux_value_loss + beta_clone * kl_loss
loss = (joint_loss + real_value_loss) / n_aux_grad_accum
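                        # Gradients are accumulated over n_aux_grad_accum minibatches before an
                        # optimizer step to reduce peak GPU memory usage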
loss.backward()
if (i + 1) % n_aux_grad_accum == 0:
clip_grad.clip_grad_norm_(agent.parameters(), max_grad_norm)
optimizer.step()
optimizer.zero_grad() # This cannot be outside, else gradients won't accumulate
except RuntimeError as e:
                            raise Exception(
                                "Ran out of GPU memory in the auxiliary phase; try a larger value of the "
                                "NUM_AUX_GRAD_ACCUM hyperparameter, which trades more compute time for "
                                "less GPU memory") from e
del m_aux_obs, m_aux_returns
# Logging
time_elapsed_minutes = round((time.time() - start_time) / 60, 3)
exp_result.all_metrics[seed][agents_constants.COMMON.RUNTIME].append(time_elapsed_minutes)
            avg_R = round(float(np.mean(info_returns)), 3)
            exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_RETURN].append(avg_R)
            avg_T = round(float(np.mean(horizons)), 3)
            exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_TIME_HORIZON].append(avg_T)
running_avg_J = ExperimentUtil.running_average(
exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_RETURN],
self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value)
exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_RETURN].append(
round(running_avg_J, 3))
running_avg_T = ExperimentUtil.running_average(
exp_result.all_metrics[seed][agents_constants.COMMON.AVERAGE_TIME_HORIZON],
self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value)
exp_result.all_metrics[seed][agents_constants.COMMON.RUNNING_AVERAGE_TIME_HORIZON].append(
round(running_avg_T, 3))
Logger.__call__().get_logger().info(
f"[CleanPPG] Iteration: {phase}/{num_phases}, "
f"avg R: {avg_R}, "
f"R_avg_{self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value}:"
f"{running_avg_J}, Avg T:{round(avg_T, 3)}, "
f"Running_avg_{self.experiment_config.hparams[agents_constants.COMMON.RUNNING_AVERAGE].value}_T: "
f"{round(running_avg_T, 3)}, "
f"runtime: {time_elapsed_minutes} min")
envs.close()
base_env: BaseEnv = envs.envs[0].env.env.env # type: ignore
return exp_result, base_env, agent

    def make_env(self) -> Callable[[], RecordEpisodeStatistics[Any, Any]]:
"""
Helper function for creating the environment to use for training
:return: a function that creates the environment
"""
def thunk() -> RecordEpisodeStatistics[Any, Any]:
"""
Function for creating a new environment
:return: the created environment
"""
config = self.simulation_env_config.simulation_env_input_config
orig_env: BaseEnv = gym.make(self.simulation_env_config.gym_env_name, config=config)
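            # wrap the environment so that episodic returns and lengths are recorded in the info dict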
env = RecordEpisodeStatistics(orig_env)
return env
return thunk

    def hparam_names(self) -> List[str]:
"""
:return: a list with the hyperparameter names
"""
return [constants.NEURAL_NETWORKS.NUM_NEURONS_PER_HIDDEN_LAYER,
constants.NEURAL_NETWORKS.NUM_HIDDEN_LAYERS,
agents_constants.COMMON.NUM_PARALLEL_ENVS, agents_constants.COMMON.BATCH_SIZE,
agents_constants.COMMON.EVAL_EVERY, constants.NEURAL_NETWORKS.DEVICE,
agents_constants.COMMON.SAVE_EVERY, agents_constants.COMMON.NUM_TRAINING_TIMESTEPS,
agents_constants.PPG_CLEAN.TOTAL_STEPS, agents_constants.PPG_CLEAN.LEARNING_RATE,
agents_constants.PPG_CLEAN.NUM_STEPS, agents_constants.PPG_CLEAN.ANNEAL_LR,
agents_constants.PPG_CLEAN.GAMMA, agents_constants.PPG_CLEAN.GAE_LAMBDA,
agents_constants.PPG_CLEAN.NUM_MINIBATCHES, agents_constants.PPG_CLEAN.ADV_NORM_FULLBATCH,
agents_constants.PPG_CLEAN.CLIP_COEF, agents_constants.PPG_CLEAN.ENT_COEF,
agents_constants.PPG_CLEAN.VF_COEF, agents_constants.PPG_CLEAN.MAX_GRAD_NORM,
agents_constants.PPG_CLEAN.TARGET_KL, agents_constants.PPG_CLEAN.N_ITERATION,
agents_constants.PPG_CLEAN.E_POLICY, agents_constants.PPG_CLEAN.E_AUXILIARY,
agents_constants.PPG_CLEAN.BETA_CLONE, agents_constants.PPG_CLEAN.NUM_AUX_ROLLOUTS,
agents_constants.PPG_CLEAN.NUM_AUX_GRAD_ACCUM, agents_constants.PPG_CLEAN.BATCH_SIZE,
agents_constants.PPG_CLEAN.MINIBATCH_SIZE, agents_constants.PPG_CLEAN.NUM_ITERATIONS,
agents_constants.PPG_CLEAN.NUM_PHASES, agents_constants.PPG_CLEAN.AUX_BATCH_ROLLOUTS,
agents_constants.PPG_CLEAN.V_VALUE]