Commit 720cd3de authored by Andri Joos
main.py
parent e12bcef0
from envs import HelloWorldEnv
from time import sleep
import tensorflow as tf
import reverb
import os
import numpy as np
from tf_agents.agents.sac.sac_agent import SacAgent
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.agents.ddpg.critic_network import CriticNetwork
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.agents.sac.tanh_normal_projection_network import TanhNormalProjectionNetwork
from tf_agents.train.utils import train_utils
from tf_agents.replay_buffers.reverb_replay_buffer import ReverbReplayBuffer
from tf_agents.policies.py_tf_eager_policy import PyTFEagerPolicy
from tf_agents.policies.random_py_policy import RandomPyPolicy
from tf_agents.replay_buffers.reverb_utils import ReverbAddTrajectoryObserver
from tf_agents.train.actor import Actor
from tf_agents.train import actor
from tf_agents.metrics.py_metrics import EnvironmentSteps
from tf_agents.train import learner
from tf_agents.train import triggers
from tf_agents.utils.common import Checkpointer
from tf_agents.policies.policy_saver import PolicySaver
from tf_agents.policies.tf_py_policy import TFPyPolicy
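# Simulator address and a quick manual smoke test: reset the environment and issue two steps.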
SIM_IP = "192.168.8.195"
env = HelloWorldEnv(ip=SIM_IP)
env.reset()
env.step(1)
sleep(2)
env.step(1)
# Use "num_iterations = 1e6" for better results (2 hrs)
# 1e5 is just so this doesn't take too long (1 hr)
num_episodes = 10000
num_steps_per_episode = 1000
initial_collect_steps = 1000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
replay_buffer_capacity = 10000 # @param {type:"integer"}
batch_size = 256 # @param {type:"integer"}
critic_learning_rate = 3e-4 # @param {type:"number"}
actor_learning_rate = 3e-4 # @param {type:"number"}
alpha_learning_rate = 3e-4 # @param {type:"number"}
target_update_tau = 0.005 # @param {type:"number"}
target_update_period = 1 # @param {type:"number"}
gamma = 0.99 # @param {type:"number"}
reward_scale_factor = 0.8 # @param {type:"number"}
actor_fc_layer_params = (10,)
critic_joint_fc_layer_params = (10,)
log_interval = 5000 # @param {type:"integer"}
num_eval_episodes = 20 # @param {type:"integer"}
eval_interval = 10000 # @param {type:"integer"}
policy_save_interval = 1000 # @param {type:"integer"}
tempdir = "out/"
tempdir_manual = "{}manual/".format(tempdir)
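# Enable memory growth on all visible GPUs so TensorFlow allocates GPU memory on demand.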
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
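# Recreate the environment and wrap it in a TFPyEnvironment for the TF-Agents APIs.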
py_env = HelloWorldEnv(ip=SIM_IP)
env = TFPyEnvironment(py_env)
# env.reset()
# t = env.step(np.float32(-1))
# sleep(2)
# env.step(1)
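# Specs describing observations, actions, and time steps, used to build the networks and agent.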
observation_spec = env.observation_spec()
action_spec = env.action_spec()
time_step_spec = env.time_step_spec()
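# Critic network estimating Q(observation, action) with the joint fully connected layers configured above.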
critic_net = CriticNetwork(
    (observation_spec, action_spec),
    observation_fc_layer_params=None,
    action_fc_layer_params=None,
    joint_fc_layer_params=critic_joint_fc_layer_params,
    kernel_initializer='glorot_uniform',
    last_kernel_initializer='glorot_uniform')
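# Actor network that outputs a tanh-squashed normal distribution over actions.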
actor_net = ActorDistributionNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=actor_fc_layer_params,
    continuous_projection_net=TanhNormalProjectionNetwork)
train_step = train_utils.create_train_step()
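# SAC agent combining the actor and critic with Adam optimizers, soft target updates, and a squared-TD-error loss.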
tf_agent = SacAgent(
    time_step_spec,
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.keras.optimizers.Adam(learning_rate=actor_learning_rate),
    critic_optimizer=tf.keras.optimizers.Adam(learning_rate=critic_learning_rate),
    alpha_optimizer=tf.keras.optimizers.Adam(learning_rate=alpha_learning_rate),
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.math.squared_difference,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    train_step_counter=train_step)
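# Reverb replay buffer: a uniform-sampling, FIFO-evicting table served by a local Reverb server.
# Note: the SampleToInsertRatio limiter below is created but not passed to the table, which uses MinSize(1).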
rate_limiter = reverb.rate_limiters.SampleToInsertRatio(samples_per_insert=3.0, min_size_to_sample=3, error_buffer=3.0)
table_name = 'uniform_table'
table = reverb.Table(
    table_name,
    max_size=replay_buffer_capacity,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1))
reverb_server = reverb.Server([table])
reverb_replay = ReverbReplayBuffer(
    tf_agent.collect_data_spec,
    sequence_length=2,
    table_name=table_name,
    local_server=reverb_server)
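# Eager Python wrappers around the agent's TF policies, plus a random policy for the initial collection phase.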
tf_eval_policy = tf_agent.policy
eval_policy = PyTFEagerPolicy(tf_eval_policy, use_tf_function=True)
tf_collect_policy = tf_agent.collect_policy
collect_policy = PyTFEagerPolicy(tf_collect_policy, use_tf_function=True)
random_policy = RandomPyPolicy(time_step_spec, action_spec)
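# Observer that writes collected trajectories into the Reverb table.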
rb_observer = ReverbAddTrajectoryObserver(
    reverb_replay.py_client,
    table_name,
    sequence_length=2,
    stride_length=1)
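# Actor that seeds the replay buffer with random experience before training starts.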
initial_collect_actor = Actor(
    py_env,
    random_policy,
    train_step,
    steps_per_run=initial_collect_steps,
    observers=[rb_observer])
initial_collect_actor.run()
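# Environment-step metric and the collect actor that gathers one environment step per run() call.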
env_step_metric = EnvironmentSteps()
collect_actor = Actor(
    py_env,
    collect_policy,
    train_step,
    steps_per_run=1,
    metrics=actor.collect_metrics(10),
    summary_dir=os.path.join(tempdir, learner.TRAIN_DIR),
    observers=[rb_observer, env_step_metric])
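# Eval actor: runs the evaluation policy for num_eval_episodes episodes (capped at 2000 steps per run) and records eval metrics.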
eval_actor = actor.Actor(
    py_env,
    eval_policy,
    train_step,
    episodes_per_run=num_eval_episodes,
    steps_per_run=2000,
    metrics=actor.eval_metrics(num_eval_episodes),
    summary_dir=os.path.join(tempdir, 'eval'),
)
saved_model_dir = os.path.join(tempdir, learner.POLICY_SAVED_MODEL_DIR)
# Triggers to save the agent's policy checkpoints.
learning_triggers = [
    triggers.PolicySavedModelTrigger(
        saved_model_dir,
        tf_agent,
        train_step,
        interval=policy_save_interval),
    triggers.StepPerSecondLogTrigger(train_step, interval=1000),
]
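# Dataset of batched, length-2 trajectory windows sampled from Reverb for training.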
dataset = reverb_replay.as_dataset(
    sample_batch_size=batch_size, num_steps=2).prefetch(50)
experience_dataset_fn = lambda: dataset
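# Learner that applies the agent's gradient updates and fires the saving/logging triggers above.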
agent_learner = learner.Learner(
    tempdir,
    train_step,
    tf_agent,
    experience_dataset_fn,
    triggers=learning_triggers)
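# Checkpointer for manually saving and restoring the agent, policy, replay buffer, and train step.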
checkpoint_dir = os.path.join(tempdir_manual, 'checkpoint')
train_checkpointer = Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=10,
    agent=tf_agent,
    policy=tf_agent.policy,
    replay_buffer=reverb_replay,
    global_step=train_step)
train_checkpointer.initialize_or_restore()
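# Run the eval actor and gather its metric results into a dict.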
def get_eval_metrics():
    eval_actor.run()
    results = {}
    for metric in eval_actor.metrics:
        results[metric.name] = metric.result()
    return results
# env.reset()
# metrics = get_eval_metrics()
def log_eval_metrics(step, metrics):
    eval_results = ', '.join(
        '{} = {:.6f}'.format(name, result) for name, result in metrics.items())
    print('step = {0}: {1}'.format(step, eval_results))
# log_eval_metrics(0, metrics)
# Reset the train step
# tf_agent.train_step_counter.assign(0)
# env.reset()
# Evaluate the agent's policy once before training.
# avg_return = get_eval_metrics()["AverageReturn"]
returns = []
# env.reset()
# for _ in range(1000):
# collect_actor.run()
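# Run a single learner iteration before entering the main loop.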
agent_learner.run(iterations=1)
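# Main loop: each episode collects environment steps and trains the learner, then evaluates and checkpoints.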
for i in range(num_episodes):
    env.reset()
    for _ in range(num_steps_per_episode):
        # Training.
        collect_actor.run()
        # Evaluating.
        step = agent_learner.train_step_numpy
        # if log_interval and step % log_interval == 0:
        #     print('step = {0}: loss = {1}'.format(step, loss_info.loss.numpy()))
        agent_learner.run(iterations=10)
    print("######## Evaluation episode {} ########".format(i))
    env.reset()
    metrics = get_eval_metrics()
    log_eval_metrics(step, metrics)
    returns.append(metrics["AverageReturn"])
    train_checkpointer.save(train_step)
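# Tear down the Reverb observer and server.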
rb_observer.close()
reverb_server.stop()