Hello World / Commits / 720cd3de

Commit 720cd3de (parent e12bcef0), authored 1 year ago by Andri Joos

Changes: main.py (259 additions, 6 deletions)
from HelloWorldEnv import HelloWorldEnv
from envs import HelloWorldEnv
from time import sleep
import tensorflow as tf
import reverb
import os
import numpy as np
from tf_agents.agents.sac.sac_agent import SacAgent
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.agents.ddpg.critic_network import CriticNetwork
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.agents.sac.tanh_normal_projection_network import TanhNormalProjectionNetwork
from tf_agents.train.utils import train_utils
from tf_agents.replay_buffers.reverb_replay_buffer import ReverbReplayBuffer
from tf_agents.policies.py_tf_eager_policy import PyTFEagerPolicy
from tf_agents.policies.random_py_policy import RandomPyPolicy
from tf_agents.replay_buffers.reverb_utils import ReverbAddTrajectoryObserver
from tf_agents.train.actor import Actor
from tf_agents.train import actor
from tf_agents.metrics.py_metrics import EnvironmentSteps
from tf_agents.train import learner
from tf_agents.train import triggers
from tf_agents.utils.common import Checkpointer
from tf_agents.policies.policy_saver import PolicySaver
from tf_agents.policies.tf_py_policy import TFPyPolicy
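# The next few lines are a quick manual smoke test: HelloWorldEnv is presumably
# a PyEnvironment that talks to an external simulator reachable at SIM_IP, so a
# reset and a couple of steps confirm the connection before the training setup.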
SIM_IP = "192.168.8.195"

env = HelloWorldEnv(ip=SIM_IP)
env.reset()
env.step(1)
sleep(2)
env.step(1)
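# Hyperparameters. The "@param {type:...}" annotations below look like Colab
# form fields carried over from the TF-Agents SAC tutorial this script seems to
# be based on; they have no effect when the file is run as a plain script.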
# Use "num_iterations = 1e6" for better results (2 hrs)
# 1e5 is just so this doesn't take too long (1 hr)
num_episodes = 10000
num_steps_per_episode = 1000

initial_collect_steps = 1000  # @param {type:"integer"}
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_capacity = 10000  # @param {type:"integer"}

batch_size = 256  # @param {type:"integer"}

critic_learning_rate = 3e-4  # @param {type:"number"}
actor_learning_rate = 3e-4  # @param {type:"number"}
alpha_learning_rate = 3e-4  # @param {type:"number"}
target_update_tau = 0.005  # @param {type:"number"}
target_update_period = 1  # @param {type:"number"}
gamma = 0.99  # @param {type:"number"}
reward_scale_factor = 0.8  # @param {type:"number"}

actor_fc_layer_params = (10,)
critic_joint_fc_layer_params = (10,)

log_interval = 5000  # @param {type:"integer"}

num_eval_episodes = 20  # @param {type:"integer"}
eval_interval = 10000  # @param {type:"integer"}

policy_save_interval = 1000  # @param {type:"integer"}

tempdir = "out/"
tempdir_manual = "{}manual/".format(tempdir)
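# Let TensorFlow grow GPU memory on demand instead of reserving all of it at
# process start, which avoids out-of-memory clashes with other GPU users.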
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

py_env = HelloWorldEnv(ip=SIM_IP)
env = TFPyEnvironment(py_env)

# env.reset()
# t = env.step(np.float32(-1))
# sleep(2)
# env.step(1)

observation_spec = env.observation_spec()
action_spec = env.action_spec()
time_step_spec = env.time_step_spec()
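# SAC needs a critic that scores (observation, action) pairs and a stochastic
# actor. The TanhNormalProjectionNetwork makes the actor output a tanh-squashed
# Gaussian, which keeps sampled actions inside the bounded action spec.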
critic_net = CriticNetwork(
    (observation_spec, action_spec),
    observation_fc_layer_params=None,
    action_fc_layer_params=None,
    joint_fc_layer_params=critic_joint_fc_layer_params,
    kernel_initializer='glorot_uniform',
    last_kernel_initializer='glorot_uniform')

actor_net = ActorDistributionNetwork(
    observation_spec,
    action_spec,
    fc_layer_params=actor_fc_layer_params,
    continuous_projection_net=(TanhNormalProjectionNetwork))

train_step = train_utils.create_train_step()

tf_agent = SacAgent(
    time_step_spec,
    action_spec,
    actor_network=actor_net,
    critic_network=critic_net,
    actor_optimizer=tf.keras.optimizers.Adam(learning_rate=actor_learning_rate),
    critic_optimizer=tf.keras.optimizers.Adam(learning_rate=critic_learning_rate),
    alpha_optimizer=tf.keras.optimizers.Adam(learning_rate=alpha_learning_rate),
    target_update_tau=target_update_tau,
    target_update_period=target_update_period,
    td_errors_loss_fn=tf.math.squared_difference,
    gamma=gamma,
    reward_scale_factor=reward_scale_factor,
    train_step_counter=train_step)
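# Replay storage runs in a local Reverb server. Note that the
# SampleToInsertRatio rate limiter created below is never passed to the table;
# the table only enforces MinSize(1), so sampling can begin as soon as a single
# trajectory has been written.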
rate_limiter = reverb.rate_limiters.SampleToInsertRatio(
    samples_per_insert=3.0, min_size_to_sample=3, error_buffer=3.0)

table_name = 'uniform_table'
table = reverb.Table(
    table_name,
    max_size=replay_buffer_capacity,
    sampler=reverb.selectors.Uniform(),
    remover=reverb.selectors.Fifo(),
    rate_limiter=reverb.rate_limiters.MinSize(1))

reverb_server = reverb.Server([table])

reverb_replay = ReverbReplayBuffer(
    tf_agent.collect_data_spec,
    sequence_length=2,
    table_name=table_name,
    local_server=reverb_server)
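# Three Python-side policies drive the environment: the agent's policy for
# evaluation, its stochastic collect_policy for gathering training data, and a
# random policy for seeding the replay buffer. PyTFEagerPolicy wraps the TF
# policies so the py_env-based Actors can call them (use_tf_function=True
# compiles the action step for speed). The ReverbAddTrajectoryObserver streams
# overlapping 2-step trajectories from the drivers into the replay table.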
tf_eval_policy = tf_agent.policy
eval_policy = PyTFEagerPolicy(tf_eval_policy, use_tf_function=True)

tf_collect_policy = tf_agent.collect_policy
collect_policy = PyTFEagerPolicy(tf_collect_policy, use_tf_function=True)

random_policy = RandomPyPolicy(time_step_spec, action_spec)

rb_observer = ReverbAddTrajectoryObserver(
    reverb_replay.py_client,
    table_name,
    sequence_length=2,
    stride_length=1)
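# Actors couple a policy with the Python environment. The initial collect actor
# fills the replay buffer with initial_collect_steps of random experience, the
# collect actor gathers one environment step per run() call during training,
# and the eval actor runs evaluation episodes and writes its metrics under
# out/eval.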
initial_collect_actor = Actor(
    py_env,
    random_policy,
    train_step,
    steps_per_run=initial_collect_steps,
    observers=[rb_observer])
initial_collect_actor.run()

env_step_metric = EnvironmentSteps()
collect_actor = Actor(
    py_env,
    collect_policy,
    train_step,
    steps_per_run=1,
    metrics=actor.collect_metrics(10),
    summary_dir=os.path.join(tempdir, learner.TRAIN_DIR),
    observers=[rb_observer, env_step_metric])

eval_actor = actor.Actor(
    py_env,
    eval_policy,
    train_step,
    episodes_per_run=num_eval_episodes,
    steps_per_run=2000,
    metrics=actor.eval_metrics(num_eval_episodes),
    summary_dir=os.path.join(tempdir, 'eval'),
)
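# The Learner samples batches of 2-step trajectories from Reverb and runs the
# SAC training step; its triggers export the policy as a SavedModel every
# policy_save_interval train steps and log training throughput every 1000
# steps.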
saved_model_dir = os.path.join(tempdir, learner.POLICY_SAVED_MODEL_DIR)

# Triggers to save the agent's policy checkpoints.
learning_triggers = [
    triggers.PolicySavedModelTrigger(
        saved_model_dir,
        tf_agent,
        train_step,
        interval=policy_save_interval),
    triggers.StepPerSecondLogTrigger(train_step, interval=1000),
]

dataset = reverb_replay.as_dataset(
    sample_batch_size=batch_size, num_steps=2).prefetch(50)
experience_dataset_fn = lambda: dataset

agent_learner = learner.Learner(
    tempdir,
    train_step,
    tf_agent,
    experience_dataset_fn,
    triggers=learning_triggers)
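# In addition to the Learner's policy exports, a manual Checkpointer tracks the
# agent, its policy, the replay buffer, and the train step under
# out/manual/checkpoint; initialize_or_restore() resumes from the newest
# checkpoint if one exists.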
checkpoint_dir = os.path.join(tempdir_manual, 'checkpoint')
train_checkpointer = Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=10,
    agent=tf_agent,
    policy=tf_agent.policy,
    replay_buffer=reverb_replay,
    global_step=train_step)
train_checkpointer.initialize_or_restore()
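# Evaluation helpers: get_eval_metrics() runs the eval actor and collects its
# metric results by name; log_eval_metrics() prints them for a given train
# step.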
def get_eval_metrics():
    eval_actor.run()
    results = {}
    for metric in eval_actor.metrics:
        results[metric.name] = metric.result()
    return results

# env.reset()
# metrics = get_eval_metrics()

def log_eval_metrics(step, metrics):
    eval_results = (', ').join(
        '{} = {:.6f}'.format(name, result) for name, result in metrics.items())
    print('step = {0}: {1}'.format(step, eval_results))
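# Training loop: after a single warm-up learner call, each episode collects
# num_steps_per_episode environment steps, runs a short burst of learner
# iterations, evaluates the current policy, records the average return, and
# writes a checkpoint. The Reverb observer and server are shut down at the end.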
# log_eval_metrics(0, metrics)

# Reset the train step
# tf_agent.train_step_counter.assign(0)

# env.reset()
# Evaluate the agent's policy once before training.
# avg_return = get_eval_metrics()["AverageReturn"]
returns = []

# env.reset()
# for _ in range(1000):
#     collect_actor.run()

agent_learner.run(iterations=1)

for i in range(num_episodes):
    env.reset()

    for _ in range(num_steps_per_episode):
        # Training.
        collect_actor.run()

    # Evaluating.
    step = agent_learner.train_step_numpy

    # if log_interval and step % log_interval == 0:
    #     print('step = {0}: loss = {1}'.format(step, loss_info.loss.numpy()))

    agent_learner.run(iterations=10)

    print("######## Evaluation episode {} ########".format(i))
    env.reset()
    metrics = get_eval_metrics()
    log_eval_metrics(step, metrics)
    returns.append(metrics["AverageReturn"])

    train_checkpointer.save(train_step)

rb_observer.close()
reverb_server.stop()