Commit 5a21cc68 authored by Andri Joos

Merge branch '19-refactor-start'

parents ccd73d55 c140a8ba
@@ -63,21 +63,11 @@ class StartEnvironment(BaseEnvironment):
         geo_point = state.gps_location
         height_delta = geo_point.altitude - self._desired_height
         normalized_height_delta = self._normalize_height_delta(height_delta)
-        # acceleration = self._client.getImuData().linear_acceleration
         velocity = state.kinematics_estimated.linear_velocity
         return np.array([normalized_height_delta, velocity.z_val], dtype=np.float64)

     def _calculate_reward(self, obs: NDArray) -> np.float32:
-        # if abs(obs[0]) < 0.2:
-        #     return np.float32(1)
-        # else:
-        #     return np.float32(0)
-        abs_relative_height = abs(obs[0])
-        reward = -abs_relative_height
-        # if abs_relative_height < 0.5:
-        #     reward += 1
-        return np.float32(reward)
+        return np.float32(abs(obs[0]))

     def _normalize_height_delta(self, height: float) -> float:
         return height
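For reference, a quick offline sketch of what the simplified observation and reward above produce. The altitude and velocity values are made up, and `_normalize_height_delta` is treated as the identity function it currently is:

```python
import numpy as np

# Hypothetical sim readings, not taken from the repository.
desired_height = 160.0
altitude, v_z = 172.5, -0.8

height_delta = altitude - desired_height   # 12.5
normalized_height_delta = height_delta     # _normalize_height_delta is currently the identity
obs = np.array([normalized_height_delta, v_z], dtype=np.float64)

reward = np.float32(abs(obs[0]))           # 12.5 -- the reward tracks the absolute height error
print(obs, reward)                         # [12.5 -0.8] 12.5
```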
-from envs import StartEnvironment, TestEnvironment
+from envs import StartEnvironment
 import os
 import train.utils as utils
 import numpy as np
@@ -9,36 +9,21 @@ from tf_agents.policies.random_py_policy import RandomPyPolicy
 SIM_IP = "192.168.8.195"

-# Use "num_iterations = 1e6" for better results (2 hrs)
-# 1e5 is just so this doesn't take too long (1 hr)
 num_episodes = 10000
 num_steps_per_episode = 2000
-initial_collect_steps = 2000 # @param {type:"integer"}
-collect_steps_per_iteration = 1 # @param {type:"integer"}
-replay_buffer_capacity = 100000 # @param {type:"integer"}
-replay_buffer_server="localhost:40000"
+replay_buffer_capacity = 100000

-batch_size = 1024 # @param {type:"integer"}
+batch_size = 1024

-critic_learning_rate = 3e-4 # @param {type:"number"}
-actor_learning_rate = 3e-4 # @param {type:"number"}
-alpha_learning_rate = 3e-4 # @param {type:"number"}
-target_update_tau = 0.005 # @param {type:"number"}
-target_update_period = 1 # @param {type:"number"}
-gamma = 0.9 # @param {type:"number"}
-reward_scale_factor = 1.0 # @param {type:"number"}
+critic_learning_rate = 3e-4
+actor_learning_rate = 3e-4
+alpha_learning_rate = 3e-4
+gamma = 0.9

 actor_fc_layer_params = (16,16)
 critic_joint_fc_layer_params = actor_fc_layer_params

-log_interval = 5000 # @param {type:"integer"}
-num_eval_episodes = 20 # @param {type:"integer"}
-eval_interval = 5000 # @param {type:"integer"}
-policy_save_interval = 1000 # @param {type:"integer"}

 tempdir = "out/"
 is_initial = utils.is_initial(tempdir)
@@ -47,19 +32,6 @@ utils.set_gpu()
 train_env = StartEnvironment(ip=SIM_IP, desired_height=160, dynamic_start=True)
 eval_env = StartEnvironment(ip=SIM_IP, desired_height=180, dynamic_start=False)
 train_tf_env = TFPyEnvironment(train_env)

-# py_env = TestEnvironment(ip=SIM_IP)
-# env = TFPyEnvironment(py_env)
-# env.reset()
-# action = np.array([0,0.5,0.5,0.5], dtype=np.float32)
-# init_action = np.array([0.5, 0.5, 0.5, 0])
-# for i in range(35):
-#     py_env.step(init_action)
-# while True:
-#     t = py_env.step(action)
-#     m = 0

 observation_spec = train_tf_env.observation_spec()
 action_spec = train_tf_env.action_spec()
@@ -67,9 +39,9 @@ time_step_spec = train_tf_env.time_step_spec()
 train_step = train_utils.create_train_step()

-tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma, reward_scale_factor=reward_scale_factor)
+tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma)

-reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_server, replay_buffer_capacity)
+reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_capacity)

 collect_policy = utils.create_policy(tf_agent.collect_policy)
 eval_policy = utils.create_policy(tf_agent.policy)
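`utils.create_policy` is not shown in this diff. In the standard TF-Agents SAC setup the TF policy is simply wrapped so the py-side Actors can drive it eagerly; a hedged guess at what the helper does (the `use_tf_function` flag is an assumption, not taken from this repository):

```python
from tf_agents.policies import py_tf_eager_policy

def create_policy(tf_policy):
    # Wrap a TF policy for use by py-environment Actors (sketch, not the repo's actual helper).
    return py_tf_eager_policy.PyTFEagerPolicy(tf_policy, use_tf_function=True)
```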
@@ -87,15 +59,8 @@ random_actor, collect_actor, eval_actor = utils.create_actors(
 checkpoint_dir = os.path.join(tempdir, 'checkpoint')
 train_checkpointer, policy_saver = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
-# reverb_replay = train_checkpointer.manager.checkpoint.replay_buffer # must be done manually, gets somehow not restored

 agent_learner = utils.create_learner(tf_agent, train_step, tempdir, num_steps_per_episode, reverb_replay, batch_size)

-# eval_env.reset()
-# eval_actor.run()
-# env.reset()
-# eval_actor.run()

 utils.train_loop(
     agent_learner,
     collect_actor,
@@ -108,6 +73,5 @@ utils.train_loop(
     train_step,
     num_episodes,
     num_steps_per_episode,
-    epsilon=0.2,
     is_initial=is_initial
 )
@@ -42,7 +42,7 @@ def create_agent(action_spec: Any,
                  target_update_tau: Float = 0.005,
                  target_update_period: Int = 1,
                  gamma: Float = 0.99,
-                 reward_scale_factor: Float = 0.8,
+                 reward_scale_factor: Float = 1.0,
                  ) -> SacAgent:

     critic_net = CriticNetwork(
         (observation_spec, action_spec),
@@ -74,7 +74,7 @@ def create_agent(action_spec: Any,
         reward_scale_factor=reward_scale_factor,
         train_step_counter=train_step)

-def create_replay_buffer(collect_data_spec: NestedTensorSpec, server_address: str, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
+def create_replay_buffer(collect_data_spec: NestedTensorSpec, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
     table_name = 'uniform_table'
     table = reverb.Table(
         table_name,
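The body of `create_replay_buffer` is cut off here, so the diff does not show how dropping `server_address` is handled. For context, this is how a Reverb buffer is typically hosted in-process with TF-Agents when no external server address is used; the sampler, remover, rate limiter and `sequence_length` below are assumptions, not taken from this repository:

```python
import reverb
from tf_agents.replay_buffers import reverb_replay_buffer, reverb_utils

def create_replay_buffer_sketch(collect_data_spec, replay_buffer_capacity, sequence_length=2):
    table_name = 'uniform_table'
    table = reverb.Table(
        table_name,
        max_size=replay_buffer_capacity,
        sampler=reverb.selectors.Uniform(),
        remover=reverb.selectors.Fifo(),
        rate_limiter=reverb.rate_limiters.MinSize(1))

    # Host the Reverb server in-process instead of connecting to a remote server_address.
    reverb_server = reverb.Server([table])

    reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
        collect_data_spec,
        sequence_length=sequence_length,
        table_name=table_name,
        local_server=reverb_server)

    rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
        reverb_replay.py_client,
        table_name,
        sequence_length=sequence_length,
        stride_length=1)

    return reverb_replay, rb_observer
```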
@@ -185,14 +185,11 @@ def train_loop(learner: Learner,
               train_steps_per_episode: int,
               initial_steps: int = 1000,
               initial_learning_iterations: int = 1,
-              episode_learning_iterations: int = 10,
-              epsilon: float = None,
+              eval_interval: int = 5,
               is_initial: bool = True,
               outdir: str = "out",
               ):
     policy_dir = os.path.join(outdir, "policy")
     policy_saver.save(policy_dir)

     for _ in range(initial_steps):
         if is_initial:
             random_actor.run()
@@ -211,6 +208,7 @@ def train_loop(learner: Learner,
         learner.run(iterations=1)

         checkpointer.save(train_step)
         policy_saver.save(policy_dir)

+        if i % eval_interval == 0:
+            print("######## Evaluation episode {} ########".format(i))