Commit ccd73d55 authored by Andri Joos

Merge branch '18-start-land-policy' into 'master'

Resolve "Start & Land policy"

Closes #18

See merge request !2
parents 1b0e7c68 2cf5a87c
@@ -10,7 +10,7 @@
"request": "launch",
"program": "train/start_training.py",
"console": "integratedTerminal",
"justMyCode": false
"justMyCode": true
}
]
}
\ No newline at end of file
@@ -2,6 +2,8 @@ import numpy as np
from airsim import MultirotorClient
from envs import BaseEnvironment
import abc
import math
from msgpackrpc.error import TimeoutError
from numpy.typing import NDArray
@@ -11,9 +13,21 @@ from tf_agents.typing.types import TimeStep, NestedArray, Text
from tf_agents.specs.array_spec import BoundedArraySpec, ArraySpec
class BaseEnvironment(PyEnvironment):
min_throttle = 0.4
max_throttle = 1.0
max_pitch = math.pi / 4
min_pitch = -max_pitch
max_roll = math.pi / 4
min_roll = -max_roll
max_yaw = math.pi / 2
min_yaw = -max_yaw
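# the bounds above define the physical ranges into which normalized [0, 1] actions are rescaled before being sent to AirSim (see the _convert_*_for_airsim helpers below)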
def __init__(self, ip: str = None, client: MultirotorClient = None, handle_auto_reset: bool = False):
self._ip = ip
if client is None:
self._client = MultirotorClient(ip=ip)
self._create_client()
else:
self._client = client
@@ -36,12 +50,16 @@ class BaseEnvironment(PyEnvironment):
if self._must_restart():
return self.reset()
vz = self._convert_z_velocity_for_airsim(action[0].item())
vx = action[1].item()
vy = action[2].item()
throttle = self._convert_throttle_for_airsim(action[0].item())
pitch = self._convert_pitch_for_airsim(action[1].item())
roll = self._convert_roll_for_airsim(action[2].item())
yaw = self._convert_yaw_for_airsim(action[3].item())
# vx = self._convert_xy_velocity_for_airsim(action[1].item())
# vy = self._convert_xy_velocity_for_airsim(action[2].item())
# self._client.cancelLastTask() # Maybe needed when using async commands
self._client.moveByVelocityAsync(vx, vy, vz, 0.0001).join() # execute command for 0.0001 seconds
# self._client.moveByVelocityBodyFrameAsync(vx, vy, vz, 0.01).join()
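# AirSim's moveByRollPitchYawrateThrottleAsync takes (roll, pitch, yaw_rate, throttle, duration), so the converted yaw below is applied as a yaw rate over the 0.01 s command duration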
self._client.moveByRollPitchYawrateThrottleAsync(roll, pitch, yaw, throttle, 0.01).join()
obs = self._get_observation()
reward = self._calculate_reward(obs)
@@ -54,23 +72,90 @@ class BaseEnvironment(PyEnvironment):
return ts.transition(obs, reward=reward)
def _convert_z_velocity_for_airsim(self, z_velocity: np.float64) -> np.float64:
"""
returns converted velocity:
- 1: max down
- 0: stop/hover
- -1: max up
"""
return z_velocity * (-2) + 1
def _create_client(self):
self._client = MultirotorClient(ip=self._ip, timeout_value=10)
def _convert_z_velocity_for_model(self, z_velocity: np.float64) -> np.float64:
"""
returns converted velocity:
- 0: max down
- 0.5: stop/hover
- 1: max up
"""
return (z_velocity - 1) / (-2)
# def _call_until_no_timeout(self, f, *args, **kwargs):
# succeeded = False
# has_timeout = False
# return_val = None
# while not succeeded:
# try:
# if not has_timeout:
# return_val = f(*args, **kwargs)
# else:
# self._create_client()
# self._reset()
# return_val = f(*args, **kwargs)
# succeeded = True
# except TimeoutError:
# print("Timeout error, check simulator")
# has_timeout = True
# return return_val
@staticmethod
def _convert_property_for_airsim(min: float, max: float, property: np.float32) -> float:
return (max - min) * property + min
def _convert_throttle_for_airsim(self, throttle: np.float32) -> float:
return BaseEnvironment._convert_property_for_airsim(self.min_throttle, self.max_throttle, throttle)
def _convert_pitch_for_airsim(self, pitch: np.float32) -> float:
return BaseEnvironment._convert_property_for_airsim(self.min_pitch, self.max_pitch, pitch)
def _convert_roll_for_airsim(self, roll: np.float32) -> float:
return BaseEnvironment._convert_property_for_airsim(self.min_roll, self.max_roll, roll)
def _convert_yaw_for_airsim(self, yaw: np.float32) -> float:
return BaseEnvironment._convert_property_for_airsim(self.min_yaw, self.max_yaw, yaw)
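# Illustrative sketch (not part of this commit): all four helpers above share
# one affine rescaling from a normalized action in [0, 1] onto a physical
# range [minimum, maximum]; the inverse below is hypothetical and only shown
# for reference.
def _example_rescale(minimum: float, maximum: float, value: float) -> float:
    # map a normalized value in [0, 1] onto [minimum, maximum]
    return (maximum - minimum) * value + minimum

def _example_unscale(minimum: float, maximum: float, value: float) -> float:
    # hypothetical inverse: map a physical value back into [0, 1]
    return (value - minimum) / (maximum - minimum)

# e.g. with the pitch bounds above (+/- pi/4), action 0.5 maps to 0.0 rad
assert abs(_example_rescale(-math.pi / 4, math.pi / 4, 0.5)) < 1e-12
assert abs(_example_unscale(-math.pi / 4, math.pi / 4, 0.0) - 0.5) < 1e-12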
# def _convert_z_velocity_for_airsim(self, z_velocity: np.float64) -> np.float64:
# """
# returns converted velocity:
# - 1: max down
# - 0: stop/hover
# - -1: max up
# """
# return z_velocity * (-2) + 1
# def _convert_z_velocity_for_model(self, z_velocity: np.float64) -> np.float64:
# """
# returns converted velocity:
# - 0: max down
# - 0.5: stop/hover
# - 1: max up
# """
# return (z_velocity - 1) / (-2)
# def _convert_xy_velocity_for_airsim(self, velocity: np.float64) -> np.float64:
# """
# input
# - 0: max negative speed
# - 0.5: no speed
# - 1: max speed
# returns
# - 1: max speed
# - 0: no speed
# - -1: max negative speed
# """
# return velocity * 2 - 1
# def _convert_xy_velocity_for_model(self, velocity: np.float64) -> np.float64:
# """
# input
# - 1: max speed
# - 0: no speed
# - -1: max negative speed
# returns
# - 0: max negative speed
# - 0.5: no speed
# - 1: max speed
# """
# return (velocity - 1) / 2
def render(self, mode: Text = 'rgb_array') -> NestedArray | None:
raise NotImplementedError()
@@ -2,6 +2,7 @@ from airsim import MultirotorClient
from numpy.typing import NDArray
from envs import BaseEnvironment, StartEnvironment
import numpy as np
from tf_agents.trajectories import time_step as ts
from tf_agents.specs.array_spec import BoundedArraySpec, ArraySpec
from tf_agents.typing.types import TimeStep, NestedArray
@@ -12,13 +13,15 @@ class StartEnvironment(BaseEnvironment):
client: MultirotorClient = None,
handle_auto_reset: bool = False,
desired_height: np.float64=140.0,
check_out_of_bounds: bool = False):
check_out_of_bounds: bool = False,
dynamic_start: bool = False):
self._desired_height = desired_height
self._check_out_of_bounds = check_out_of_bounds
self._is_out_of_bounds = False
self._dynamic_start = dynamic_start
self._action_spec = BoundedArraySpec(shape=(1,), dtype=np.float32, minimum=0, maximum=1, name='action')
self._observation_spec = ArraySpec(shape=(1,), dtype=np.float64, name='observation')
self._observation_spec = ArraySpec(shape=(2,), dtype=np.float64, name='observation')
super(StartEnvironment, self).__init__(ip, client, handle_auto_reset)
@@ -30,11 +33,20 @@ class StartEnvironment(BaseEnvironment):
def _reset(self) -> TimeStep:
self._is_out_of_bounds = False
return super()._reset()
time_step = super()._reset()
if self._dynamic_start:
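# dynamic start: sample the initial altitude uniformly from [122.2, 182.2) m and fly there before the episode begins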
altitude = np.random.random_sample() * 60 + 122.2
position = self._client.getMultirotorState().gps_location
self._client.moveToGPSAsync(latitude=position.latitude, longitude=position.longitude, altitude=altitude, velocity=10).join()
time_step = ts.restart(self._get_observation())
return time_step
def _step(self, action: NestedArray) -> TimeStep:
full_action = np.append(action, [0, 0])
return super()._step(full_action)
full_action = np.append(action, [0.5, 0.5, 0.5])
ts = super()._step(full_action)
print("Current height delta: {}".format(ts.observation[0]))
return ts
def _is_terminated(self, obs: NDArray) -> bool:
return False
@@ -46,9 +58,26 @@ class StartEnvironment(BaseEnvironment):
def _must_restart(self) -> bool:
return self._is_out_of_bounds
def _get_observation(self) -> NDArray:
geo_point = self._client.getMultirotorState().gps_location
return np.array([geo_point.altitude - self._desired_height])
def _get_observation(self) -> NDArray:
state = self._client.getMultirotorState()
geo_point = state.gps_location
height_delta = geo_point.altitude - self._desired_height
normalized_height_delta = self._normalize_height_delta(height_delta)
# acceleration = self._client.getImuData().linear_acceleration
velocity = state.kinematics_estimated.linear_velocity
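# observation: [normalized height delta, vertical velocity]; AirSim reports kinematics in a NED frame, so a positive z velocity points downward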
return np.array([normalized_height_delta, velocity.z_val], dtype=np.float64)
def _calculate_reward(self, obs: NDArray) -> np.float32:
return np.float32(-(obs[0] ** 2))
# if abs(obs[0]) < 0.2:
# return np.float32(1)
# else:
# return np.float32(0)
abs_relative_height = abs(obs[0])
reward = -abs_relative_height
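# reward is the negative absolute height error: 0 when the drone is exactly at the desired height, decreasing linearly with the deviation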
# if abs_relative_height < 0.5:
# reward += 1
return np.float32(reward)
def _normalize_height_delta(self, height: float) -> float:
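# currently an identity mapping: the height delta is returned unchanged (a possible hook for scaling it later)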
return height
No preview for this file type
@@ -13,6 +13,7 @@ from tf_agents.networks.actor_distribution_network import ActorDistributionNetwo
from tf_agents.agents.sac.tanh_normal_projection_network import TanhNormalProjectionNetwork
from tf_agents.train.utils import train_utils
from tf_agents.policies.py_tf_eager_policy import PyTFEagerPolicy
from tf_agents.policies.greedy_policy import GreedyPolicy
SIM_IP = "192.168.8.195"
actor_fc_layer_params = (10,)
@@ -29,7 +30,7 @@ tempdir="out/"
client = airsim.MultirotorClient(SIM_IP)
eval_policy = tf.saved_model.load("out/policies/greedy_policy")
eval_policy = tf.saved_model.load("out/policy")
def _step_loop(env, steps: int, initial_time_step):
time_step = initial_time_step
@@ -55,8 +56,9 @@ def _assert_last_time_steps(time_steps: []):
last_observations = np.array(last_observations).flatten()
for ts in last_steps:
height_difference = ts.observation.numpy().flatten()
height_difference = ts.observation.numpy().flatten()[0]
assert(abs(height_difference) < 1.0)
assert(abs(last_observations.max() - last_observations.min()) < 1.0)
def test130():
@@ -93,3 +95,9 @@ def test_landing():
time_steps = _step_loop(env, 2000, time_step)
_assert_last_time_steps(time_steps)
test130()
test150()
test180()
test150from180()
test_landing()
from envs import StartEnvironment
from envs import StartEnvironment, TestEnvironment
import os
import train.utils as utils
import numpy as np
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.train.utils import train_utils
@@ -11,55 +12,72 @@ SIM_IP = "192.168.8.195"
# Use "num_iterations = 1e6" for better results (2 hrs)
# 1e5 is just so this doesn't take too long (1 hr)
num_episodes = 10000
num_steps_per_episode = 1000
num_steps_per_episode = 2000
initial_collect_steps = 1000 # @param {type:"integer"}
initial_collect_steps = 2000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
replay_buffer_capacity = 10000 # @param {type:"integer"}
replay_buffer_capacity = 100000 # @param {type:"integer"}
replay_buffer_server="localhost:40000"
batch_size = 256 # @param {type:"integer"}
batch_size = 1024 # @param {type:"integer"}
critic_learning_rate = 3e-4 # @param {type:"number"}
actor_learning_rate = 3e-4 # @param {type:"number"}
alpha_learning_rate = 3e-4 # @param {type:"number"}
target_update_tau = 0.005 # @param {type:"number"}
target_update_period = 1 # @param {type:"number"}
gamma = 0.99 # @param {type:"number"}
reward_scale_factor = 0.8 # @param {type:"number"}
gamma = 0.9 # @param {type:"number"}
reward_scale_factor = 1.0 # @param {type:"number"}
actor_fc_layer_params = (10,)
critic_joint_fc_layer_params = (10,)
actor_fc_layer_params = (16,16)
critic_joint_fc_layer_params = actor_fc_layer_params
log_interval = 5000 # @param {type:"integer"}
num_eval_episodes = 20 # @param {type:"integer"}
eval_interval = 10000 # @param {type:"integer"}
eval_interval = 5000 # @param {type:"integer"}
policy_save_interval = 1000 # @param {type:"integer"}
tempdir = "out/"
is_initial = utils.is_initial(tempdir)
utils.set_gpu()
py_env = StartEnvironment(ip=SIM_IP)
env = TFPyEnvironment(py_env)
train_env = StartEnvironment(ip=SIM_IP, desired_height=160, dynamic_start=True)
eval_env = StartEnvironment(ip=SIM_IP, desired_height=180, dynamic_start=False)
train_tf_env = TFPyEnvironment(train_env)
# py_env = TestEnvironment(ip=SIM_IP)
# env = TFPyEnvironment(py_env)
# env.reset()
observation_spec = env.observation_spec()
action_spec = env.action_spec()
time_step_spec = env.time_step_spec()
# action = np.array([0,0.5,0.5,0.5], dtype=np.float32)
# init_action = np.array([0.5, 0.5, 0.5, 0])
# for i in range(35):
# py_env.step(init_action)
# while True:
# t = py_env.step(action)
# m = 0
observation_spec = train_tf_env.observation_spec()
action_spec = train_tf_env.action_spec()
time_step_spec = train_tf_env.time_step_spec()
train_step = train_utils.create_train_step()
tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params)
tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma, reward_scale_factor=reward_scale_factor)
reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_capacity)
reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_server, replay_buffer_capacity)
eval_policy = utils.create_policy(tf_agent.policy)
collect_policy = utils.create_policy(tf_agent.collect_policy)
eval_policy = utils.create_policy(tf_agent.policy)
random_policy = RandomPyPolicy(time_step_spec, action_spec)
initial_collect_actor, collect_actor, eval_actor = utils.create_actors(
py_env,
random_actor, collect_actor, eval_actor = utils.create_actors(
train_env,
eval_env,
random_policy,
collect_policy,
eval_policy,
@@ -67,20 +85,29 @@ initial_collect_actor, collect_actor, eval_actor = utils.create_actors(
rb_observer,
)
initial_collect_actor.run()
checkpoint_dir = os.path.join(tempdir, 'checkpoint')
train_checkpointer, policy_saver = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
# reverb_replay = train_checkpointer.manager.checkpoint.replay_buffer # must be done manually, gets somehow not restored
agent_learner = utils.create_learner(tf_agent, train_step, tempdir, num_steps_per_episode, reverb_replay, batch_size)
checkpoint_dir = os.path.join(tempdir, 'checkpoint')
train_checkpointer = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
# eval_env.reset()
# eval_actor.run()
# env.reset()
# eval_actor.run()
utils.train_loop(
agent_learner,
collect_actor,
eval_actor,
env,
random_actor,
train_env,
eval_env,
train_checkpointer,
policy_saver,
train_step,
num_episodes,
num_steps_per_episode,
epsilon=0.2,
is_initial=is_initial
)
@@ -2,6 +2,7 @@ from typing import Any, Tuple
import tensorflow as tf
import reverb
import os
import numpy as np
from tf_agents.typing.types import Float, Int, NestedTensorSpec
from tf_agents.agents.ddpg.critic_network import CriticNetwork
@@ -20,9 +21,8 @@ from tf_agents.metrics.py_metrics import EnvironmentSteps
from tf_agents.train import actor
from tf_agents.train import learner
from tf_agents.train.learner import Learner
from tf_agents.train.triggers import PolicySavedModelTrigger
from tf_agents.utils.common import Checkpointer
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.policies import PolicySaver
def set_gpu():
@@ -74,17 +74,16 @@ def create_agent(action_spec: Any,
reward_scale_factor=reward_scale_factor,
train_step_counter=train_step)
def create_replay_buffer(collect_data_spec: NestedTensorSpec, capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
def create_replay_buffer(collect_data_spec: NestedTensorSpec, server_address: str, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
table_name = 'uniform_table'
table = reverb.Table(
table_name,
max_size=capacity,
max_size=replay_buffer_capacity,
sampler=reverb.selectors.Uniform(),
remover=reverb.selectors.Fifo(),
rate_limiter=reverb.rate_limiters.MinSize(1))
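# MinSize(1): sampling from the table may begin as soon as it holds at least one item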
reverb_server = reverb.Server([table])
replay_buffer = ReverbReplayBuffer(
collect_data_spec,
sequence_length=2,
@@ -102,27 +101,30 @@ def create_replay_buffer(collect_data_spec: NestedTensorSpec, capacity: int) ->
def create_policy(policy: TFPolicy) -> PyTFEagerPolicy:
return PyTFEagerPolicy(policy, use_tf_function=True)
def create_actors(py_env: PyEnvironment,
# def create_eval_policy(policy: TFPolicy) -> PyTFEagerPolicy:
# return PyTFEagerPolicy(GreedyPolicy(policy), use_tf_function=True)
def create_actors(train_env: PyEnvironment,
eval_env: PyEnvironment,
random_policy: PyPolicy,
collect_policy: PyPolicy,
eval_policy: PyPolicy,
train_step: tf.Variable,
rb_observer: ReverbAddTrajectoryObserver,
initial_collect_steps: int = 1000,
eval_episodes_per_run: int = 20,
eval_steps_per_run: int = 2000,
tempdir: str = "out/"
) -> Tuple[Actor, Actor, Actor]:
initial_collect_actor = Actor(
py_env,
random_actor = Actor(
train_env,
random_policy,
train_step,
steps_per_run=initial_collect_steps,
steps_per_run=1,
observers=[rb_observer])
env_step_metric = EnvironmentSteps()
collect_actor = Actor(
py_env,
train_env,
collect_policy,
train_step,
steps_per_run=1,
@@ -131,7 +133,7 @@ def create_actors(py_env: PyEnvironment,
observers=[rb_observer, env_step_metric])
eval_actor = Actor(
py_env,
eval_env,
eval_policy,
train_step,
episodes_per_run=eval_episodes_per_run,
@@ -139,7 +141,7 @@ def create_actors(py_env: PyEnvironment,
metrics=actor.eval_metrics(eval_episodes_per_run),
)
return initial_collect_actor, collect_actor, eval_actor
return random_actor, collect_actor, eval_actor
def create_learner(agent: TFAgent, train_step: tf.Variable, tempdir: str, steps_per_episode: int, replay_buffer: ReverbReplayBuffer, batch_size: int) -> Learner:
dataset = replay_buffer.as_dataset(
@@ -147,20 +149,12 @@ def create_learner(agent: TFAgent, train_step: tf.Variable, tempdir: str, steps_
experience_dataset_fn = lambda: dataset
saved_model_dir = os.path.join(tempdir, learner.POLICY_SAVED_MODEL_DIR)
learning_triggers = [
PolicySavedModelTrigger(
saved_model_dir,
agent,
train_step,
interval=steps_per_episode),
]
return Learner(
tempdir,
train_step,
agent,
experience_dataset_fn,
triggers=learning_triggers)
experience_dataset_fn)
def create_checkpoint(tempdir: str, agent: TFAgent, replay_buffer: ReverbReplayBuffer, train_step: tf.Variable) -> Checkpointer:
checkpoint_dir = os.path.join(tempdir, 'checkpoint')
@@ -174,30 +168,54 @@ def create_checkpoint(tempdir: str, agent: TFAgent, replay_buffer: ReverbReplayB
)
checkpointer.initialize_or_restore()
return checkpointer
policy_saver = PolicySaver(agent.policy)
return checkpointer, policy_saver
def train_loop(learner: Learner,
collect_actor: Actor,
eval_actor: Actor,
env: TFPyEnvironment,
checkpointer: Checkpointer,
random_actor: Actor,
train_env: PyEnvironment,
eval_env: PyEnvironment,
checkpointer: Checkpointer,
policy_saver: PolicySaver,
train_step: tf.Variable,
train_episodes: int,
train_steps_per_episode: int,
initial_steps: int = 1000,
initial_learning_iterations: int = 1,
episode_learning_iterations: int = 10,
epsilon: float = None,
eval_interval: int = 5,
is_initial: bool = True,
outdir: str = "out",
):
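# note: epsilon is accepted but not used in the loop body shown here; the commented-out exploration branch below refers to exploration_interval instead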
policy_dir = os.path.join(outdir, "policy")
policy_saver.save(policy_dir)
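# the policy SavedModel is exported to out/policy before training starts; the evaluation script loads this same path via tf.saved_model.load("out/policy")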
for _ in range(initial_steps):
if is_initial:
random_actor.run()
else:
collect_actor.run()
learner.run(iterations=initial_learning_iterations)
for i in range(train_episodes):
env.reset()
train_env.reset()
for _ in range(train_steps_per_episode):
for j in range(train_steps_per_episode):
# if exploration_interval is not None and np.random.randint(0, exploration_interval) == 0:
# random_actor.run()
# else:
collect_actor.run()
learner.run(iterations=episode_learning_iterations)
learner.run(iterations=1)
checkpointer.save(train_step)
print("######## Evaluation episode {} ########".format(i))
env.reset()
eval_actor.run()
if i % eval_interval == 0:
print("######## Evaluation episode {} ########".format(i))
eval_env.reset()
eval_actor.run()
def is_initial(tempdir: str) -> bool:
return not os.path.exists(tempdir)