diff --git a/.vscode/launch.json b/.vscode/launch.json
index 962d6dc5c723e2748a4f2ff022bbdca79900053e..e903e22c3c19e7af805bb50d64f3f2c4beec117a 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -10,7 +10,7 @@
             "request": "launch",
             "program": "train/start_training.py",
             "console": "integratedTerminal",
-            "justMyCode": false
+            "justMyCode": true
         }
     ]
 }
\ No newline at end of file
diff --git a/envs/BaseEnvironment.py b/envs/BaseEnvironment.py
index 6e0fac122c38b28a49a0583355730f948ff78ded..d0b80d5bc5d988ac8c72894f097e76caa3343bdb 100644
--- a/envs/BaseEnvironment.py
+++ b/envs/BaseEnvironment.py
@@ -2,6 +2,8 @@ import numpy as np
 from airsim import MultirotorClient
 from envs import BaseEnvironment
 import abc
+import math
+from msgpackrpc.error import TimeoutError

 from numpy.typing import NDArray

@@ -11,9 +13,21 @@ from tf_agents.typing.types import TimeStep, NestedArray, Text
 from tf_agents.specs.array_spec import BoundedArraySpec, ArraySpec

 class BaseEnvironment(PyEnvironment):
+    min_throttle = 0.4
+    max_throttle = 1.0
+
+    max_pitch = math.pi / 4
+    min_pitch = -max_pitch
+
+    max_roll = math.pi / 4
+    min_roll = -max_roll
+
+    max_yaw = math.pi / 2
+    min_yaw = -max_yaw
     def __init__(self, ip: str = None, client: MultirotorClient = None, handle_auto_reset: bool = False):
+        self._ip = ip
         if client is None:
-            self._client = MultirotorClient(ip=ip)
+            self._create_client()
         else:
             self._client = client

@@ -36,12 +50,16 @@ class BaseEnvironment(PyEnvironment):
         if self._must_restart():
             return self.reset()

-        vz = self._convert_z_velocity_for_airsim(action[0].item())
-        vx = action[1].item()
-        vy = action[2].item()
+        throttle = self._convert_throttle_for_airsim(action[0].item())
+        pitch = self._convert_pitch_for_airsim(action[1].item())
+        roll = self._convert_roll_for_airsim(action[2].item())
+        yaw = self._convert_yaw_for_airsim(action[3].item())
+        # vx = self._convert_xy_velocity_for_airsim(action[1].item())
+        # vy = self._convert_xy_velocity_for_airsim(action[2].item())

         # self._client.cancelLastTask() # Maybe needed when using async commands
-        self._client.moveByVelocityAsync(vx, vy, vz, 0.0001).join() # execute command for 0.0001 seconds
+        # self._client.moveByVelocityBodyFrameAsync(vx, vy, vz, 0.01).join()
+        self._client.moveByRollPitchYawrateThrottleAsync(roll, pitch, yaw, throttle, 0.01).join()

         obs = self._get_observation()
         reward = self._calculate_reward(obs)
@@ -54,23 +72,90 @@
         return ts.transition(obs, reward=reward)


-    def _convert_z_velocity_for_airsim(self, z_velocity: np.float64) -> np.float64:
-        """
-        returns converted velocity:
-        - 1: max down
-        - 0: stop/hover
-        - -1: max up
-        """
-        return z_velocity * (-2) + 1
+    def _create_client(self):
+        self._client = MultirotorClient(ip=self._ip, timeout_value=10)

-    def _convert_z_velocity_for_model(self, z_velocity: np.float64) -> np.float64:
-        """
-        returns converted velocity:
-        - 0: max down
-        - 0.5: stop/hover
-        - 1: max up
-        """
-        return (z_velocity - 1) / (-2)
+    # def _call_until_no_timeout(self, f, *args, **kwargs):
+    #     succeeded = False
+    #     has_timeout = False
+    #     return_val = None
+    #     while not succeeded:
+    #         try:
+    #             if not has_timeout:
+    #                 return_val = f(*args, **kwargs)
+    #             else:
+    #                 self._create_client()
+    #                 self._reset()
+    #                 return_val = f(*args, **kwargs)
+    #             succeeded = True
+    #         except TimeoutError:
+    #             print("Timeout error, check simulator")
+    #             has_timeout = True
+
+    #     return return_val
+
+    @staticmethod
+    def _convert_property_for_airsim(min: float, max: float, property: np.float32) -> float:
+        return (max - min) * property + min
+
+    def _convert_throttle_for_airsim(self, throttle: np.float32) -> float:
+        return BaseEnvironment._convert_property_for_airsim(self.min_throttle, self.max_throttle, throttle)
+
+    def _convert_pitch_for_airsim(self, pitch: np.float32) -> float:
+        return BaseEnvironment._convert_property_for_airsim(self.min_pitch, self.max_pitch, pitch)
+
+    def _convert_roll_for_airsim(self, roll: np.float32) -> float:
+        return BaseEnvironment._convert_property_for_airsim(self.min_roll, self.max_roll, roll)
+
+    def _convert_yaw_for_airsim(self, yaw: np.float32) -> float:
+        return BaseEnvironment._convert_property_for_airsim(self.min_yaw, self.max_yaw, yaw)
+
+    # def _convert_z_velocity_for_airsim(self, z_velocity: np.float64) -> np.float64:
+    #     """
+    #     returns converted velocity:
+    #     - 1: max down
+    #     - 0: stop/hover
+    #     - -1: max up
+    #     """
+    #     return z_velocity * (-2) + 1
+
+    # def _convert_z_velocity_for_model(self, z_velocity: np.float64) -> np.float64:
+    #     """
+    #     returns converted velocity:
+    #     - 0: max down
+    #     - 0.5: stop/hover
+    #     - 1: max up
+    #     """
+    #     return (z_velocity - 1) / (-2)
+
+    # def _convert_xy_velocity_for_airsim(self, velocity: np.float64) -> np.float64:
+    #     """
+    #     input
+    #     - 0: max negative speed
+    #     - 0.5: no speed
+    #     - 1: max speed
+
+    #     returns
+    #     - 1: max speed
+    #     - 0: no speed
+    #     - -1: max negative speed
+    #     """
+    #     return velocity * 2 - 1
+
+    # def _convert_xy_velocity_for_model(self, velocity: np.float64) -> np.float64:
+    #     """
+    #     input
+    #     - 1: max speed
+    #     - 0: no speed
+    #     - -1: max negative speed
+
+    #     returns
+
+    #     - 0: max negative speed
+    #     - 0.5: no speed
+    #     - 1: max speed
+    #     """
+    #     return (velocity - 1) / 2

     def render(self, mode: Text = 'rgb_array') -> NestedArray | None:
         raise NotImplementedError()
diff --git a/envs/StartEnvironment.py b/envs/StartEnvironment.py
index cfa49fa582ee853b6b27ce90bb1755c22a8f8ec7..7d1350fd159bb7f71f13441c1c74038a357f8222 100644
--- a/envs/StartEnvironment.py
+++ b/envs/StartEnvironment.py
@@ -2,6 +2,7 @@ from airsim import MultirotorClient
 from numpy.typing import NDArray
 from envs import BaseEnvironment, StartEnvironment
 import numpy as np
+from tf_agents.trajectories import time_step as ts

 from tf_agents.specs.array_spec import BoundedArraySpec, ArraySpec
 from tf_agents.typing.types import TimeStep, NestedArray
@@ -12,13 +13,15 @@ class StartEnvironment(BaseEnvironment):
                  client: MultirotorClient = None,
                  handle_auto_reset: bool = False,
                  desired_height: np.float64=140.0,
-                 check_out_of_bounds: bool = False):
+                 check_out_of_bounds: bool = False,
+                 dynamic_start: bool = False):
         self._desired_height = desired_height
         self._check_out_of_bounds = check_out_of_bounds
         self._is_out_of_bounds = False
+        self._dynamic_start = dynamic_start

         self._action_spec = BoundedArraySpec(shape=(1,), dtype=np.float32, minimum=0, maximum=1, name='action')
-        self._observation_spec = ArraySpec(shape=(1,), dtype=np.float64, name='observation')
+        self._observation_spec = ArraySpec(shape=(2,), dtype=np.float64, name='observation')

         super(StartEnvironment, self).__init__(ip, client, handle_auto_reset)

@@ -30,11 +33,20 @@ class StartEnvironment(BaseEnvironment):

     def _reset(self) -> TimeStep:
         self._is_out_of_bounds = False
-        return super()._reset()
+        time_step = super()._reset()
+        if self._dynamic_start:
+            altitude = np.random.random_sample() * 60 + 122.2
+            position = self._client.getMultirotorState().gps_location
+            self._client.moveToGPSAsync(latitude=position.latitude, longitude=position.longitude, altitude=altitude, velocity=10).join()
+            time_step = ts.restart(self._get_observation())
+        return time_step

     def _step(self, action: NestedArray) -> TimeStep:
-        full_action = np.append(action, [0, 0])
-        return super()._step(full_action)
+        full_action = np.append(action, [0.5, 0.5, 0.5])
+        ts = super()._step(full_action)
+        print("Current height delta: {}".format(ts.observation[0]))
+
+        return ts

     def _is_terminated(self, obs: NDArray) -> bool:
         return False
@@ -46,9 +58,26 @@
     def _must_restart(self) -> bool:
         return self._is_out_of_bounds

-    def _get_observation(self) -> NDArray:
-        geo_point = self._client.getMultirotorState().gps_location
-        return np.array([geo_point.altitude - self._desired_height])
+    def _get_observation(self) -> NDArray:
+        state = self._client.getMultirotorState()
+        geo_point = state.gps_location
+        height_delta = geo_point.altitude - self._desired_height
+        normalized_height_delta = self._normalize_height_delta(height_delta)
+        # acceleration = self._client.getImuData().linear_acceleration
+        velocity = state.kinematics_estimated.linear_velocity
+        return np.array([normalized_height_delta, velocity.z_val], dtype=np.float64)

     def _calculate_reward(self, obs: NDArray) -> np.float32:
-        return np.float32(-(obs[0] ** 2))
+        # if abs(obs[0]) < 0.2:
+        #     return np.float32(1)
+        # else:
+        #     return np.float32(0)
+
+        abs_relative_height = abs(obs[0])
+        reward = -abs_relative_height
+        # if abs_relative_height < 0.5:
+        #     reward += 1
+        return np.float32(reward)
+
+    def _normalize_height_delta(self, height: float) -> float:
+        return height
diff --git a/tests/starting_test.py b/tests/starting_test.py
index 012a7e33cbdab03828bdeb8c1be0ce531606df0e..360a200383fa95fc08ccff08be3405538ebcc455 100644
--- a/tests/starting_test.py
+++ b/tests/starting_test.py
@@ -13,6 +13,7 @@ from tf_agents.networks.actor_distribution_network import ActorDistributionNetwo
 from tf_agents.agents.sac.tanh_normal_projection_network import TanhNormalProjectionNetwork
 from tf_agents.train.utils import train_utils
 from tf_agents.policies.py_tf_eager_policy import PyTFEagerPolicy
+from tf_agents.policies.greedy_policy import GreedyPolicy

 SIM_IP = "192.168.8.195"
 actor_fc_layer_params = (10,)
@@ -29,7 +30,7 @@ tempdir="out/"

 client = airsim.MultirotorClient(SIM_IP)

-eval_policy = tf.saved_model.load("out/policies/greedy_policy")
+eval_policy = tf.saved_model.load("out/policy")

 def _step_loop(env, steps: int, initial_time_step):
     time_step = initial_time_step
@@ -55,8 +56,9 @@ def _assert_last_time_steps(time_steps: []):
     last_observations = np.array(last_observations).flatten()

     for ts in last_steps:
-        height_diffrence = ts.observation.numpy().flatten()
+        height_diffrence = ts.observation.numpy().flatten()[0]
         assert(abs(height_diffrence) < 1.0)
+        assert(abs(last_observations.max() - last_observations.min()) < 1.0)


 def test130():
@@ -93,3 +95,9 @@ def test_landing():
     time_steps =_step_loop(env, 2000, time_step)

     _assert_last_time_steps(time_steps)
+
+test130()
+test150()
+test180()
+test150from180()
+test_landing()
diff --git a/train/start_training.py b/train/start_training.py
index dcfdc6588dfe0118c505111fe9c3c8af8e6f969a..546fa8633d0076ed409ac6e2079501ec94e57096 100644
--- a/train/start_training.py
+++ b/train/start_training.py
@@ -1,6 +1,7 @@
-from envs import StartEnvironment
+from envs import StartEnvironment, TestEnvironment
 import os
 import train.utils as utils
+import numpy as np

 from tf_agents.environments.tf_py_environment import TFPyEnvironment
 from tf_agents.train.utils import train_utils
@@ -11,55 +12,72 @@ SIM_IP = "192.168.8.195"
 # Use "num_iterations = 1e6" for better results (2 hrs)
 # 1e5 is just so this doesn't take too long (1 hr)
 num_episodes = 10000
-num_steps_per_episode = 1000
+num_steps_per_episode = 2000

-initial_collect_steps = 1000 # @param {type:"integer"}
+initial_collect_steps = 2000 # @param {type:"integer"}
 collect_steps_per_iteration = 1 # @param {type:"integer"}
-replay_buffer_capacity = 10000 # @param {type:"integer"}
+replay_buffer_capacity = 100000 # @param {type:"integer"}
+replay_buffer_server="localhost:40000"

-batch_size = 256 # @param {type:"integer"}
+batch_size = 1024 # @param {type:"integer"}

 critic_learning_rate = 3e-4 # @param {type:"number"}
 actor_learning_rate = 3e-4 # @param {type:"number"}
 alpha_learning_rate = 3e-4 # @param {type:"number"}
 target_update_tau = 0.005 # @param {type:"number"}
 target_update_period = 1 # @param {type:"number"}
-gamma = 0.99 # @param {type:"number"}
-reward_scale_factor = 0.8 # @param {type:"number"}
+gamma = 0.9 # @param {type:"number"}
+reward_scale_factor = 1.0 # @param {type:"number"}

-actor_fc_layer_params = (10,)
-critic_joint_fc_layer_params = (10,)
+actor_fc_layer_params = (16,16)
+critic_joint_fc_layer_params = actor_fc_layer_params

 log_interval = 5000 # @param {type:"integer"}

 num_eval_episodes = 20 # @param {type:"integer"}
-eval_interval = 10000 # @param {type:"integer"}
+eval_interval = 5000 # @param {type:"integer"}

 policy_save_interval = 1000 # @param {type:"integer"}

 tempdir = "out/"
+is_initial = utils.is_initial(tempdir)

 utils.set_gpu()

-py_env = StartEnvironment(ip=SIM_IP)
-env = TFPyEnvironment(py_env)
+train_env = StartEnvironment(ip=SIM_IP, desired_height=160, dynamic_start=True)
+eval_env = StartEnvironment(ip=SIM_IP, desired_height=180, dynamic_start=False)
+train_tf_env = TFPyEnvironment(train_env)
+# py_env = TestEnvironment(ip=SIM_IP)
+# env = TFPyEnvironment(py_env)
+# env.reset()

-observation_spec = env.observation_spec()
-action_spec = env.action_spec()
-time_step_spec = env.time_step_spec()
+# action = np.array([0,0.5,0.5,0.5], dtype=np.float32)
+
+# init_action = np.array([0.5, 0.5, 0.5, 0])
+# for i in range(35):
+#     py_env.step(init_action)
+
+# while True:
+#     t = py_env.step(action)
+#     m = 0
+
+observation_spec = train_tf_env.observation_spec()
+action_spec = train_tf_env.action_spec()
+time_step_spec = train_tf_env.time_step_spec()

 train_step = train_utils.create_train_step()

-tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params)
+tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma, reward_scale_factor=reward_scale_factor)

-reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_capacity)
+reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_server, replay_buffer_capacity)

-eval_policy = utils.create_policy(tf_agent.policy)
 collect_policy = utils.create_policy(tf_agent.collect_policy)
+eval_policy = utils.create_policy(tf_agent.policy)
 random_policy = RandomPyPolicy(time_step_spec, action_spec)

-initial_collect_actor, collect_actor, eval_actor = utils.create_actors(
-    py_env,
+random_actor, collect_actor, eval_actor = utils.create_actors(
+    train_env,
+    eval_env,
     random_policy,
     collect_policy,
     eval_policy,
@@ -67,20 +85,29 @@ initial_collect_actor, collect_actor, eval_actor = utils.create_actors(
     rb_observer,
 )

-initial_collect_actor.run()
+checkpoint_dir = os.path.join(tempdir, 'checkpoint')
+train_checkpointer, policy_saver = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
+# reverb_replay = train_checkpointer.manager.checkpoint.replay_buffer # must be done manually, gets somehow not restored

 agent_learner = utils.create_learner(tf_agent, train_step, tempdir, num_steps_per_episode, reverb_replay, batch_size)

-checkpoint_dir = os.path.join(tempdir, 'checkpoint')
-train_checkpointer = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
+# eval_env.reset()
+# eval_actor.run()
+# env.reset()
+# eval_actor.run()

 utils.train_loop(
     agent_learner,
     collect_actor,
     eval_actor,
-    env,
+    random_actor,
+    train_env,
+    eval_env,
     train_checkpointer,
+    policy_saver,
     train_step,
     num_episodes,
     num_steps_per_episode,
+    epsilon=0.2,
+    is_initial=is_initial
 )
diff --git a/train/utils.py b/train/utils.py
index f43da3f6925e82c3ba1834f26869f06399bd2406..33fc537f0ba1220c604c9a46eefa0b93f95f82c4 100644
--- a/train/utils.py
+++ b/train/utils.py
@@ -2,6 +2,7 @@ from typing import Any, Tuple
 import tensorflow as tf
 import reverb
 import os
+import numpy as np

 from tf_agents.typing.types import Float, Int, NestedTensorSpec
 from tf_agents.agents.ddpg.critic_network import CriticNetwork
@@ -20,9 +21,8 @@ from tf_agents.metrics.py_metrics import EnvironmentSteps
 from tf_agents.train import actor
 from tf_agents.train import learner
 from tf_agents.train.learner import Learner
-from tf_agents.train.triggers import PolicySavedModelTrigger
 from tf_agents.utils.common import Checkpointer
-from tf_agents.environments.tf_py_environment import TFPyEnvironment
+from tf_agents.policies import PolicySaver


 def set_gpu():
@@ -74,17 +74,16 @@ def create_agent(action_spec: Any,
         reward_scale_factor=reward_scale_factor,
         train_step_counter=train_step)

-def create_replay_buffer(collect_data_spec: NestedTensorSpec, capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
+def create_replay_buffer(collect_data_spec: NestedTensorSpec, server_address: str, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
     table_name = 'uniform_table'
     table = reverb.Table(
         table_name,
-        max_size=capacity,
+        max_size=replay_buffer_capacity,
         sampler=reverb.selectors.Uniform(),
         remover=reverb.selectors.Fifo(),
         rate_limiter=reverb.rate_limiters.MinSize(1))

     reverb_server = reverb.Server([table])
-
     replay_buffer = ReverbReplayBuffer(
         collect_data_spec,
         sequence_length=2,
@@ -102,27 +101,30 @@ def create_replay_buffer(collect_data_spec: NestedTensorSpec, capacity: int) ->
 def create_policy(policy: TFPolicy) -> PyTFEagerPolicy:
     return PyTFEagerPolicy(policy, use_tf_function=True)

-def create_actors(py_env: PyEnvironment,
+# def create_eval_policy(policy: TFPolicy) -> PyTFEagerPolicy:
+#     return PyTFEagerPolicy(GreedyPolicy(policy), use_tf_function=True)
+
+def create_actors(train_env: PyEnvironment,
+                  eval_env: PyEnvironment,
                   random_policy: PyPolicy,
                   collect_policy: PyPolicy,
                   eval_policy: PyPolicy,
                   train_step: tf.Variable,
                   rb_observer: ReverbAddTrajectoryObserver,
-                  initial_collect_steps: int = 1000,
                   eval_episodes_per_run: int = 20,
                   eval_steps_per_run: int = 2000,
                   tempdir: str = "out/"
                   ) -> Tuple[Actor, Actor, Actor]:
-    initial_collect_actor = Actor(
-        py_env,
+    random_actor = Actor(
+        train_env,
         random_policy,
         train_step,
-        steps_per_run=initial_collect_steps,
+        steps_per_run=1,
         observers=[rb_observer])

     env_step_metric = EnvironmentSteps()
     collect_actor = Actor(
-        py_env,
+        train_env,
         collect_policy,
         train_step,
         steps_per_run=1,
@@ -131,7 +133,7 @@ def create_actors(py_env: PyEnvironment,
         observers=[rb_observer, env_step_metric])

     eval_actor = Actor(
-        py_env,
+        eval_env,
         eval_policy,
         train_step,
         episodes_per_run=eval_episodes_per_run,
@@ -139,7 +141,7 @@ def create_actors(py_env: PyEnvironment,
         metrics=actor.eval_metrics(eval_episodes_per_run),
     )

-    return initial_collect_actor, collect_actor, eval_actor
+    return random_actor, collect_actor, eval_actor

 def create_learner(agent: TFAgent, train_step: tf.Variable, tempdir: str, steps_per_episode: int, replay_buffer: ReverbReplayBuffer, batch_size: int) -> Learner:
     dataset = replay_buffer.as_dataset(
@@ -147,20 +149,12 @@ def create_learner(agent: TFAgent, train_step: tf.Variable, tempdir: str, steps_
     experience_dataset_fn = lambda: dataset

     saved_model_dir = os.path.join(tempdir, learner.POLICY_SAVED_MODEL_DIR)
-    learning_triggers = [
-        PolicySavedModelTrigger(
-            saved_model_dir,
-            agent,
-            train_step,
-            interval=steps_per_episode),
-    ]

     return Learner(
         tempdir,
         train_step,
         agent,
-        experience_dataset_fn,
-        triggers=learning_triggers)
+        experience_dataset_fn)

 def create_checkpoint(tempdir: str, agent: TFAgent, replay_buffer: ReverbReplayBuffer, train_step: tf.Variable) -> Checkpointer:
     checkpoint_dir = os.path.join(tempdir, 'checkpoint')
@@ -174,30 +168,54 @@ def create_checkpoint(tempdir: str, agent: TFAgent, replay_buffer: ReverbReplayB
     )

     checkpointer.initialize_or_restore()
-    return checkpointer
+    policy_saver = PolicySaver(agent.policy)
+
+    return checkpointer, policy_saver

 def train_loop(learner: Learner,
                collect_actor: Actor,
                eval_actor: Actor,
-               env: TFPyEnvironment,
-               checkpointer: Checkpointer,
+               random_actor: Actor,
+               train_env: PyEnvironment,
+               eval_env: PyEnvironment,
+               checkpointer: Checkpointer,
+               policy_saver: PolicySaver,
                train_step: tf.Variable,
                train_episodes: int,
                train_steps_per_episode: int,
+               initial_steps: int = 1000,
               initial_learning_iterations: int = 1,
               episode_learning_iterations: int = 10,
+               epsilon: float = None,
+               eval_interval: int = 5,
+               is_initial: bool = True,
+               outdir: str = "out",
               ):
+    policy_dir = os.path.join(outdir, "policy")
+    policy_saver.save(policy_dir)
+    for _ in range(initial_steps):
+        if is_initial:
+            random_actor.run()
+        else:
+            collect_actor.run()
+    learner.run(iterations=initial_learning_iterations)

     for i in range(train_episodes):
-        env.reset()
+        train_env.reset()

-        for _ in range(train_steps_per_episode):
+        for j in range(train_steps_per_episode):
+            # if exploration_interval is not None and np.random.randint(0, exploration_interval) == 0:
+            #     random_actor.run()
+            # else:
             collect_actor.run()
-
-        learner.run(iterations=episode_learning_iterations)
+            learner.run(iterations=1)
         checkpointer.save(train_step)

-        print("######## Evaluation episode {} ########".format(i))
-        env.reset()
-        eval_actor.run()
+        if i % eval_interval == 0:
+            print("######## Evaluation episode {} ########".format(i))
+            eval_env.reset()
+            eval_actor.run()
+
+def is_initial(tempdir: str) -> bool:
+    return not os.path.exists(tempdir)
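
Note on the action scaling introduced in envs/BaseEnvironment.py: the policy now emits every action component in [0, 1], and _convert_property_for_airsim maps each component onto its AirSim range (throttle 0.4..1.0, pitch/roll +-pi/4, yaw rate +-pi/2) with the affine formula (max - min) * value + min before the values are passed to moveByRollPitchYawrateThrottleAsync. The sketch below is only an illustration of that mapping, not part of the patch: the constants are copied from the class attributes above, while the helper name scale_action is made up for the example.

import math

# Ranges copied from the BaseEnvironment class attributes in the diff above.
MIN_THROTTLE, MAX_THROTTLE = 0.4, 1.0
MIN_PITCH, MAX_PITCH = -math.pi / 4, math.pi / 4
MIN_YAW, MAX_YAW = -math.pi / 2, math.pi / 2


def scale_action(minimum: float, maximum: float, value: float) -> float:
    """Affine map of a normalized action component in [0, 1] onto [minimum, maximum].

    Illustrative stand-in for BaseEnvironment._convert_property_for_airsim.
    """
    return (maximum - minimum) * value + minimum


if __name__ == "__main__":
    # 0.0 and 1.0 hit the range bounds, 0.5 lands on the midpoint.
    print(scale_action(MIN_THROTTLE, MAX_THROTTLE, 0.5))  # ~0.7
    print(scale_action(MIN_PITCH, MAX_PITCH, 0.0))        # -pi/4
    print(scale_action(MIN_YAW, MAX_YAW, 1.0))            # +pi/2

With the (1,)-shaped action spec in StartEnvironment only the throttle component is learned; _step pads pitch, roll and yaw with 0.5, which this mapping turns into the midpoint of each range, i.e. level attitude and zero yaw rate.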