diff --git a/envs/StartEnvironment.py b/envs/StartEnvironment.py
index 7d1350fd159bb7f71f13441c1c74038a357f8222..16eb8afd69268eb94e28a3a02f1d53bd10dbbe8b 100644
--- a/envs/StartEnvironment.py
+++ b/envs/StartEnvironment.py
@@ -63,21 +63,11 @@ class StartEnvironment(BaseEnvironment):
         geo_point = state.gps_location
         height_delta = geo_point.altitude - self._desired_height
         normalized_height_delta = self._normalize_height_delta(height_delta)
-        # acceleration = self._client.getImuData().linear_acceleration
         velocity = state.kinematics_estimated.linear_velocity
         return np.array([normalized_height_delta, velocity.z_val], dtype=np.float64)
 
     def _calculate_reward(self, obs: NDArray) -> np.float32:
-        # if abs(obs[0]) < 0.2:
-        #     return np.float32(1)
-        # else:
-        #     return np.float32(0)
-
-        abs_relative_height = abs(obs[0])
-        reward = -abs_relative_height
-        # if abs_relative_height < 0.5:
-        #     reward += 1
-        return np.float32(reward)
+        return np.float32(abs(obs[0]))
 
     def _normalize_height_delta(self, height: float) -> float:
         return height
diff --git a/train/start_training.py b/train/start_training.py
index 546fa8633d0076ed409ac6e2079501ec94e57096..f7d514301fca9c900b78ba64b2e42a4b56867781 100644
--- a/train/start_training.py
+++ b/train/start_training.py
@@ -1,4 +1,4 @@
-from envs import StartEnvironment, TestEnvironment
+from envs import StartEnvironment
 import os
 import train.utils as utils
 import numpy as np
@@ -9,36 +9,21 @@ from tf_agents.policies.random_py_policy import RandomPyPolicy
 
 SIM_IP = "192.168.8.195"
 
-# Use "num_iterations = 1e6" for better results (2 hrs)
-# 1e5 is just so this doesn't take too long (1 hr)
 num_episodes = 10000
 num_steps_per_episode = 2000
 
-initial_collect_steps = 2000 # @param {type:"integer"}
-collect_steps_per_iteration = 1 # @param {type:"integer"}
-replay_buffer_capacity = 100000 # @param {type:"integer"}
-replay_buffer_server="localhost:40000"
+replay_buffer_capacity = 100000
 
-batch_size = 1024 # @param {type:"integer"}
+batch_size = 1024
 
-critic_learning_rate = 3e-4 # @param {type:"number"}
-actor_learning_rate = 3e-4 # @param {type:"number"}
-alpha_learning_rate = 3e-4 # @param {type:"number"}
-target_update_tau = 0.005 # @param {type:"number"}
-target_update_period = 1 # @param {type:"number"}
-gamma = 0.9 # @param {type:"number"}
-reward_scale_factor = 1.0 # @param {type:"number"}
+critic_learning_rate = 3e-4
+actor_learning_rate = 3e-4
+alpha_learning_rate = 3e-4
+gamma = 0.9
 
 actor_fc_layer_params = (16,16)
 critic_joint_fc_layer_params = actor_fc_layer_params
 
-log_interval = 5000 # @param {type:"integer"}
-
-num_eval_episodes = 20 # @param {type:"integer"}
-eval_interval = 5000 # @param {type:"integer"}
-
-policy_save_interval = 1000 # @param {type:"integer"}
-
 tempdir = "out/"
 
 is_initial = utils.is_initial(tempdir)
@@ -47,19 +32,6 @@ utils.set_gpu()
 train_env = StartEnvironment(ip=SIM_IP, desired_height=160, dynamic_start=True)
 eval_env = StartEnvironment(ip=SIM_IP, desired_height=180, dynamic_start=False)
 train_tf_env = TFPyEnvironment(train_env)
-# py_env = TestEnvironment(ip=SIM_IP)
-# env = TFPyEnvironment(py_env)
-# env.reset()
-
-# action = np.array([0,0.5,0.5,0.5], dtype=np.float32)
-
-# init_action = np.array([0.5, 0.5, 0.5, 0])
-# for i in range(35):
-#     py_env.step(init_action)
-
-# while True:
-#     t = py_env.step(action)
-#     m = 0
 
 observation_spec = train_tf_env.observation_spec()
 action_spec = train_tf_env.action_spec()
@@ -67,9 +39,9 @@ time_step_spec = train_tf_env.time_step_spec()
 
 train_step = train_utils.create_train_step()
 
-tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma, reward_scale_factor=reward_scale_factor)
+tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma)
 
-reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_server, replay_buffer_capacity)
+reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_capacity)
 
 collect_policy = utils.create_policy(tf_agent.collect_policy)
 eval_policy = utils.create_policy(tf_agent.policy)
@@ -87,15 +59,8 @@ random_actor, collect_actor, eval_actor = utils.create_actors(
 
 checkpoint_dir = os.path.join(tempdir, 'checkpoint')
 train_checkpointer, policy_saver = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
-# reverb_replay = train_checkpointer.manager.checkpoint.replay_buffer # must be done manually, gets somehow not restored
-
 agent_learner = utils.create_learner(tf_agent, train_step, tempdir, num_steps_per_episode, reverb_replay, batch_size)
 
-# eval_env.reset()
-# eval_actor.run()
-# env.reset()
-# eval_actor.run()
-
 utils.train_loop(
     agent_learner,
     collect_actor,
@@ -108,6 +73,5 @@ utils.train_loop(
     train_step,
     num_episodes,
     num_steps_per_episode,
-    epsilon=0.2,
     is_initial=is_initial
 )
diff --git a/train/utils.py b/train/utils.py
index 33fc537f0ba1220c604c9a46eefa0b93f95f82c4..da7e9aa54432388ac296c08f8aa82f6463be4612 100644
--- a/train/utils.py
+++ b/train/utils.py
@@ -42,7 +42,7 @@ def create_agent(action_spec: Any,
                  target_update_tau: Float = 0.005,
                  target_update_period: Int = 1,
                  gamma: Float = 0.99,
-                 reward_scale_factor: Float = 0.8,
+                 reward_scale_factor: Float = 1.0,
                  ) -> SacAgent:
     critic_net = CriticNetwork(
         (observation_spec, action_spec),
@@ -74,7 +74,7 @@ def create_agent(action_spec: Any,
         reward_scale_factor=reward_scale_factor,
         train_step_counter=train_step)
 
-def create_replay_buffer(collect_data_spec: NestedTensorSpec, server_address: str, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
+def create_replay_buffer(collect_data_spec: NestedTensorSpec, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
     table_name = 'uniform_table'
     table = reverb.Table(
         table_name,
@@ -185,14 +185,11 @@ def train_loop(learner: Learner,
               train_steps_per_episode: int,
               initial_steps: int = 1000,
               initial_learning_iterations: int = 1,
-              episode_learning_iterations: int = 10,
-              epsilon: float = None,
               eval_interval: int = 5,
               is_initial: bool = True,
               outdir: str = "out",
               ):
     policy_dir = os.path.join(outdir, "policy")
-    policy_saver.save(policy_dir)
     for _ in range(initial_steps):
         if is_initial:
             random_actor.run()
@@ -211,6 +208,7 @@ def train_loop(learner: Learner,
 
         learner.run(iterations=1)
         checkpointer.save(train_step)
+        policy_saver.save(policy_dir)
 
         if i % eval_interval == 0:
             print("######## Evaluation episode {} ########".format(i))