diff --git a/envs/StartEnvironment.py b/envs/StartEnvironment.py
index 7d1350fd159bb7f71f13441c1c74038a357f8222..16eb8afd69268eb94e28a3a02f1d53bd10dbbe8b 100644
--- a/envs/StartEnvironment.py
+++ b/envs/StartEnvironment.py
@@ -63,21 +63,12 @@ class StartEnvironment(BaseEnvironment):
         geo_point = state.gps_location
         height_delta = geo_point.altitude - self._desired_height
         normalized_height_delta = self._normalize_height_delta(height_delta)
-        # acceleration = self._client.getImuData().linear_acceleration
         velocity = state.kinematics_estimated.linear_velocity
         return np.array([normalized_height_delta, velocity.z_val], dtype=np.float64)
     
     def _calculate_reward(self, obs: NDArray) -> np.float32:
-        # if abs(obs[0]) < 0.2:
-        #     return np.float32(1)
-        # else:
-        #     return np.float32(0)
-
-        abs_relative_height = abs(obs[0])
-        reward = -abs_relative_height
-        # if abs_relative_height < 0.5:
-        #     reward += 1
-        return np.float32(reward)
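+        # Negative absolute distance from the desired height: the reward is maximal (0) at the target altitude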
+        return np.float32(-abs(obs[0]))
     
     def _normalize_height_delta(self, height: float) -> float:
         return height
diff --git a/train/start_training.py b/train/start_training.py
index 546fa8633d0076ed409ac6e2079501ec94e57096..f7d514301fca9c900b78ba64b2e42a4b56867781 100644
--- a/train/start_training.py
+++ b/train/start_training.py
@@ -1,4 +1,4 @@
-from envs import StartEnvironment, TestEnvironment
+from envs import StartEnvironment
 import os
 import train.utils as utils
 import numpy as np
@@ -9,36 +9,21 @@ from tf_agents.policies.random_py_policy import RandomPyPolicy
 
 SIM_IP = "192.168.8.195"
 
-# Use "num_iterations = 1e6" for better results (2 hrs)
-# 1e5 is just so this doesn't take too long (1 hr)
 num_episodes = 10000
 num_steps_per_episode = 2000
 
-initial_collect_steps = 2000 # @param {type:"integer"}
-collect_steps_per_iteration = 1 # @param {type:"integer"}
-replay_buffer_capacity = 100000 # @param {type:"integer"}
-replay_buffer_server="localhost:40000"
+replay_buffer_capacity = 100000
 
-batch_size = 1024 # @param {type:"integer"}
+batch_size = 1024
 
-critic_learning_rate = 3e-4 # @param {type:"number"}
-actor_learning_rate = 3e-4 # @param {type:"number"}
-alpha_learning_rate = 3e-4 # @param {type:"number"}
-target_update_tau = 0.005 # @param {type:"number"}
-target_update_period = 1 # @param {type:"number"}
-gamma = 0.9 # @param {type:"number"}
-reward_scale_factor = 1.0 # @param {type:"number"}
+critic_learning_rate = 3e-4
+actor_learning_rate = 3e-4
+alpha_learning_rate = 3e-4
+gamma = 0.9
 
 actor_fc_layer_params = (16,16)
 critic_joint_fc_layer_params = actor_fc_layer_params
 
-log_interval = 5000 # @param {type:"integer"}
-
-num_eval_episodes = 20 # @param {type:"integer"}
-eval_interval = 5000 # @param {type:"integer"}
-
-policy_save_interval = 1000 # @param {type:"integer"}
-
 tempdir = "out/"
 is_initial = utils.is_initial(tempdir)
 
@@ -47,19 +32,6 @@ utils.set_gpu()
 train_env = StartEnvironment(ip=SIM_IP, desired_height=160, dynamic_start=True)
 eval_env = StartEnvironment(ip=SIM_IP, desired_height=180, dynamic_start=False)
 train_tf_env = TFPyEnvironment(train_env)
-# py_env = TestEnvironment(ip=SIM_IP)
-# env = TFPyEnvironment(py_env)
-# env.reset()
-
-# action = np.array([0,0.5,0.5,0.5], dtype=np.float32)
-
-# init_action = np.array([0.5, 0.5, 0.5, 0])
-# for i in range(35):
-#     py_env.step(init_action)
-
-# while True:
-#     t = py_env.step(action)
-#     m = 0
 
 observation_spec = train_tf_env.observation_spec()
 action_spec = train_tf_env.action_spec()
@@ -67,9 +39,9 @@ time_step_spec = train_tf_env.time_step_spec()
 
 train_step = train_utils.create_train_step()
 
-tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma, reward_scale_factor=reward_scale_factor)
+tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma)
 
-reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_server, replay_buffer_capacity)
+reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_capacity)
 
 collect_policy = utils.create_policy(tf_agent.collect_policy)
 eval_policy = utils.create_policy(tf_agent.policy)
@@ -87,15 +59,8 @@ random_actor, collect_actor, eval_actor = utils.create_actors(
 
 checkpoint_dir = os.path.join(tempdir, 'checkpoint')
 train_checkpointer, policy_saver = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
-# reverb_replay = train_checkpointer.manager.checkpoint.replay_buffer # must be done manually, gets somehow not restored
-
 agent_learner = utils.create_learner(tf_agent, train_step, tempdir, num_steps_per_episode, reverb_replay, batch_size)
 
-# eval_env.reset()
-# eval_actor.run()
-# env.reset()
-# eval_actor.run()
-
 utils.train_loop(
     agent_learner,
     collect_actor,
@@ -108,6 +73,5 @@ utils.train_loop(
     train_step,
     num_episodes,
     num_steps_per_episode,
-    epsilon=0.2,
     is_initial=is_initial
 )
diff --git a/train/utils.py b/train/utils.py
index 33fc537f0ba1220c604c9a46eefa0b93f95f82c4..da7e9aa54432388ac296c08f8aa82f6463be4612 100644
--- a/train/utils.py
+++ b/train/utils.py
@@ -42,7 +42,7 @@ def create_agent(action_spec: Any,
                  target_update_tau: Float = 0.005,
                  target_update_period: Int = 1,
                  gamma: Float = 0.99,
-                 reward_scale_factor: Float = 0.8,
+                 reward_scale_factor: Float = 1.0,
                  ) -> SacAgent:
     critic_net = CriticNetwork(
         (observation_spec, action_spec),
@@ -74,7 +74,7 @@ def create_agent(action_spec: Any,
         reward_scale_factor=reward_scale_factor,
         train_step_counter=train_step)
 
-def create_replay_buffer(collect_data_spec: NestedTensorSpec, server_address: str, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
+def create_replay_buffer(collect_data_spec: NestedTensorSpec, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
     table_name = 'uniform_table'
     table = reverb.Table(
         table_name,
@@ -185,14 +185,11 @@ def train_loop(learner: Learner,
                train_steps_per_episode: int,
                initial_steps: int = 1000,
                initial_learning_iterations: int = 1,
-               episode_learning_iterations: int = 10,
-               epsilon: float = None,
                eval_interval: int = 5,
                is_initial: bool = True,
                outdir: str = "out",
                ):
     policy_dir = os.path.join(outdir, "policy")
-    policy_saver.save(policy_dir)
     for _ in range(initial_steps):
         if is_initial:
             random_actor.run()
@@ -211,6 +208,8 @@ def train_loop(learner: Learner,
 
             learner.run(iterations=1)
         checkpointer.save(train_step)
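+        # Export the current policy alongside the training checkpoint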
+        policy_saver.save(policy_dir)
 
         if i % eval_interval == 0:
             print("######## Evaluation episode {} ########".format(i))