diff --git a/.vscode/launch.json b/.vscode/launch.json
index 962d6dc5c723e2748a4f2ff022bbdca79900053e..e903e22c3c19e7af805bb50d64f3f2c4beec117a 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -10,7 +10,7 @@
             "request": "launch",
             "program": "train/start_training.py",
             "console": "integratedTerminal",
-            "justMyCode": false
+            "justMyCode": true
         }
     ]
 }
\ No newline at end of file
diff --git a/envs/BaseEnvironment.py b/envs/BaseEnvironment.py
index 6e0fac122c38b28a49a0583355730f948ff78ded..d0b80d5bc5d988ac8c72894f097e76caa3343bdb 100644
--- a/envs/BaseEnvironment.py
+++ b/envs/BaseEnvironment.py
@@ -2,6 +2,8 @@ import numpy as np
 from airsim import MultirotorClient
 from envs import BaseEnvironment
 import abc
+import math
+from msgpackrpc.error import TimeoutError
 
 from numpy.typing import NDArray
 
@@ -11,9 +13,21 @@ from tf_agents.typing.types import TimeStep, NestedArray, Text
 from tf_agents.specs.array_spec import BoundedArraySpec, ArraySpec
 
 class BaseEnvironment(PyEnvironment):
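+    # Action components arrive normalized in [0, 1]; the bounds below define the physical ranges they are mapped onto.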
+    min_throttle = 0.4
+    max_throttle = 1.0
+
+    max_pitch = math.pi / 4
+    min_pitch = -max_pitch
+
+    max_roll = math.pi / 4
+    min_roll = -max_roll
+
+    max_yaw = math.pi / 2
+    min_yaw = -max_yaw
+
     def __init__(self, ip: str = None, client: MultirotorClient = None, handle_auto_reset: bool = False):
+        self._ip = ip
         if client is None:
-            self._client = MultirotorClient(ip=ip)
+            self._create_client()
         else:
             self._client = client
 
@@ -36,12 +50,16 @@ class BaseEnvironment(PyEnvironment):
         if self._must_restart():
             return self.reset()
 
-        vz = self._convert_z_velocity_for_airsim(action[0].item())
-        vx = action[1].item()
-        vy = action[2].item()
+        throttle = self._convert_throttle_for_airsim(action[0].item())
+        pitch = self._convert_pitch_for_airsim(action[1].item())
+        roll = self._convert_roll_for_airsim(action[2].item())
+        yaw = self._convert_yaw_for_airsim(action[3].item())
+        # vx = self._convert_xy_velocity_for_airsim(action[1].item())
+        # vy = self._convert_xy_velocity_for_airsim(action[2].item())
 
         # self._client.cancelLastTask() # Maybe needed when using async commands
-        self._client.moveByVelocityAsync(vx, vy, vz, 0.0001).join() # execute command for 0.0001 seconds
+        # self._client.moveByVelocityBodyFrameAsync(vx, vy, vz, 0.01).join()
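+        # The third argument of moveByRollPitchYawrateThrottleAsync is a yaw *rate*, not a yaw angle; the command is held for 0.01 s.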
+        self._client.moveByRollPitchYawrateThrottleAsync(roll, pitch, yaw, throttle, 0.01).join()
 
         obs = self._get_observation()
         reward = self._calculate_reward(obs)
@@ -54,23 +72,90 @@ class BaseEnvironment(PyEnvironment):
             
         return ts.transition(obs, reward=reward)
     
-    def _convert_z_velocity_for_airsim(self, z_velocity: np.float64) -> np.float64:
-        """
-        returns converted velocity:
-        - 1: max down
-        - 0: stop/hover
-        - -1: max up
-        """
-        return z_velocity * (-2) + 1
+    def _create_client(self):
+        self._client = MultirotorClient(ip=self._ip, timeout_value=10)
     
-    def _convert_z_velocity_for_model(self, z_velocity: np.float64) -> np.float64:
-        """
-        returns converted velocity:
-        - 0: max down
-        - 0.5: stop/hover
-        - 1: max up
-        """
-        return (z_velocity - 1) / (-2)
+    # def _call_until_no_timeout(self, f, *args, **kwargs):
+    #     succeeded = False
+    #     has_timeout = False
+    #     return_val = None
+    #     while not succeeded:
+    #         try:
+    #             if not has_timeout:
+    #                 return_val = f(*args, **kwargs)
+    #             else:
+    #                 self._create_client()
+    #                 self._reset()
+    #                 return_val = f(*args, **kwargs)
+    #             succeeded = True
+    #         except TimeoutError:
+    #             print("Timeout error, check simulator")
+    #             has_timeout = True
+
+    #     return return_val
+    
+    @staticmethod
+    def _convert_property_for_airsim(min_value: float, max_value: float, value: np.float32) -> float:
+        # Linearly map a normalized action value in [0, 1] onto [min_value, max_value],
+        # e.g. a throttle action of 0.0 becomes 0.4 and 1.0 stays 1.0.
+        return (max_value - min_value) * value + min_value
+    
+    def _convert_throttle_for_airsim(self, throttle: np.float32) -> float:
+        return BaseEnvironment._convert_property_for_airsim(self.min_throttle, self.max_throttle, throttle)
+    
+    def _convert_pitch_for_airsim(self, pitch: np.float32) -> float:
+        return BaseEnvironment._convert_property_for_airsim(self.min_pitch, self.max_pitch, pitch)
+    
+    def _convert_roll_for_airsim(self, roll: np.float32) -> float:
+        return BaseEnvironment._convert_property_for_airsim(self.min_roll, self.max_roll, roll)
+    
+    def _convert_yaw_for_airsim(self, yaw: np.float32) -> float:
+        return BaseEnvironment._convert_property_for_airsim(self.min_yaw, self.max_yaw, yaw)
+    
+    # def _convert_z_velocity_for_airsim(self, z_velocity: np.float64) -> np.float64:
+    #     """
+    #     returns converted velocity:
+    #     - 1: max down
+    #     - 0: stop/hover
+    #     - -1: max up
+    #     """
+    #     return z_velocity * (-2) + 1
+    
+    # def _convert_z_velocity_for_model(self, z_velocity: np.float64) -> np.float64:
+    #     """
+    #     returns converted velocity:
+    #     - 0: max down
+    #     - 0.5: stop/hover
+    #     - 1: max up
+    #     """
+    #     return (z_velocity - 1) / (-2)
+    
+    # def _convert_xy_velocity_for_airsim(self, velocity: np.float64) -> np.float64:
+    #     """
+    #     input
+    #     - 0: max negative speed
+    #     - 0.5: no speed
+    #     - 1: max speed
+
+    #     returns
+    #     - 1: max speed
+    #     - 0: no speed
+    #     - -1: max negative speed
+    #     """
+    #     return velocity * 2 - 1
+    
+    # def _convert_xy_velocity_for_model(self, velocity: np.float64) -> np.float64:
+    #     """
+    #     input
+    #     - 1: max speed
+    #     - 0: no speed
+    #     - -1: max negative speed
+        
+    #     returns
+
+    #     - 0: max negative speed
+    #     - 0.5: no speed
+    #     - 1: max speed
+    #     """
+    #     return (velocity - 1) / 2
     
     def render(self, mode: Text = 'rgb_array') -> NestedArray | None:
         raise NotImplementedError()
diff --git a/envs/StartEnvironment.py b/envs/StartEnvironment.py
index cfa49fa582ee853b6b27ce90bb1755c22a8f8ec7..7d1350fd159bb7f71f13441c1c74038a357f8222 100644
--- a/envs/StartEnvironment.py
+++ b/envs/StartEnvironment.py
@@ -2,6 +2,7 @@ from airsim import MultirotorClient
 from numpy.typing import NDArray
 from envs import BaseEnvironment, StartEnvironment
 import numpy as np
+from tf_agents.trajectories import time_step as ts
 
 from tf_agents.specs.array_spec import BoundedArraySpec, ArraySpec
 from tf_agents.typing.types import TimeStep, NestedArray
@@ -12,13 +13,15 @@ class StartEnvironment(BaseEnvironment):
                  client: MultirotorClient = None, 
                  handle_auto_reset: bool = False, 
                  desired_height: np.float64=140.0, 
-                 check_out_of_bounds: bool = False):
+                 check_out_of_bounds: bool = False,
+                 dynamic_start: bool = False):
         self._desired_height = desired_height
         self._check_out_of_bounds = check_out_of_bounds
         self._is_out_of_bounds = False
+        self._dynamic_start = dynamic_start
 
         self._action_spec = BoundedArraySpec(shape=(1,), dtype=np.float32, minimum=0, maximum=1, name='action')
-        self._observation_spec = ArraySpec(shape=(1,), dtype=np.float64, name='observation')
+        self._observation_spec = ArraySpec(shape=(2,), dtype=np.float64, name='observation')
 
         super(StartEnvironment, self).__init__(ip, client, handle_auto_reset)
     
@@ -30,11 +33,20 @@ class StartEnvironment(BaseEnvironment):
     
     def _reset(self) -> TimeStep:
         self._is_out_of_bounds = False
-        return super()._reset()
+        time_step = super()._reset()
+        if self._dynamic_start:
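+            # Randomize the starting altitude within roughly [122.2, 182.2) m so episodes begin from varied heights.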
+            altitude = np.random.random_sample() * 60 + 122.2
+            position = self._client.getMultirotorState().gps_location
+            self._client.moveToGPSAsync(latitude=position.latitude, longitude=position.longitude, altitude=altitude, velocity=10).join()
+            time_step = ts.restart(self._get_observation())
+        return time_step
     
     def _step(self, action: NestedArray) -> TimeStep:
-        full_action = np.append(action, [0, 0])
-        return super()._step(full_action)
+        # Pitch, roll and yaw are held neutral (0.5 maps to 0 rad); the agent only controls throttle.
+        full_action = np.append(action, [0.5, 0.5, 0.5])
+        time_step = super()._step(full_action)
+        print("Current height delta: {}".format(time_step.observation[0]))
+
+        return time_step
     
     def _is_terminated(self, obs: NDArray) -> bool:
         return False
@@ -46,9 +58,26 @@ class StartEnvironment(BaseEnvironment):
     def _must_restart(self) -> bool:
         return self._is_out_of_bounds
     
-    def _get_observation(self) -> NDArray:        
-        geo_point = self._client.getMultirotorState().gps_location
-        return np.array([geo_point.altitude - self._desired_height])
+    def _get_observation(self) -> NDArray:
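+        # Observation vector: [altitude delta from the desired height, vertical velocity (z component)].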
+        state = self._client.getMultirotorState()
+        geo_point = state.gps_location
+        height_delta = geo_point.altitude - self._desired_height
+        normalized_height_delta = self._normalize_height_delta(height_delta)
+        # acceleration = self._client.getImuData().linear_acceleration
+        velocity = state.kinematics_estimated.linear_velocity
+        return np.array([normalized_height_delta, velocity.z_val], dtype=np.float64)
     
     def _calculate_reward(self, obs: NDArray) -> np.float32:
-        return np.float32(-(obs[0] ** 2))
+        # if abs(obs[0]) < 0.2:
+        #     return np.float32(1)
+        # else:
+        #     return np.float32(0)
+
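+        # Reward is the negative absolute altitude error: 0 at the target, increasingly negative further away.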
+        abs_relative_height = abs(obs[0])
+        reward = -abs_relative_height
+        # if abs_relative_height < 0.5:
+        #     reward += 1
+        return np.float32(reward)
+    
+    def _normalize_height_delta(self, height: float) -> float:
+        # Currently an identity mapping; a real normalization can be plugged in here later.
+        return height
diff --git a/tests/starting_test.py b/tests/starting_test.py
index 012a7e33cbdab03828bdeb8c1be0ce531606df0e..360a200383fa95fc08ccff08be3405538ebcc455 100644
--- a/tests/starting_test.py
+++ b/tests/starting_test.py
@@ -13,6 +13,7 @@ from tf_agents.networks.actor_distribution_network import ActorDistributionNetwo
 from tf_agents.agents.sac.tanh_normal_projection_network import TanhNormalProjectionNetwork
 from tf_agents.train.utils import train_utils
 from tf_agents.policies.py_tf_eager_policy import PyTFEagerPolicy
+from tf_agents.policies.greedy_policy import GreedyPolicy
 
 SIM_IP = "192.168.8.195"
 actor_fc_layer_params = (10,)
@@ -29,7 +30,7 @@ tempdir="out/"
 
 client = airsim.MultirotorClient(SIM_IP)
 
-eval_policy = tf.saved_model.load("out/policies/greedy_policy")
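+# Load the SavedModel exported by PolicySaver during training (train/utils.py saves it to out/policy).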
+eval_policy = tf.saved_model.load("out/policy")
 
 def _step_loop(env, steps: int, initial_time_step):
     time_step = initial_time_step
@@ -55,8 +56,9 @@ def _assert_last_time_steps(time_steps: []):
     last_observations = np.array(last_observations).flatten()
 
     for ts in last_steps:
-        height_diffrence = ts.observation.numpy().flatten()
+        height_diffrence = ts.observation.numpy().flatten()[0]
         assert(abs(height_diffrence) < 1.0)
+
     assert(abs(last_observations.max() - last_observations.min()) < 1.0)
 
 def test130():
@@ -93,3 +95,9 @@ def test_landing():
 
     time_steps =_step_loop(env, 2000, time_step)
     _assert_last_time_steps(time_steps)
+
+if __name__ == "__main__":
+    test130()
+    test150()
+    test180()
+    test150from180()
+    test_landing()
diff --git a/train/start_training.py b/train/start_training.py
index dcfdc6588dfe0118c505111fe9c3c8af8e6f969a..546fa8633d0076ed409ac6e2079501ec94e57096 100644
--- a/train/start_training.py
+++ b/train/start_training.py
@@ -1,6 +1,7 @@
-from envs import StartEnvironment
+from envs import StartEnvironment, TestEnvironment
 import os
 import train.utils as utils
+import numpy as np
 
 from tf_agents.environments.tf_py_environment import TFPyEnvironment
 from tf_agents.train.utils import train_utils
@@ -11,55 +12,72 @@ SIM_IP = "192.168.8.195"
 # Use "num_iterations = 1e6" for better results (2 hrs)
 # 1e5 is just so this doesn't take too long (1 hr)
 num_episodes = 10000
-num_steps_per_episode = 1000
+num_steps_per_episode = 2000
 
-initial_collect_steps = 1000 # @param {type:"integer"}
+initial_collect_steps = 2000 # @param {type:"integer"}
 collect_steps_per_iteration = 1 # @param {type:"integer"}
-replay_buffer_capacity = 10000 # @param {type:"integer"}
+replay_buffer_capacity = 100000 # @param {type:"integer"}
+replay_buffer_server="localhost:40000"
 
-batch_size = 256 # @param {type:"integer"}
+batch_size = 1024 # @param {type:"integer"}
 
 critic_learning_rate = 3e-4 # @param {type:"number"}
 actor_learning_rate = 3e-4 # @param {type:"number"}
 alpha_learning_rate = 3e-4 # @param {type:"number"}
 target_update_tau = 0.005 # @param {type:"number"}
 target_update_period = 1 # @param {type:"number"}
-gamma = 0.99 # @param {type:"number"}
-reward_scale_factor = 0.8 # @param {type:"number"}
+gamma = 0.9 # @param {type:"number"}
+reward_scale_factor = 1.0 # @param {type:"number"}
 
-actor_fc_layer_params = (10,)
-critic_joint_fc_layer_params = (10,)
+actor_fc_layer_params = (16,16)
+critic_joint_fc_layer_params = actor_fc_layer_params
 
 log_interval = 5000 # @param {type:"integer"}
 
 num_eval_episodes = 20 # @param {type:"integer"}
-eval_interval = 10000 # @param {type:"integer"}
+eval_interval = 5000 # @param {type:"integer"}
 
 policy_save_interval = 1000 # @param {type:"integer"}
 
 tempdir = "out/"
+is_initial = utils.is_initial(tempdir)
 
 utils.set_gpu()
 
-py_env = StartEnvironment(ip=SIM_IP)
-env = TFPyEnvironment(py_env)
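+# Train towards 160 m with randomized starting altitudes; evaluate against a fixed 180 m target.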
+train_env = StartEnvironment(ip=SIM_IP, desired_height=160, dynamic_start=True)
+eval_env = StartEnvironment(ip=SIM_IP, desired_height=180, dynamic_start=False)
+train_tf_env = TFPyEnvironment(train_env)
+# py_env = TestEnvironment(ip=SIM_IP)
+# env = TFPyEnvironment(py_env)
+# env.reset()
 
-observation_spec = env.observation_spec()
-action_spec = env.action_spec()
-time_step_spec = env.time_step_spec()
+# action = np.array([0,0.5,0.5,0.5], dtype=np.float32)
+
+# init_action = np.array([0.5, 0.5, 0.5, 0])
+# for i in range(35):
+#     py_env.step(init_action)
+
+# while True:
+#     t = py_env.step(action)
+#     m = 0
+
+observation_spec = train_tf_env.observation_spec()
+action_spec = train_tf_env.action_spec()
+time_step_spec = train_tf_env.time_step_spec()
 
 train_step = train_utils.create_train_step()
 
-tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params)
+tf_agent = utils.create_agent(action_spec, observation_spec, time_step_spec, train_step, actor_fc_layer_params, critic_joint_fc_layer_params, alpha_learning_rate=alpha_learning_rate, actor_learning_rate=actor_learning_rate, critic_learning_rate=critic_learning_rate, gamma=gamma, reward_scale_factor=reward_scale_factor)
 
-reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_capacity)
+reverb_replay, rb_observer = utils.create_replay_buffer(tf_agent.collect_data_spec, replay_buffer_server, replay_buffer_capacity)
 
-eval_policy = utils.create_policy(tf_agent.policy)
 collect_policy = utils.create_policy(tf_agent.collect_policy)
+eval_policy = utils.create_policy(tf_agent.policy)
 random_policy = RandomPyPolicy(time_step_spec, action_spec)
 
-initial_collect_actor, collect_actor, eval_actor = utils.create_actors(
-    py_env,
+random_actor, collect_actor, eval_actor = utils.create_actors(
+    train_env,
+    eval_env,
     random_policy,
     collect_policy,
     eval_policy,
@@ -67,20 +85,29 @@ initial_collect_actor, collect_actor, eval_actor = utils.create_actors(
     rb_observer,
 )
 
-initial_collect_actor.run()
+checkpoint_dir = os.path.join(tempdir, 'checkpoint')
+train_checkpointer, policy_saver = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
+# reverb_replay = train_checkpointer.manager.checkpoint.replay_buffer  # would have to be restored manually; the checkpointer does not restore it automatically
 
 agent_learner = utils.create_learner(tf_agent, train_step, tempdir, num_steps_per_episode, reverb_replay, batch_size)
 
-checkpoint_dir = os.path.join(tempdir, 'checkpoint')
-train_checkpointer = utils.create_checkpoint(tempdir, tf_agent, reverb_replay, train_step)
+# eval_env.reset()
+# eval_actor.run()
+# env.reset()
+# eval_actor.run()
 
 utils.train_loop(
     agent_learner,
     collect_actor,
     eval_actor,
-    env,
+    random_actor,
+    train_env,
+    eval_env,
     train_checkpointer,
+    policy_saver,
     train_step,
     num_episodes,
     num_steps_per_episode,
+    epsilon=0.2,
+    initial_steps=initial_collect_steps,
+    is_initial=is_initial
 )
diff --git a/train/utils.py b/train/utils.py
index f43da3f6925e82c3ba1834f26869f06399bd2406..33fc537f0ba1220c604c9a46eefa0b93f95f82c4 100644
--- a/train/utils.py
+++ b/train/utils.py
@@ -2,6 +2,7 @@ from typing import Any, Tuple
 import tensorflow as tf
 import reverb
 import os
+import numpy as np
 
 from tf_agents.typing.types import Float, Int, NestedTensorSpec
 from tf_agents.agents.ddpg.critic_network import CriticNetwork
@@ -20,9 +21,8 @@ from tf_agents.metrics.py_metrics import EnvironmentSteps
 from tf_agents.train import actor
 from tf_agents.train import learner
 from tf_agents.train.learner import Learner
-from tf_agents.train.triggers import PolicySavedModelTrigger
 from tf_agents.utils.common import Checkpointer
-from tf_agents.environments.tf_py_environment import TFPyEnvironment
+from tf_agents.policies import PolicySaver
 
 
 def set_gpu():
@@ -74,17 +74,16 @@ def create_agent(action_spec: Any,
         reward_scale_factor=reward_scale_factor,
         train_step_counter=train_step)
 
-def create_replay_buffer(collect_data_spec: NestedTensorSpec, capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
+def create_replay_buffer(collect_data_spec: NestedTensorSpec, server_address: str, replay_buffer_capacity: int) -> Tuple[ReverbReplayBuffer, ReverbAddTrajectoryObserver]:
     table_name = 'uniform_table'
     table = reverb.Table(
         table_name,
-        max_size=capacity,
+        max_size=replay_buffer_capacity,
         sampler=reverb.selectors.Uniform(),
         remover=reverb.selectors.Fifo(),
         rate_limiter=reverb.rate_limiters.MinSize(1))
 
     reverb_server = reverb.Server([table])
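+    # Note: the buffer still runs this in-process reverb.Server; the server_address argument is not used yet.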
-
     replay_buffer = ReverbReplayBuffer(
         collect_data_spec,
         sequence_length=2,
@@ -102,27 +101,30 @@ def create_replay_buffer(collect_data_spec: NestedTensorSpec, capacity: int) ->
 def create_policy(policy: TFPolicy) -> PyTFEagerPolicy:
     return PyTFEagerPolicy(policy, use_tf_function=True)
 
-def create_actors(py_env: PyEnvironment, 
+# def create_eval_policy(policy: TFPolicy) -> PyTFEagerPolicy:
+#     return PyTFEagerPolicy(GreedyPolicy(policy), use_tf_function=True)
+
+def create_actors(train_env: PyEnvironment, 
+                  eval_env: PyEnvironment, 
                   random_policy: PyPolicy, 
                   collect_policy: PyPolicy, 
                   eval_policy: PyPolicy,
                   train_step: tf.Variable,
                   rb_observer: ReverbAddTrajectoryObserver,
-                  initial_collect_steps: int = 1000,
                   eval_episodes_per_run: int = 20,
                   eval_steps_per_run: int = 2000,
                   tempdir: str = "out/"
                   ) -> Tuple[Actor, Actor, Actor]:
-    initial_collect_actor = Actor(
-        py_env,
+    random_actor = Actor(
+        train_env,
         random_policy,
         train_step,
-        steps_per_run=initial_collect_steps,
+        steps_per_run=1,
         observers=[rb_observer])
 
     env_step_metric = EnvironmentSteps()
     collect_actor = Actor(
-        py_env,
+        train_env,
         collect_policy,
         train_step,
         steps_per_run=1,
@@ -131,7 +133,7 @@ def create_actors(py_env: PyEnvironment,
         observers=[rb_observer, env_step_metric])
 
     eval_actor = Actor(
-        py_env,
+        eval_env,
         eval_policy,
         train_step,
         episodes_per_run=eval_episodes_per_run,
@@ -139,7 +141,7 @@ def create_actors(py_env: PyEnvironment,
         metrics=actor.eval_metrics(eval_episodes_per_run),
     )
 
-    return initial_collect_actor, collect_actor, eval_actor
+    return random_actor, collect_actor, eval_actor
 
 def create_learner(agent: TFAgent, train_step: tf.Variable, tempdir: str, steps_per_episode: int, replay_buffer: ReverbReplayBuffer, batch_size: int) -> Learner:
     dataset = replay_buffer.as_dataset(
@@ -147,20 +149,12 @@ def create_learner(agent: TFAgent, train_step: tf.Variable, tempdir: str, steps_
     experience_dataset_fn = lambda: dataset
 
     saved_model_dir = os.path.join(tempdir, learner.POLICY_SAVED_MODEL_DIR)
-    learning_triggers = [
-        PolicySavedModelTrigger(
-            saved_model_dir,
-            agent,
-            train_step,
-            interval=steps_per_episode),
-    ]
 
     return Learner(
     tempdir,
     train_step,
     agent,
-    experience_dataset_fn,
-    triggers=learning_triggers)
+    experience_dataset_fn)
 
-def create_checkpoint(tempdir: str, agent: TFAgent, replay_buffer: ReverbReplayBuffer, train_step: tf.Variable) -> Checkpointer:
+def create_checkpoint(tempdir: str, agent: TFAgent, replay_buffer: ReverbReplayBuffer, train_step: tf.Variable) -> Tuple[Checkpointer, PolicySaver]:
     checkpoint_dir = os.path.join(tempdir, 'checkpoint')
@@ -174,30 +168,54 @@ def create_checkpoint(tempdir: str, agent: TFAgent, replay_buffer: ReverbReplayB
     )
     checkpointer.initialize_or_restore()
 
-    return checkpointer
+    policy_saver = PolicySaver(agent.policy)
+
+    return checkpointer, policy_saver
 
 def train_loop(learner: Learner, 
                collect_actor: Actor, 
                eval_actor: Actor,
-               env: TFPyEnvironment, 
-               checkpointer: Checkpointer, 
+               random_actor: Actor,
+               train_env: PyEnvironment, 
+               eval_env: PyEnvironment, 
+               checkpointer: Checkpointer,
+               policy_saver: PolicySaver,
                train_step: tf.Variable, 
                train_episodes: int, 
                train_steps_per_episode: int,
+               initial_steps: int = 1000,
                initial_learning_iterations: int = 1,
                episode_learning_iterations: int = 10,
+               epsilon: float = None,
+               eval_interval: int = 5,
+               is_initial: bool = True,
+               outdir: str = "out",
                ):
+    policy_dir = os.path.join(outdir, "policy")
+    policy_saver.save(policy_dir)
+    # Seed the replay buffer: random actions on a fresh run, the current collect policy when resuming from a checkpoint.
+    initial_actor = random_actor if is_initial else collect_actor
+    for _ in range(initial_steps):
+        initial_actor.run()
+
     learner.run(iterations=initial_learning_iterations)
     for i in range(train_episodes):
-        env.reset()
+        train_env.reset()
 
-        for _ in range(train_steps_per_episode):
+        for _ in range(train_steps_per_episode):
+            # Optional epsilon-greedy exploration, currently disabled:
+            # if epsilon is not None and np.random.random() < epsilon:
+            #     random_actor.run()
+            # else:
             collect_actor.run()
 
-
-        learner.run(iterations=episode_learning_iterations)
+            learner.run(iterations=1)
         checkpointer.save(train_step)
+        # Re-export the SavedModel so evaluation scripts load the latest policy.
+        policy_saver.save(policy_dir)
 
-        print("######## Evaluation episode {} ########".format(i))
-        env.reset()
-        eval_actor.run()
+        if i % eval_interval == 0:
+            print("######## Evaluation episode {} ########".format(i))
+            eval_env.reset()
+            eval_actor.run()
+
+def is_initial(tempdir: str) -> bool:
+    return not os.path.exists(tempdir)