From bab76978bc165b3782ea84f43becc23f96238d30 Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Wed, 27 Feb 2019 12:49:40 -0800 Subject: [PATCH 01/37] fixes to catch changes in gym --- baselines/common/tests/util.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/baselines/common/tests/util.py b/baselines/common/tests/util.py index 51b9d0f..ce44d50 100644 --- a/baselines/common/tests/util.py +++ b/baselines/common/tests/util.py @@ -1,6 +1,5 @@ import tensorflow as tf import numpy as np -from gym.spaces import np_random from baselines.common.vec_env.dummy_vec_env import DummyVecEnv N_TRIALS = 10000 @@ -8,8 +7,6 @@ def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): np.random.seed(0) - np_random.seed(0) - env = DummyVecEnv([env_fn]) From 2dcf4002e0c743e89460063ad3d9e478ae7c54cb Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Wed, 27 Feb 2019 14:22:24 -0800 Subject: [PATCH 02/37] raised the tolerance on the test_microbatches test --- baselines/ppo2/test_microbatches.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/baselines/ppo2/test_microbatches.py b/baselines/ppo2/test_microbatches.py index 291c2d2..829e0a9 100644 --- a/baselines/ppo2/test_microbatches.py +++ b/baselines/ppo2/test_microbatches.py @@ -25,10 +25,11 @@ def env_fn(): env_test = DummyVecEnv([env_fn]) sess_test = make_session(make_default=True, graph=tf.Graph()) learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2)) + # learn_fn(env=env_test) vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()} for v in vars_ref: - np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3) + np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3) if __name__ == '__main__': test_microbatches() From d3535a2d98cc388c452583cf744e3a89f9701181 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Wed, 27 Feb 2019 15:35:31 -0800 Subject: [PATCH 03/37] release Internal changes (#800) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * joshim5 changes (width and height to WarpFrame wrapper) * match network output with action distribution via a linear layer only if necessary (#167) * support color vs. grayscale option in WarpFrame wrapper (#166) * support color vs. 
grayscale option in WarpFrame wrapper * Support color in other wrappers * Updated per Peters suggestions * fixing test failures * ppo2 with microbatches (#168) * pass microbatch_size to the model during construction * microbatch fixes and test (#169) * microbatch fixes and test * tiny cleanup * added assertions to the test * vpg-related fix * Peterz joshim5 subclass ppo2 model (#170) * microbatch fixes and test * tiny cleanup * added assertions to the test * vpg-related fix * subclassing the model to make microbatched version of model WIP * made microbatched model a subclass of ppo2 Model * flake8 complaint * mpi-less ppo2 (resolving merge conflict) * flake8 and mpi4py imports in ppo2/model.py * more un-mpying * merge master * updates to the benchmark viewer code + autopep8 (#184) * viz docs and syntactic sugar wip * update viewer yaml to use persistent volume claims * move plot_util to baselines.common, update links * use 1Tb hard drive for results viewer * small updates to benchmark vizualizer code * autopep8 * autopep8 * any folder can be a benchmark * massage games image a little bit * fixed --preload option in app.py * remove preload from run_viewer.sh * remove pdb breakpoints * update bench-viewer.yaml * fixed bug (#185) * fixed bug it's wrong to do the else statement, because no other nodes would start. * changed the fix slightly * Refactor her phase 1 (#194) * add monitor to the rollout envs in her RUN BENCHMARKS her * Slice -> Slide in her benchmarks RUN BENCHMARKS her * run her benchmark for 200 epochs * dummy commit to RUN BENCHMARKS her * her benchmark for 500 epochs RUN BENCHMARKS her * add num_timesteps to her benchmark to be compatible with viewer RUN BENCHMARKS her * add num_timesteps to her benchmark to be compatible with viewer RUN BENCHMARKS her * add num_timesteps to her benchmark to be compatible with viewer RUN BENCHMARKS her * disable saving of policies in her benchmark RUN BENCHMARKS her * run fetch benchmarks with ppo2 and ddpg RUN BENCHMARKS Fetch * run fetch benchmarks with ppo2 and ddpg RUN BENCHMARKS Fetch * launcher refactor wip * wip * her works on FetchReach * her runner refactor RUN BENCHMARKS Fetch1M * unit test for her * fixing warnings in mpi_average in her, skip test_fetchreach if mujoco is not present * pickle-based serialization in her * remove extra import from subproc_vec_env.py * investigating differences in rollout.py * try with old rollout code RUN BENCHMARKS her * temporarily use DummyVecEnv in cmd_util.py RUN BENCHMARKS her * dummy commit to RUN BENCHMARKS her * set info_values in rollout worker in her RUN BENCHMARKS her * bug in rollout_new.py RUN BENCHMARKS her * fixed bug in rollout_new.py RUN BENCHMARKS her * do not use last step because vecenv calls reset and returns obs after reset RUN BENCHMARKS her * updated buffer sizes RUN BENCHMARKS her * fixed loading/saving via joblib * dust off learning from demonstrations in HER, docs, refactor * add deprecation notice on her play and plot files * address comments by Matthias * 1.5 months of codegen changes (#196) * play with resnet * feed_dict version * coinrun prob and more stats * fixes to get_choices_specs & hp search * minor prob fixes * minor fixes * minor * alternative version of rl_algo stuff * pylint fixes * fix bugs, move node_filters to soup * changed how get_algo works * change how get_algo works, probably broke all tests * continue previous refactor * get eval_agent running again * fixing tests * fix tests * fix more tests * clean up cma stuff * fix experiment * minor changes to 
eval_agent to make ppo_metal use gpu * make dict space work * modify mac makefile to use conda * recurrent layers * play with bn and resnets * minor hp changes * minor * got rid of use_fb argument and jtft (joint-train-fine-tune) functionality built test phase directly into AlgoProb * make new rl algos generateable * pylint; start fixing tests * fixing tests * more test fixes * pylint * fix search * work on search * hack around infinite loop caused by scan * algo search fixes * misc changes for search expt * enable annealing, overriding options of Op * pylint fixes * identity op * achieve use_last_output through masking so it automatically works in other distributions * fix tests * minor * discrete * use_last_output to be just a preference, not a hard constraint * pred delay, pruning * require nontrivial inputs * aliases for get_sm * add probname to probs * fixes * small fixes * fix tests * fix tests * fix tests * minor * test scripts * dualgru network improvements * minor * work on mysterious bugs * rcall gpu-usage command for kube * use cache dir that’s not in code folder, so that it doesn’t get removed by rcall code rsync * add power mode to gpu usage * make sure train/test actually different * remove VR for now * minor fixes * simplify soln_db * minor * big refactor of mpi eda * improve mpieda for multitask * - get rid of timelimit hack - add __del__ to cleanup SubprocVecEnv * get multitask working better * fixes * working on atari, various * annotate ops with whether they’re parametrized * minor * gym version * rand atari prob * minor * SolnDb bugfix and name change * pyspy script * switch conv layers * fix roboschool/bullet3 * nenvs assertion * fix rand atari * get rid of blanket exception catching fix soln_db bug * fix rand_atari * dynamic routing as cmdline arg * slight modifications to test_mpi_map and pyspy-all * max_tries argument for run_until_successs * dedup option in train_mle * simplify soln_db * increase atari horizon for 1 experiment * start implementing reward increment * ent multiplier * create cc dsl other misc fixes * cc ops * q_func -> qs in rl_algos_cc.py * fix PredictDistr * rl_ops_cc fixes, MakeAction op * augment algo agent to support cc stuff * work on ddpg experiments * fix blocking temporarily change logger * allow layer scaling * pylint fixes * spawn_method * isolate ddpg hacks * improve pruning * use spawn for subproc * remove use of python -c in rcall * fix pylint warning * fix static * maybe fix local backend * switch to DummyVecEnv * making some fixes via pylint * pylint fixes * fixing tests * fix tests * fix tests * write scaffolding for SSL in Codegen * logger fix * fix error * add EMA op to sl_ops * save many changes * save * add upsampler * add sl ops, enhance state machine * get ssl search working — some gross hacking * fix session/graph issue * fix importing * work on mle * - scale embeddings in gru model - better exception handling in sl_prob - use emas for test/val - use non-contrib batch_norm layer * improve logging * option to average before dumping in logger * default arguments, etc * new ddpg and identity test * concat fix * minor * move realistic ssl stuff to third-party (underscore to dash) * fixes * remove realistic_ssl_evaluation * pylint fixes * use gym master * try again * pass around args without gin * fix tests * separate line to install gym * rename failing tests that should be ignored * add data aug * ssl improvements * use fixed time limit * try to fix baselines tests * add score_floor, max_walltime, fiddle with lr decay * 
realistic_ssl * autopep8 * various ssl - enable blocking grad for simplification - kl - multiple final prediction * fix pruning * misc ssl stuff * bring back linear schedule, don’t use allgather for collecting stats (i’ve been getting nondeterministic errors from the old code) * save/load weights in SSL, big stepsize * cleanup SslProb * fix * get rid of kl coef * fix simplification, lower lr * search over hps * minor fixes * minor * static analysis * move files and rename things for improved consistency. still broken, and just saving before making nontrivial changes * various * make tests pass * move coinrun_train to codegen since it depends on codegen * fixes * pylint fixes * improve tests fix some things * improve tests * lint * fix up db_info.py, tests * mostly restore master version of envs directory, except for makefile changes * fix tests * improve printing * minor fixes * fix fixmes * pruning test * fixes * lint * write new test that makes tf graphs of random algos; fix some bugs it caught * add —delete flag to rcall upload-code command * lint * get cifar10 lazily for testing purposes * disable codegen ci tests for now * clean up rl_ops * rename spec classes * td3 with identity test * identity tests without gin files * remove gin.configurable from AlgoAgent * comments about reduction in rl_ops_cc * address @pzhokhov comments * fix tests * more linting * better tests * clean up filtering a bit * fix concat * delayed logger configuration (#208) * delayed logger configuration * fix typo * setters and getters for Logger.DEFAULT as well * do away with fancy property stuff - unable to get it to work with class level methods * grammar and spaces * spaces * use get_current function instead of reading Logger.CURRENT * autopep8 * disable mpi in subprocesses (#213) * lazy_mpi load * cleanups * more lazy mpi * don't pretend that class is a module, just use it as a class * mass-replace mpi4py imports * flake8 * fix previous lazy_mpi imports * silly recursion * try os.environ hack * better prefix test, work with mpich * restored MPI imports * removed commented import in test_with_mpi * restored codegen from master * remove lazy mpi * restored changes from rl-algs * remove extra files * address Chris' comments * use spawn for shmem vec env as well (#2) (#219) * lazy_mpi load * cleanups * more lazy mpi * don't pretend that class is a module, just use it as a class * mass-replace mpi4py imports * flake8 * fix previous lazy_mpi imports * silly recursion * try os.environ hack * better prefix test, work with mpich * restored MPI imports * removed commented import in test_with_mpi * restored codegen from master * remove lazy mpi * restored changes from rl-algs * remove extra files * port mpi fix to shmem vec env * increase the mpi test default timeout * change humanoid hyperparameters, get rid of clip_Frac annealing, as it's apparently dangerous * remove clip_frac schedule from ppo2 * more timesteps in humanoid run * whitespace + RUN BENCHMARKS * baselines: export vecenvs from folder (#221) * baselines: export vecenvs from folder * put missing function back in * add missing imports * more imports * longer mpi timeout? 
* make default logger configuration the same as call to logger.configure() (#222) * Vecenv refactor (#223) * update karl util * restore pvi flag * change rcall auto cpu behavior, move gin.configurable, add os.makedirs * vecenv refactor * aux buf index fix * add num aux obs * reset level with enter * restore high difficulty flag * bugfix * restore train_coinrun.py * tweaks * renaming * renaming * better arguments handling * more options * options cleanup * game data refactor * more options * args for train_procgen * add close handler to interactive base class * use debug build if debug=True, fix range on aux_obs * add ProcGenEnv to __init__.py, add missing imports to procgen.py * export RemoveDictWrapper and build, update train_procgen.py, move assets download into env creation and replace init_assets_and_build with just build * fix formatting issues * only call global init once * fix path in setup.py * revert part of makefile * ignore IDE files and folders * vec remove dict * export VecRemoveDictObs * remove RemoveDictWrapper * remove IDE files * move shared .h and .cpp files to common folder, update build to use those, dedupe env.cpp * fix missing header * try unified build function * remove old scripts dir * add comment on build * upload libenv with render fixes * tell qthreads to die when we unload the library * pyglet.app.run is garbage * static fixes * whoops * actually vsync is on * cleanup * cleanup * extern C for libenv interface * parse util rcall arg * high difficulty fix * game type enums * ProcGenEnv subclasses * game type cleanup * unrecognized key * unrecognized game type * parse util reorg * args management * typo fix * GinParser * arg tweaks * tweak * restore start_level/num_levels setting * fix create_procgen_env interface * build fix * procgen args in init signature * fix * build fix * fix logger usage in ppo_metal/run_retro * removed unnecessary OrderedDict requirement in subproc_vec_env * flake8 fix * allow for non-mpi tests * mpi test fixes * flake8; removed special logic for discrete spaces in dummy_vec_env * remove forked argument in front of tests - does not play nicely with subprocvecenv in spawned processes; analog of forked in ddpg/test_smoke * Everyrl initial commit & a few minor baselines changes (#226) * everyrl initial commit * add keep_buf argument to VecMonitor * logger changes: set_comm and fix to mpi_mean functionality * if filename not provided, don't create ResultsWriter * change variable syncing function to simplify its usage. now you should initialize from all mpi processes * everyrl coinrun changes * tf_distr changes, bugfix * get_one * bring back get_next to temporarily restore code * lint fixes * fix test * rename profile function * rename gaussian * fix coinrun training script * change random seeding to work with new gym version (#231) * change random seeding to work with new gym version * move seeding to seed() method * fix mnistenv * actually try some of the tests before pushing * more deterministic fixed seq * misc changes to vecenvs and run.py for benchmarks (#236) * misc changes to vecenvs and run.py for benchmarks * dont seed global gen * update more references to assert_venvs_equal * Rl19 (#232) * everyrl initial commit * add keep_buf argument to VecMonitor * logger changes: set_comm and fix to mpi_mean functionality * if filename not provided, don't create ResultsWriter * change variable syncing function to simplify its usage. 
now you should initialize from all mpi processes * everyrl coinrun changes * tf_distr changes, bugfix * get_one * bring back get_next to temporarily restore code * lint fixes * fix test * rename profile function * rename gaussian * fix coinrun training script * rl19 * remove everyrl dir which appeared in the merge for some reason * readme * fiddle with ddpg * make ddpg work * steps_total argument * gpu count * clean up hyperparams and shape math * logging + saving * configuration stuff * fixes, smoke tests * fix stats * make load_results return dicts -- easier to create the same kind of objects with some other mechanism for passing to downstream functions * benchmarks * fix tests * add dqn to tests, fix it * minor * turned annotated transformer (pytorch) into a script * more refactoring * jax stuff * cluster * minor * copy & paste alec code * sign error * add huber, rename some parameters, snapshotting off by default * remove jax stuff * minor * move maze env * minor * remove trailing spaces * remove trailing space * lint * fix test breakage due to gym update * rename function * move maze back to codegen * get recurrent ppo working * enable both lstm and gru * script to print table of benchmark results * various * fix dqn * add fixup initializer, remove lastrew * organize logging stats * fix silly bug * refactor models * fix mpi usage * check sync * minor * change vf coef, hps * clean up slicing in ppo * minor fixes * caching transformer * docstrings * xf fixes * get rid of 'B' and 'BT' arguments * minor * transformer example * remove output_kind from base class until we have a better idea how to use it * add comments, revert maze stuff * flake8 * codegen lint * fix codegen tests * responded to peter's comments * lint fixes * minor changes to baselines (#243) * minor changes to baselines * fix spaces reference * remove flake8 disable comments and fix import * okay maybe don't add spec to vec_env * Merge branch 'master' of github.com:openai/games the commit. 
* flake8 complaints in baselines/her --- .travis.yml | 2 +- baselines/bench/benchmarks.py | 2 +- baselines/bench/monitor.py | 49 ++-- baselines/common/atari_wrappers.py | 9 +- baselines/common/cmd_util.py | 13 +- baselines/common/distributions.py | 3 +- baselines/common/mpi_adam_optimizer.py | 17 +- baselines/common/mpi_util.py | 64 +++-- baselines/common/retro_wrappers.py | 32 +-- baselines/common/test_mpi_util.py | 27 +++ .../common/tests/envs/fixed_sequence_env.py | 10 +- baselines/common/tests/envs/identity_env.py | 8 +- baselines/common/tests/envs/mnist_env.py | 5 +- baselines/common/tests/test_fixed_sequence.py | 3 +- baselines/common/tests/test_mnist.py | 2 +- baselines/common/tests/test_serialization.py | 7 +- baselines/common/tests/test_with_mpi.py | 36 +++ baselines/common/tests/util.py | 27 +-- baselines/common/tf_util.py | 3 +- baselines/common/vec_env/__init__.py | 186 +-------------- baselines/common/vec_env/dummy_vec_env.py | 9 +- baselines/common/vec_env/shmem_vec_env.py | 31 +-- baselines/common/vec_env/subproc_vec_env.py | 36 +-- baselines/common/vec_env/test_vec_env.py | 47 ++-- baselines/common/vec_env/vec_env.py | 219 ++++++++++++++++++ baselines/common/vec_env/vec_frame_stack.py | 2 +- .../common/vec_env/vec_remove_dict_obs.py | 11 + baselines/common/wrappers.py | 19 ++ baselines/ddpg/test_smoke.py | 7 +- baselines/her/her.py | 2 +- baselines/logger.py | 113 ++++----- baselines/ppo1/pposgd_simple.py | 1 - baselines/ppo1/run_humanoid.py | 12 +- baselines/ppo2/defaults.py | 2 +- baselines/run.py | 42 ++-- 35 files changed, 635 insertions(+), 423 deletions(-) create mode 100644 baselines/common/test_mpi_util.py create mode 100644 baselines/common/tests/test_with_mpi.py create mode 100644 baselines/common/vec_env/vec_env.py create mode 100644 baselines/common/vec_env/vec_remove_dict_obs.py create mode 100644 baselines/common/wrappers.py diff --git a/.travis.yml b/.travis.yml index 7ca7e6a..712fc84 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,4 +11,4 @@ install: script: - flake8 . --show-source --statistics - - docker run baselines-test pytest -v --forked . + - docker run baselines-test pytest -v . 
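To illustrate the seeding convention the commits above move to ("change random seeding to work with new gym version", "move seeding to seed() method"), here is a minimal sketch of the pattern as it is applied to the test environments in this patch series: the constructor no longer takes a seed, randomness is initialized lazily on reset, and tests seed through the env thunk before wrapping it in DummyVecEnv. The ToyFixedSequenceEnv class and seeded_env_fn helper below are illustrative stand-ins written for this note, not code taken verbatim from the patch.

import numpy as np
from gym import Env
from gym.spaces import Discrete


class ToyFixedSequenceEnv(Env):
    """Illustrative env: seeding lives in seed(), not in __init__."""

    def __init__(self, n_actions=10, episode_len=100):
        self.np_random = np.random.RandomState()  # unseeded until seed() is called
        self.sequence = None                      # generated lazily on first reset()
        self.action_space = Discrete(n_actions)
        self.observation_space = Discrete(1)
        self.episode_len = episode_len
        self.time = 0

    def seed(self, seed=None):
        # new-gym style: seeding is a separate call, not a constructor argument
        self.np_random.seed(seed)

    def reset(self):
        if self.sequence is None:
            self.sequence = [self.np_random.randint(0, self.action_space.n - 1)
                             for _ in range(self.episode_len)]
        self.time = 0
        return 0

    def step(self, action):
        rew = 1.0 if action == self.sequence[self.time] else 0.0
        self.time += 1
        done = self.time >= self.episode_len
        return 0, rew, done, {}


def seeded_env_fn(seed=0):
    # Tests seed through the thunk instead of a constructor argument,
    # so DummyVecEnv([seeded_env_fn]) yields a deterministic environment.
    env = ToyFixedSequenceEnv()
    env.seed(seed)
    return env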
diff --git a/baselines/bench/benchmarks.py b/baselines/bench/benchmarks.py index c6fa596..c129008 100644 --- a/baselines/bench/benchmarks.py +++ b/baselines/bench/benchmarks.py @@ -20,7 +20,7 @@ def register_benchmark(benchmark): if 'tasks' in benchmark: for t in benchmark['tasks']: if 'desc' not in t: - t['desc'] = remove_version_re.sub('', t['env_id']) + t['desc'] = remove_version_re.sub('', t.get('env_id', t.get('id'))) _BENCHMARKS.append(benchmark) diff --git a/baselines/bench/monitor.py b/baselines/bench/monitor.py index bb34a29..6fd9806 100644 --- a/baselines/bench/monitor.py +++ b/baselines/bench/monitor.py @@ -16,11 +16,13 @@ class Monitor(Wrapper): def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): Wrapper.__init__(self, env=env) self.tstart = time.time() - self.results_writer = ResultsWriter( - filename, - header={"t_start": time.time(), 'env_id' : env.spec and env.spec.id}, - extra_keys=reset_keywords + info_keywords - ) + if filename: + self.results_writer = ResultsWriter(filename, + header={"t_start": time.time(), 'env_id' : env.spec and env.spec.id}, + extra_keys=reset_keywords + info_keywords + ) + else: + self.results_writer = None self.reset_keywords = reset_keywords self.info_keywords = info_keywords self.allow_early_resets = allow_early_resets @@ -80,8 +82,9 @@ def update(self, ob, rew, done, info): self.episode_lengths.append(eplen) self.episode_times.append(time.time() - self.tstart) epinfo.update(self.current_reset_info) - self.results_writer.write_row(epinfo) - + if self.results_writer: + self.results_writer.write_row(epinfo) + assert isinstance(info, dict) if isinstance(info, dict): info['episode'] = epinfo @@ -108,24 +111,21 @@ class LoadMonitorResultsError(Exception): class ResultsWriter(object): - def __init__(self, filename=None, header='', extra_keys=()): + def __init__(self, filename, header='', extra_keys=()): self.extra_keys = extra_keys - if filename is None: - self.f = None - self.logger = None - else: - if not filename.endswith(Monitor.EXT): - if osp.isdir(filename): - filename = osp.join(filename, Monitor.EXT) - else: - filename = filename + "." + Monitor.EXT - self.f = open(filename, "wt") - if isinstance(header, dict): - header = '# {} \n'.format(json.dumps(header)) - self.f.write(header) - self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+tuple(extra_keys)) - self.logger.writeheader() - self.f.flush() + assert filename is not None + if not filename.endswith(Monitor.EXT): + if osp.isdir(filename): + filename = osp.join(filename, Monitor.EXT) + else: + filename = filename + "." 
+ Monitor.EXT + self.f = open(filename, "wt") + if isinstance(header, dict): + header = '# {} \n'.format(json.dumps(header)) + self.f.write(header) + self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+tuple(extra_keys)) + self.logger.writeheader() + self.f.flush() def write_row(self, epinfo): if self.logger: @@ -133,7 +133,6 @@ def write_row(self, epinfo): self.f.flush() - def get_monitor_files(dir): return glob(osp.join(dir, "*" + Monitor.EXT)) diff --git a/baselines/common/atari_wrappers.py b/baselines/common/atari_wrappers.py index 12b141e..ecaa94b 100644 --- a/baselines/common/atari_wrappers.py +++ b/baselines/common/atari_wrappers.py @@ -6,6 +6,8 @@ from gym import spaces import cv2 cv2.ocl.setUseOpenCL(False) +from .wrappers import TimeLimit + class NoopResetEnv(gym.Wrapper): def __init__(self, env, noop_max=30): @@ -221,14 +223,13 @@ def __len__(self): def __getitem__(self, i): return self._force()[i] -def make_atari(env_id, timelimit=True): - # XXX(john): remove timelimit argument after gym is upgraded to allow double wrapping +def make_atari(env_id, max_episode_steps=None): env = gym.make(env_id) - if not timelimit: - env = env.env assert 'NoFrameskip' in env.spec.id env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) + if max_episode_steps is not None: + env = TimeLimit(env, max_episode_steps=max_episode_steps) return env def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index c80bcd2..66c8db8 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -30,16 +30,19 @@ def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs = wrapper_kwargs or {} mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = seed + 10000 * mpi_rank if seed is not None else None + logger_dir = logger.get_dir() def make_thunk(rank): return lambda: make_env( env_id=env_id, env_type=env_type, - subrank = rank, + mpi_rank=mpi_rank, + subrank=rank, seed=seed, reward_scale=reward_scale, gamestate=gamestate, flatten_dict_observations=flatten_dict_observations, - wrapper_kwargs=wrapper_kwargs + wrapper_kwargs=wrapper_kwargs, + logger_dir=logger_dir ) set_global_seeds(seed) @@ -49,8 +52,7 @@ def make_thunk(rank): return DummyVecEnv([make_thunk(start_index)]) -def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None): - mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 +def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None, logger_dir=None): wrapper_kwargs = wrapper_kwargs or {} if env_type == 'atari': env = make_atari(env_id) @@ -67,7 +69,7 @@ def make_env(env_id, env_type, subrank=0, seed=None, reward_scale=1.0, gamestate env.seed(seed + subrank if seed is not None else None) env = Monitor(env, - logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(subrank)), + logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' 
+ str(subrank)), allow_early_resets=True) if env_type == 'atari': @@ -133,6 +135,7 @@ def common_arg_parser(): """ parser = arg_parser() parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') + parser.add_argument('--env_type', help='type of environment, used when the environment type cannot be automatically determined', type=str) parser.add_argument('--seed', help='RNG seed', type=int, default=None) parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2') parser.add_argument('--num_timesteps', type=float, default=1e6), diff --git a/baselines/common/distributions.py b/baselines/common/distributions.py index 8966ee3..0b5fc76 100644 --- a/baselines/common/distributions.py +++ b/baselines/common/distributions.py @@ -206,7 +206,8 @@ def fromflat(cls, flat): class MultiCategoricalPd(Pd): def __init__(self, nvec, flat): self.flat = flat - self.categoricals = list(map(CategoricalPd, tf.split(flat, nvec, axis=-1))) + self.categoricals = list(map(CategoricalPd, + tf.split(flat, np.array(nvec, dtype=np.int32), axis=-1))) def flatparam(self): return self.flat def mode(self): diff --git a/baselines/common/mpi_adam_optimizer.py b/baselines/common/mpi_adam_optimizer.py index 8cf09c4..af381e7 100644 --- a/baselines/common/mpi_adam_optimizer.py +++ b/baselines/common/mpi_adam_optimizer.py @@ -17,9 +17,16 @@ def compute_gradients(self, loss, var_list, **kwargs): num_tasks = self.comm.Get_size() buf = np.zeros(sum(sizes), np.float32) + sess = tf.get_default_session() + assert sess is not None + countholder = [0] # Counts how many times _collect_grads has been called + stat = tf.reduce_sum(grads_and_vars[0][1]) # sum of first variable def _collect_grads(flat_grad): self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) np.divide(buf, float(num_tasks), out=buf) + if countholder[0] % 100 == 0: + check_synced(sess, self.comm, stat) + countholder[0] += 1 return buf avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) @@ -27,5 +34,13 @@ def _collect_grads(flat_grad): avg_grads = tf.split(avg_flat_grad, sizes, axis=0) avg_grads_and_vars = [(tf.reshape(g, v.shape), v) for g, (_, v) in zip(avg_grads, grads_and_vars)] - return avg_grads_and_vars + +def check_synced(sess, comm, tfstat): + """ + Check that 'tfstat' evaluates to the same thing on every MPI worker + """ + localval = sess.run(tfstat) + vals = comm.gather(localval) + if comm.rank == 0: + assert all(val==vals[0] for val in vals[1:]) diff --git a/baselines/common/mpi_util.py b/baselines/common/mpi_util.py index f04187b..d05d9cf 100644 --- a/baselines/common/mpi_util.py +++ b/baselines/common/mpi_util.py @@ -1,9 +1,16 @@ from collections import defaultdict -from mpi4py import MPI import os, numpy as np import platform import shutil import subprocess +import warnings +import sys + +try: + from mpi4py import MPI +except ImportError: + MPI = None + def sync_from_root(sess, variables, comm=None): """ @@ -13,15 +20,10 @@ def sync_from_root(sess, variables, comm=None): variables: all parameter variables including optimizer's """ if comm is None: comm = MPI.COMM_WORLD - rank = comm.Get_rank() - for var in variables: - if rank == 0: - comm.Bcast(sess.run(var)) - else: - import tensorflow as tf - returned_var = np.empty(var.shape, dtype='float32') - comm.Bcast(returned_var) - sess.run(tf.assign(var, returned_var)) + import tensorflow as tf + values = comm.bcast(sess.run(variables)) + sess.run([tf.assign(var, val) + for (var, val) in zip(variables, values)]) def gpu_count(): """ @@ -34,13 +36,15 @@ def gpu_count(): 
def setup_mpi_gpus(): """ - Set CUDA_VISIBLE_DEVICES using MPI. + Set CUDA_VISIBLE_DEVICES to MPI rank if not already set """ - num_gpus = gpu_count() - if num_gpus == 0: - return - local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) - os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) + if 'CUDA_VISIBLE_DEVICES' not in os.environ: + if sys.platform == 'darwin': # This Assumes if you're on OSX you're just + ids = [] # doing a smoke test and don't want GPUs + else: + lrank, _lsize = get_local_rank_size(MPI.COMM_WORLD) + ids = [lrank] + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, ids)) def get_local_rank_size(comm): """ @@ -81,6 +85,9 @@ def share_file(comm, path): comm.Barrier() def dict_gather(comm, d, op='mean', assert_all_have_data=True): + """ + Perform a reduction operation over dicts + """ if comm is None: return d alldicts = comm.allgather(d) size = comm.size @@ -99,3 +106,28 @@ def dict_gather(comm, d, op='mean', assert_all_have_data=True): else: assert 0, op return result + +def mpi_weighted_mean(comm, local_name2valcount): + """ + Perform a weighted average over dicts that are each on a different node + Input: local_name2valcount: dict mapping key -> (value, count) + Returns: key -> mean + """ + all_name2valcount = comm.gather(local_name2valcount) + if comm.rank == 0: + name2sum = defaultdict(float) + name2count = defaultdict(float) + for n2vc in all_name2valcount: + for (name, (val, count)) in n2vc.items(): + try: + val = float(val) + except ValueError: + if comm.rank == 0: + warnings.warn(f'WARNING: tried to compute mean on non-float {name}={val}') + else: + name2sum[name] += val * count + name2count[name] += count + return {name : name2sum[name] / name2count[name] for name in name2sum} + else: + return {} + diff --git a/baselines/common/retro_wrappers.py b/baselines/common/retro_wrappers.py index 1e98044..2c42926 100644 --- a/baselines/common/retro_wrappers.py +++ b/baselines/common/retro_wrappers.py @@ -1,25 +1,11 @@ - # flake8: noqa F403, F405 -from .atari_wrappers import * +from collections import deque +import cv2 +cv2.ocl.setUseOpenCL(False) +from .atari_wrappers import WarpFrame, ClipRewardEnv, FrameStack, ScaledFloatFrame +from .wrappers import TimeLimit import numpy as np import gym -class TimeLimit(gym.Wrapper): - def __init__(self, env, max_episode_steps=None): - super(TimeLimit, self).__init__(env) - self._max_episode_steps = max_episode_steps - self._elapsed_steps = 0 - - def step(self, ac): - observation, reward, done, info = self.env.step(ac) - self._elapsed_steps += 1 - if self._elapsed_steps >= self._max_episode_steps: - done = True - info['TimeLimit.truncated'] = True - return observation, reward, done, info - - def reset(self, **kwargs): - self._elapsed_steps = 0 - return self.env.reset(**kwargs) class StochasticFrameSkip(gym.Wrapper): def __init__(self, env, n, stickprob): @@ -99,7 +85,7 @@ def __init__(self, env, ratio): gym.ObservationWrapper.__init__(self, env) (oldh, oldw, oldc) = env.observation_space.shape newshape = (oldh//ratio, oldw//ratio, oldc) - self.observation_space = spaces.Box(low=0, high=255, + self.observation_space = gym.spaces.Box(low=0, high=255, shape=newshape, dtype=np.uint8) def observation(self, frame): @@ -116,7 +102,7 @@ def __init__(self, env): """ gym.ObservationWrapper.__init__(self, env) (oldh, oldw, _oldc) = env.observation_space.shape - self.observation_space = spaces.Box(low=0, high=255, + self.observation_space = gym.spaces.Box(low=0, high=255, shape=(oldh, oldw, 1), dtype=np.uint8) def 
observation(self, frame): @@ -213,8 +199,10 @@ def step(self, a): self.some_random_steps() return self.last_obs, rew, done, info -def make_retro(*, game, state, max_episode_steps, **kwargs): +def make_retro(*, game, state=None, max_episode_steps=4500, **kwargs): import retro + if state is None: + state = retro.State.DEFAULT env = retro.make(game, state, **kwargs) env = StochasticFrameSkip(env, n=4, stickprob=0.25) if max_episode_steps is not None: diff --git a/baselines/common/test_mpi_util.py b/baselines/common/test_mpi_util.py new file mode 100644 index 0000000..dcabf2e --- /dev/null +++ b/baselines/common/test_mpi_util.py @@ -0,0 +1,27 @@ +from baselines import logger +from baselines.common.tests.test_with_mpi import with_mpi +from baselines.common import mpi_util + +@with_mpi() +def test_mpi_weighted_mean(): + from mpi4py import MPI + comm = MPI.COMM_WORLD + with logger.scoped_configure(comm=comm): + if comm.rank == 0: + name2valcount = {'a' : (10, 2), 'b' : (20,3)} + elif comm.rank == 1: + name2valcount = {'a' : (19, 1), 'c' : (42,3)} + else: + raise NotImplementedError + + d = mpi_util.mpi_weighted_mean(comm, name2valcount) + correctval = {'a' : (10 * 2 + 19) / 3.0, 'b' : 20, 'c' : 42} + if comm.rank == 0: + assert d == correctval, f'{d} != {correctval}' + + for name, (val, count) in name2valcount.items(): + for _ in range(count): + logger.logkv_mean(name, val) + d2 = logger.dumpkvs() + if comm.rank == 0: + assert d2 == correctval diff --git a/baselines/common/tests/envs/fixed_sequence_env.py b/baselines/common/tests/envs/fixed_sequence_env.py index 9b538e7..f5460d5 100644 --- a/baselines/common/tests/envs/fixed_sequence_env.py +++ b/baselines/common/tests/envs/fixed_sequence_env.py @@ -7,21 +7,20 @@ class FixedSequenceEnv(Env): def __init__( self, n_actions=10, - seed=0, episode_len=100 ): self.np_random = np.random.RandomState() - self.np_random.seed(seed) - self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)] + self.sequence = None self.action_space = Discrete(n_actions) self.observation_space = Discrete(1) self.episode_len = episode_len self.time = 0 - self.reset() def reset(self): + if self.sequence is None: + self.sequence = [self.np_random.randint(0, self.action_space.n-1) for _ in range(self.episode_len)] self.time = 0 return 0 @@ -35,6 +34,9 @@ def step(self, actions): return 0, rew, done, {} + def seed(self, seed=None): + self.np_random.seed(seed) + def _choose_next_state(self): self.time += 1 diff --git a/baselines/common/tests/envs/identity_env.py b/baselines/common/tests/envs/identity_env.py index 4429f04..79e6c48 100644 --- a/baselines/common/tests/envs/identity_env.py +++ b/baselines/common/tests/envs/identity_env.py @@ -10,6 +10,7 @@ def __init__( episode_len=None ): + self.observation_space = self.action_space self.episode_len = episode_len self.time = 0 self.reset() @@ -17,7 +18,6 @@ def __init__( def reset(self): self._choose_next_state() self.time = 0 - self.observation_space = self.action_space return self.state @@ -26,11 +26,13 @@ def step(self, actions): self._choose_next_state() done = False if self.episode_len and self.time >= self.episode_len: - rew = 0 done = True return self.state, rew, done, {} + def seed(self, seed=None): + self.action_space.seed(seed) + def _choose_next_state(self): self.state = self.action_space.sample() self.time += 1 @@ -74,7 +76,7 @@ def __init__( episode_len=None, ): - self.action_space = Box(low=-1.0, high=1.0, shape=shape) + self.action_space = Box(low=-1.0, high=1.0, shape=shape, dtype=np.float32) 
super().__init__(episode_len=episode_len) def _get_reward(self, actions): diff --git a/baselines/common/tests/envs/mnist_env.py b/baselines/common/tests/envs/mnist_env.py index 473008d..cc0bde0 100644 --- a/baselines/common/tests/envs/mnist_env.py +++ b/baselines/common/tests/envs/mnist_env.py @@ -9,7 +9,6 @@ class MnistEnv(Env): def __init__( self, - seed=0, episode_len=None, no_images=None ): @@ -23,7 +22,6 @@ def __init__( self.mnist = input_data.read_data_sets(mnist_path) self.np_random = np.random.RandomState() - self.np_random.seed(seed) self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) self.action_space = Discrete(10) @@ -50,6 +48,9 @@ def step(self, actions): return self.state[0], rew, done, {} + def seed(self, seed=None): + self.np_random.seed(seed) + def train_mode(self): self.dataset = self.mnist.train diff --git a/baselines/common/tests/test_fixed_sequence.py b/baselines/common/tests/test_fixed_sequence.py index 4131a9d..061c375 100644 --- a/baselines/common/tests/test_fixed_sequence.py +++ b/baselines/common/tests/test_fixed_sequence.py @@ -33,8 +33,7 @@ def test_fixed_sequence(alg, rnn): kwargs = learn_kwargs[alg] kwargs.update(common_kwargs) - episode_len = 5 - env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) + env_fn = lambda: FixedSequenceEnv(n_actions=10, episode_len=5) learn = lambda e: get_learn_function(alg)( env=e, network=rnn, diff --git a/baselines/common/tests/test_mnist.py b/baselines/common/tests/test_mnist.py index eea094d..bacc914 100644 --- a/baselines/common/tests/test_mnist.py +++ b/baselines/common/tests/test_mnist.py @@ -41,7 +41,7 @@ def test_mnist(alg): learn = get_learn_function(alg) learn_fn = lambda e: learn(env=e, **learn_kwargs) - env_fn = lambda: MnistEnv(seed=0, episode_len=100) + env_fn = lambda: MnistEnv(episode_len=100) simple_test(env_fn, learn_fn, 0.6) diff --git a/baselines/common/tests/test_serialization.py b/baselines/common/tests/test_serialization.py index 73d29e9..25cec88 100644 --- a/baselines/common/tests/test_serialization.py +++ b/baselines/common/tests/test_serialization.py @@ -44,7 +44,12 @@ def test_serialization(learn_fn, network_fn): # github issue: https://github.com/openai/baselines/issues/660 return - env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) + def make_env(): + env = MnistEnv(episode_len=100) + env.seed(10) + return env + + env = DummyVecEnv([make_env]) ob = env.reset().copy() learn = get_learn_function(learn_fn) diff --git a/baselines/common/tests/test_with_mpi.py b/baselines/common/tests/test_with_mpi.py new file mode 100644 index 0000000..86be475 --- /dev/null +++ b/baselines/common/tests/test_with_mpi.py @@ -0,0 +1,36 @@ +import os +import sys +import subprocess +import cloudpickle +import base64 +import pytest + +try: + from mpi4py import MPI +except ImportError: + MPI = None + +def with_mpi(nproc=2, timeout=30, skip_if_no_mpi=True): + def outer_thunk(fn): + def thunk(*args, **kwargs): + serialized_fn = base64.b64encode(cloudpickle.dumps(lambda: fn(*args, **kwargs))) + subprocess.check_call([ + 'mpiexec','-n', str(nproc), + sys.executable, + '-m', 'baselines.common.tests.test_with_mpi', + serialized_fn + ], env=os.environ, timeout=timeout) + + if skip_if_no_mpi: + return pytest.mark.skipif(MPI is None, reason="MPI not present")(thunk) + else: + return thunk + + return outer_thunk + + +if __name__ == '__main__': + if len(sys.argv) > 1: + fn = cloudpickle.loads(base64.b64decode(sys.argv[1])) + assert callable(fn) + fn() diff --git a/baselines/common/tests/util.py 
b/baselines/common/tests/util.py index ce44d50..38ea4dc 100644 --- a/baselines/common/tests/util.py +++ b/baselines/common/tests/util.py @@ -6,48 +6,39 @@ N_EPISODES = 100 def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): - np.random.seed(0) - env = DummyVecEnv([env_fn]) - + def seeded_env_fn(): + env = env_fn() + env.seed(0) + return env + np.random.seed(0) + env = DummyVecEnv([seeded_env_fn]) with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): tf.set_random_seed(0) - model = learn_fn(env) - sum_rew = 0 done = True - for i in range(n_trials): if done: obs = env.reset() state = model.initial_state - if state is not None: a, v, state, _ = model.step(obs, S=state, M=[False]) else: a, v, _, _ = model.step(obs) - obs, rew, done, _ = env.step(a) sum_rew += float(rew) - print("Reward in {} trials is {}".format(n_trials, sum_rew)) assert sum_rew > min_reward_fraction * n_trials, \ 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) - - def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): env = DummyVecEnv([env_fn]) - with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): model = learn_fn(env) - N_TRIALS = 100 - observations, actions, rewards = rollout(env, model, N_TRIALS) rewards = [sum(r) for r in rewards] - avg_rew = sum(rewards) / N_TRIALS print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) assert avg_rew > min_avg_reward, \ @@ -57,14 +48,12 @@ def rollout(env, model, n_trials): rewards = [] actions = [] observations = [] - for i in range(n_trials): obs = env.reset() state = model.initial_state if hasattr(model, 'initial_state') else None episode_rew = [] episode_actions = [] episode_obs = [] - while True: if state is not None: a, v, state, _ = model.step(obs, S=state, M=[False]) @@ -72,17 +61,13 @@ def rollout(env, model, n_trials): a,v, _, _ = model.step(obs) obs, rew, done, _ = env.step(a) - episode_rew.append(rew) episode_actions.append(a) episode_obs.append(obs) - if done: break - rewards.append(episode_rew) actions.append(episode_actions) observations.append(episode_obs) - return observations, actions, rewards diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index 630c6fd..4643438 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -1,4 +1,3 @@ -import joblib import numpy as np import tensorflow as tf # pylint: ignore-module import copy @@ -339,6 +338,7 @@ def save_state(fname, sess=None): # TODO: ensure there is no subtle differences and remove one def save_variables(save_path, variables=None, sess=None): + import joblib sess = sess or get_session() variables = variables or tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) @@ -361,6 +361,7 @@ def save_trpo_variables(save_path, variables=None, sess=None): joblib.dump(save_dict, save_path) def load_variables(load_path, variables=None, sess=None): + import joblib sess = sess or get_session() variables = variables or tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) diff --git a/baselines/common/vec_env/__init__.py b/baselines/common/vec_env/__init__.py index ff6f50f..1e4ef55 100644 --- a/baselines/common/vec_env/__init__.py +++ b/baselines/common/vec_env/__init__.py @@ -1,176 +1,10 @@ -from abc import ABC, abstractmethod -from baselines.common.tile_images import tile_images - -class AlreadySteppingError(Exception): - """ - Raised when an asynchronous step 
is running while - step_async() is called again. - """ - def __init__(self): - msg = 'already running an async step' - Exception.__init__(self, msg) - -class NotSteppingError(Exception): - """ - Raised when an asynchronous step is not running but - step_wait() is called. - """ - def __init__(self): - msg = 'not running an async step' - Exception.__init__(self, msg) - -class VecEnv(ABC): - """ - An abstract asynchronous, vectorized environment. - Used to batch data from multiple copies of an environment, so that - each observation becomes an batch of observations, and expected action is a batch of actions to - be applied per-environment. - """ - closed = False - viewer = None - - metadata = { - 'render.modes': ['human', 'rgb_array'] - } - - def __init__(self, num_envs, observation_space, action_space): - self.num_envs = num_envs - self.observation_space = observation_space - self.action_space = action_space - - @abstractmethod - def reset(self): - """ - Reset all the environments and return an array of - observations, or a tuple of observation arrays. - - If step_async is still doing work, that work will - be cancelled and step_wait() should not be called - until step_async() is invoked again. - """ - pass - - @abstractmethod - def step_async(self, actions): - """ - Tell all the environments to start taking a step - with the given actions. - Call step_wait() to get the results of the step. - - You should not call this if a step_async run is - already pending. - """ - pass - - @abstractmethod - def step_wait(self): - """ - Wait for the step taken with step_async(). - - Returns (obs, rews, dones, infos): - - obs: an array of observations, or a tuple of - arrays of observations. - - rews: an array of rewards - - dones: an array of "episode done" booleans - - infos: a sequence of info objects - """ - pass - - def close_extras(self): - """ - Clean up the extra resources, beyond what's in this base class. - Only runs when not self.closed. 
- """ - pass - - def close(self): - if self.closed: - return - if self.viewer is not None: - self.viewer.close() - self.close_extras() - self.closed = True - - def step(self, actions): - self.step_async(actions) - return self.step_wait() - - def step_collisions(self, actions): - self.step_async(actions) - return self.step_wait_collisions() - - def step_runtime(self, actions): - self.step_async(actions) - return self.step_wait_runtime() - - def render(self, mode='human'): - imgs = self.get_images() - bigimg = tile_images(imgs) - if mode == 'human': - self.get_viewer().imshow(bigimg) - return self.get_viewer().isopen - elif mode == 'rgb_array': - return bigimg - else: - raise NotImplementedError - - def get_images(self): - """ - Return RGB images from each environment - """ - raise NotImplementedError - - @property - def unwrapped(self): - if isinstance(self, VecEnvWrapper): - return self.venv.unwrapped - else: - return self - - def get_viewer(self): - if self.viewer is None: - from gym.envs.classic_control import rendering - self.viewer = rendering.SimpleImageViewer() - return self.viewer - -class VecEnvWrapper(VecEnv): - def __init__(self, venv, observation_space=None, action_space=None): - self.venv = venv - VecEnv.__init__(self, - num_envs=venv.num_envs, - observation_space=observation_space or venv.observation_space, - action_space=action_space or venv.action_space) - - def step_async(self, actions): - self.venv.step_async(actions) - - @abstractmethod - def reset(self): - pass - - @abstractmethod - def step_wait(self): - pass - - def close(self): - return self.venv.close() - - def render(self, mode='human'): - return self.venv.render(mode=mode) - - - def get_images(self): - return self.venv.get_images() - -class CloudpickleWrapper(object): - """ - Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) - """ - def __init__(self, x): - self.x = x - def __getstate__(self): - import cloudpickle - return cloudpickle.dumps(self.x) - def __setstate__(self, ob): - import pickle - self.x = pickle.loads(ob) +from .vec_env import AlreadySteppingError, NotSteppingError, VecEnv, VecEnvWrapper, VecEnvObservationWrapper, CloudpickleWrapper +from .dummy_vec_env import DummyVecEnv +from .shmem_vec_env import ShmemVecEnv +from .subproc_vec_env import SubprocVecEnv +from .vec_frame_stack import VecFrameStack +from .vec_monitor import VecMonitor +from .vec_normalize import VecNormalize +from .vec_remove_dict_obs import VecExtractDictObs + +__all__ = ['AlreadySteppingError', 'NotSteppingError', 'VecEnv', 'VecEnvWrapper', 'VecEnvObservationWrapper', 'CloudpickleWrapper', 'DummyVecEnv', 'ShmemVecEnv', 'SubprocVecEnv', 'VecFrameStack', 'VecMonitor', 'VecNormalize', 'VecExtractDictObs'] diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index 3ff3246..19decd1 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -1,6 +1,5 @@ import numpy as np -from gym import spaces -from . 
import VecEnv +from .vec_env import VecEnv from .util import copy_obs_dict, dict_to_obs, obs_space_info class DummyVecEnv(VecEnv): @@ -30,7 +29,7 @@ def __init__(self, env_fns): self.buf_collisions = np.zeros((self.num_envs,), dtype=np.bool) self.buf_infos = [{} for _ in range(self.num_envs)] self.actions = None - self.specs = [e.spec for e in self.envs] + self.spec = self.envs[0].spec def step_async(self, actions): listify = True @@ -49,8 +48,8 @@ def step_async(self, actions): def step_wait(self): for e in range(self.num_envs): action = self.actions[e] - if isinstance(self.envs[e].action_space, spaces.Discrete): - action = int(action) + # if isinstance(self.envs[e].action_space, spaces.Discrete): + # action = int(action) obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) if self.buf_dones[e]: diff --git a/baselines/common/vec_env/shmem_vec_env.py b/baselines/common/vec_env/shmem_vec_env.py index 99cc586..343ef94 100644 --- a/baselines/common/vec_env/shmem_vec_env.py +++ b/baselines/common/vec_env/shmem_vec_env.py @@ -2,9 +2,9 @@ An interface for asynchronous vectorized environments. """ -from multiprocessing import Pipe, Array, Process +import multiprocessing as mp import numpy as np -from . import VecEnv, CloudpickleWrapper +from .vec_env import VecEnv, CloudpickleWrapper, clear_mpi_env_vars import ctypes from baselines import logger @@ -22,11 +22,12 @@ class ShmemVecEnv(VecEnv): Optimized version of SubprocVecEnv that uses shared variables to communicate observations. """ - def __init__(self, env_fns, spaces=None): + def __init__(self, env_fns, spaces=None, context='spawn'): """ If you don't specify observation_space, we'll have to create a dummy environment to get it. """ + ctx = mp.get_context(context) if spaces: observation_space, action_space = spaces else: @@ -39,22 +40,22 @@ def __init__(self, env_fns, spaces=None): VecEnv.__init__(self, len(env_fns), observation_space, action_space) self.obs_keys, self.obs_shapes, self.obs_dtypes = obs_space_info(observation_space) self.obs_bufs = [ - {k: Array(_NP_TO_CT[self.obs_dtypes[k].type], int(np.prod(self.obs_shapes[k]))) for k in self.obs_keys} + {k: ctx.Array(_NP_TO_CT[self.obs_dtypes[k].type], int(np.prod(self.obs_shapes[k]))) for k in self.obs_keys} for _ in env_fns] self.parent_pipes = [] self.procs = [] - for env_fn, obs_buf in zip(env_fns, self.obs_bufs): - wrapped_fn = CloudpickleWrapper(env_fn) - parent_pipe, child_pipe = Pipe() - proc = Process(target=_subproc_worker, - args=(child_pipe, parent_pipe, wrapped_fn, obs_buf, self.obs_shapes, self.obs_dtypes, self.obs_keys)) - proc.daemon = True - self.procs.append(proc) - self.parent_pipes.append(parent_pipe) - proc.start() - child_pipe.close() + with clear_mpi_env_vars(): + for env_fn, obs_buf in zip(env_fns, self.obs_bufs): + wrapped_fn = CloudpickleWrapper(env_fn) + parent_pipe, child_pipe = ctx.Pipe() + proc = ctx.Process(target=_subproc_worker, + args=(child_pipe, parent_pipe, wrapped_fn, obs_buf, self.obs_shapes, self.obs_dtypes, self.obs_keys)) + proc.daemon = True + self.procs.append(proc) + self.parent_pipes.append(parent_pipe) + proc.start() + child_pipe.close() self.waiting_step = False - self.specs = [f().spec for f in env_fns] self.viewer = None def reset(self): diff --git a/baselines/common/vec_env/subproc_vec_env.py b/baselines/common/vec_env/subproc_vec_env.py index edb5a69..49ebc25 100644 --- a/baselines/common/vec_env/subproc_vec_env.py +++ b/baselines/common/vec_env/subproc_vec_env.py @@ -1,6 +1,8 @@ +import multiprocessing as 
mp + import numpy as np -from multiprocessing import Process, Pipe -from . import VecEnv, CloudpickleWrapper +from .vec_env import VecEnv, CloudpickleWrapper, clear_mpi_env_vars + def worker(remote, parent_remote, env_fn_wrapper): parent_remote.close() @@ -21,8 +23,8 @@ def worker(remote, parent_remote, env_fn_wrapper): elif cmd == 'close': remote.close() break - elif cmd == 'get_spaces': - remote.send((env.observation_space, env.action_space)) + elif cmd == 'get_spaces_spec': + remote.send((env.observation_space, env.action_space, env.spec)) else: raise NotImplementedError except KeyboardInterrupt: @@ -35,7 +37,7 @@ class SubprocVecEnv(VecEnv): VecEnv that runs multiple environments in parallel in subproceses and communicates with them via pipes. Recommended to use when num_envs > 1 and step() can be a bottleneck. """ - def __init__(self, env_fns, spaces=None): + def __init__(self, env_fns, spaces=None, context='spawn'): """ Arguments: @@ -44,19 +46,20 @@ def __init__(self, env_fns, spaces=None): self.waiting = False self.closed = False nenvs = len(env_fns) - self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) - self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) - for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] + ctx = mp.get_context(context) + self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(nenvs)]) + self.ps = [ctx.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) + for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] for p in self.ps: - p.daemon = True # if the main process crashes, we should not cause things to hang - p.start() + p.daemon = True # if the main process crashes, we should not cause things to hang + with clear_mpi_env_vars(): + p.start() for remote in self.work_remotes: remote.close() - self.remotes[0].send(('get_spaces', None)) - observation_space, action_space = self.remotes[0].recv() + self.remotes[0].send(('get_spaces_spec', None)) + observation_space, action_space, self.spec = self.remotes[0].recv() self.viewer = None - # self.specs = [f().spec for f in env_fns] VecEnv.__init__(self, len(env_fns), observation_space, action_space) def step_async(self, actions): @@ -98,14 +101,15 @@ def get_images(self): def _assert_not_closed(self): assert not self.closed, "Trying to operate on a SubprocVecEnv after calling close()" + def __del__(self): + if not self.closed: + self.close() def _flatten_obs(obs): - assert isinstance(obs, list) or isinstance(obs, tuple) + assert isinstance(obs, (list, tuple)) assert len(obs) > 0 if isinstance(obs[0], dict): - import collections - assert isinstance(obs, collections.OrderedDict) keys = obs[0].keys() return {k: np.stack([o[k] for o in obs]) for k in keys} else: diff --git a/baselines/common/vec_env/test_vec_env.py b/baselines/common/vec_env/test_vec_env.py index da2f3fb..4feabb0 100644 --- a/baselines/common/vec_env/test_vec_env.py +++ b/baselines/common/vec_env/test_vec_env.py @@ -8,39 +8,40 @@ from .dummy_vec_env import DummyVecEnv from .shmem_vec_env import ShmemVecEnv from .subproc_vec_env import SubprocVecEnv +from baselines.common.tests.test_with_mpi import with_mpi -def assert_envs_equal(env1, env2, num_steps): +def assert_venvs_equal(venv1, venv2, num_steps): """ Compare two environments over num_steps steps and make sure that the observations produced by each are the same when given the same actions. 
""" - assert env1.num_envs == env2.num_envs - assert env1.action_space.shape == env2.action_space.shape - assert env1.action_space.dtype == env2.action_space.dtype - joint_shape = (env1.num_envs,) + env1.action_space.shape + assert venv1.num_envs == venv2.num_envs + assert venv1.observation_space.shape == venv2.observation_space.shape + assert venv1.observation_space.dtype == venv2.observation_space.dtype + assert venv1.action_space.shape == venv2.action_space.shape + assert venv1.action_space.dtype == venv2.action_space.dtype try: - obs1, obs2 = env1.reset(), env2.reset() + obs1, obs2 = venv1.reset(), venv2.reset() assert np.array(obs1).shape == np.array(obs2).shape - assert np.array(obs1).shape == joint_shape + assert np.array(obs1).shape == (venv1.num_envs,) + venv1.observation_space.shape assert np.allclose(obs1, obs2) - np.random.seed(1337) + venv1.action_space.seed(1337) for _ in range(num_steps): - actions = np.array(np.random.randint(0, 0x100, size=joint_shape), - dtype=env1.action_space.dtype) - for env in [env1, env2]: - env.step_async(actions) - outs1 = env1.step_wait() - outs2 = env2.step_wait() + actions = np.array([venv1.action_space.sample() for _ in range(venv1.num_envs)]) + for venv in [venv1, venv2]: + venv.step_async(actions) + outs1 = venv1.step_wait() + outs2 = venv2.step_wait() for out1, out2 in zip(outs1[:3], outs2[:3]): assert np.array(out1).shape == np.array(out2).shape assert np.allclose(out1, out2) assert list(outs1[3]) == list(outs2[3]) finally: - env1.close() - env2.close() + venv1.close() + venv2.close() @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv)) @@ -63,7 +64,7 @@ def make_fn(seed): fns = [make_fn(i) for i in range(num_envs)] env1 = DummyVecEnv(fns) env2 = klass(fns) - assert_envs_equal(env1, env2, num_steps=num_steps) + assert_venvs_equal(env1, env2, num_steps=num_steps) class SimpleEnv(gym.Env): @@ -99,3 +100,15 @@ def reset(self): def render(self, mode=None): raise NotImplementedError + + + +@with_mpi() +def test_mpi_with_subprocvecenv(): + shape = (2,3,4) + nenv = 1 + venv = SubprocVecEnv([lambda: SimpleEnv(0, shape, 'float32')] * nenv) + ob = venv.reset() + venv.close() + assert ob.shape == (nenv,) + shape + diff --git a/baselines/common/vec_env/vec_env.py b/baselines/common/vec_env/vec_env.py new file mode 100644 index 0000000..7aa7878 --- /dev/null +++ b/baselines/common/vec_env/vec_env.py @@ -0,0 +1,219 @@ +import contextlib +import os +from abc import ABC, abstractmethod + +from baselines.common.tile_images import tile_images + +class AlreadySteppingError(Exception): + """ + Raised when an asynchronous step is running while + step_async() is called again. + """ + + def __init__(self): + msg = 'already running an async step' + Exception.__init__(self, msg) + + +class NotSteppingError(Exception): + """ + Raised when an asynchronous step is not running but + step_wait() is called. + """ + + def __init__(self): + msg = 'not running an async step' + Exception.__init__(self, msg) + + +class VecEnv(ABC): + """ + An abstract asynchronous, vectorized environment. + Used to batch data from multiple copies of an environment, so that + each observation becomes an batch of observations, and expected action is a batch of actions to + be applied per-environment. 
+ """ + closed = False + viewer = None + + metadata = { + 'render.modes': ['human', 'rgb_array'] + } + + def __init__(self, num_envs, observation_space, action_space): + self.num_envs = num_envs + self.observation_space = observation_space + self.action_space = action_space + + @abstractmethod + def reset(self): + """ + Reset all the environments and return an array of + observations, or a dict of observation arrays. + + If step_async is still doing work, that work will + be cancelled and step_wait() should not be called + until step_async() is invoked again. + """ + pass + + @abstractmethod + def step_async(self, actions): + """ + Tell all the environments to start taking a step + with the given actions. + Call step_wait() to get the results of the step. + + You should not call this if a step_async run is + already pending. + """ + pass + + @abstractmethod + def step_wait(self): + """ + Wait for the step taken with step_async(). + + Returns (obs, rews, dones, infos): + - obs: an array of observations, or a dict of + arrays of observations. + - rews: an array of rewards + - dones: an array of "episode done" booleans + - infos: a sequence of info objects + """ + pass + + def close_extras(self): + """ + Clean up the extra resources, beyond what's in this base class. + Only runs when not self.closed. + """ + pass + + def close(self): + if self.closed: + return + if self.viewer is not None: + self.viewer.close() + self.close_extras() + self.closed = True + + def step(self, actions): + """ + Step the environments synchronously. + + This is available for backwards compatibility. + """ + self.step_async(actions) + return self.step_wait() + + def render(self, mode='human'): + imgs = self.get_images() + bigimg = tile_images(imgs) + if mode == 'human': + self.get_viewer().imshow(bigimg) + return self.get_viewer().isopen + elif mode == 'rgb_array': + return bigimg + else: + raise NotImplementedError + + def get_images(self): + """ + Return RGB images from each environment + """ + raise NotImplementedError + + @property + def unwrapped(self): + if isinstance(self, VecEnvWrapper): + return self.venv.unwrapped + else: + return self + + def get_viewer(self): + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.SimpleImageViewer() + return self.viewer + +class VecEnvWrapper(VecEnv): + """ + An environment wrapper that applies to an entire batch + of environments at once. 
+ """ + + def __init__(self, venv, observation_space=None, action_space=None): + self.venv = venv + VecEnv.__init__(self, + num_envs=venv.num_envs, + observation_space=observation_space or venv.observation_space, + action_space=action_space or venv.action_space) + + def step_async(self, actions): + self.venv.step_async(actions) + + @abstractmethod + def reset(self): + pass + + @abstractmethod + def step_wait(self): + pass + + def close(self): + return self.venv.close() + + def render(self, mode='human'): + return self.venv.render(mode=mode) + + def get_images(self): + return self.venv.get_images() + +class VecEnvObservationWrapper(VecEnvWrapper): + @abstractmethod + def process(self, obs): + pass + + def reset(self): + obs = self.venv.reset() + return self.process(obs) + + def step_wait(self): + obs, rews, dones, infos = self.venv.step_wait() + return self.process(obs), rews, dones, infos + +class CloudpickleWrapper(object): + """ + Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) + """ + + def __init__(self, x): + self.x = x + + def __getstate__(self): + import cloudpickle + return cloudpickle.dumps(self.x) + + def __setstate__(self, ob): + import pickle + self.x = pickle.loads(ob) + + +@contextlib.contextmanager +def clear_mpi_env_vars(): + """ + from mpi4py import MPI will call MPI_Init by default. If the child process has MPI environment variables, MPI will think that the child process is an MPI process just like the parent and do bad things such as hang. + This context manager is a hacky way to clear those environment variables temporarily such as when we are starting multiprocessing + Processes. + """ + removed_environment = {} + for k, v in list(os.environ.items()): + for prefix in ['OMPI_', 'PMI_']: + if k.startswith(prefix): + removed_environment[k] = v + del os.environ[k] + try: + yield + finally: + os.environ.update(removed_environment) diff --git a/baselines/common/vec_env/vec_frame_stack.py b/baselines/common/vec_env/vec_frame_stack.py index 1c4c507..36dfc2d 100644 --- a/baselines/common/vec_env/vec_frame_stack.py +++ b/baselines/common/vec_env/vec_frame_stack.py @@ -1,4 +1,4 @@ -from baselines.common.vec_env import VecEnvWrapper +from .vec_env import VecEnvWrapper import numpy as np from gym import spaces diff --git a/baselines/common/vec_env/vec_remove_dict_obs.py b/baselines/common/vec_env/vec_remove_dict_obs.py new file mode 100644 index 0000000..602b949 --- /dev/null +++ b/baselines/common/vec_env/vec_remove_dict_obs.py @@ -0,0 +1,11 @@ +from .vec_env import VecEnvObservationWrapper + + +class VecExtractDictObs(VecEnvObservationWrapper): + def __init__(self, venv, key): + self.key = key + super().__init__(venv=venv, + observation_space=venv.observation_space.spaces[self.key]) + + def process(self, obs): + return obs[self.key] \ No newline at end of file diff --git a/baselines/common/wrappers.py b/baselines/common/wrappers.py new file mode 100644 index 0000000..7683d18 --- /dev/null +++ b/baselines/common/wrappers.py @@ -0,0 +1,19 @@ +import gym + +class TimeLimit(gym.Wrapper): + def __init__(self, env, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 
0 + return self.env.reset(**kwargs) \ No newline at end of file diff --git a/baselines/ddpg/test_smoke.py b/baselines/ddpg/test_smoke.py index b5bf866..a9fdc05 100644 --- a/baselines/ddpg/test_smoke.py +++ b/baselines/ddpg/test_smoke.py @@ -1,7 +1,10 @@ -from baselines.run import main as M +from multiprocessing import Process +import baselines.run def _run(argstr): - M(('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr).split(' ')) + p = Process(target=baselines.run.main, args=('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr).split(' ')) + p.start() + p.join() def test_popart(): _run('--normalize_returns=True --popart=True') diff --git a/baselines/her/her.py b/baselines/her/her.py index ec96f9e..1821fc5 100644 --- a/baselines/her/her.py +++ b/baselines/her/her.py @@ -108,7 +108,7 @@ def learn(*, network, env, total_timesteps, # Prepare params. params = config.DEFAULT_PARAMS - env_name = env.specs[0].id + env_name = env.spec.id params['env_name'] = env_name params['replay_strategy'] = replay_strategy if env_name in config.DEFAULT_ENV_PARAMS: diff --git a/baselines/logger.py b/baselines/logger.py index a45fa8a..6c08ca0 100644 --- a/baselines/logger.py +++ b/baselines/logger.py @@ -7,6 +7,7 @@ import datetime import tempfile from collections import defaultdict +from contextlib import contextmanager DEBUG = 10 INFO = 20 @@ -68,7 +69,8 @@ def writekvs(self, kvs): self.file.flush() def _truncate(self, s): - return s[:20] + '...' if len(s) > 23 else s + maxlen = 30 + return s[:maxlen-3] + '...' if len(s) > maxlen else s def writeseq(self, seq): seq = list(seq) @@ -195,13 +197,13 @@ def logkv(key, val): Call this once for each diagnostic quantity, each iteration If called many times, last value will be used. """ - Logger.CURRENT.logkv(key, val) + get_current().logkv(key, val) def logkv_mean(key, val): """ The same as logkv(), but if called many times, values averaged. """ - Logger.CURRENT.logkv_mean(key, val) + get_current().logkv_mean(key, val) def logkvs(d): """ @@ -213,21 +215,18 @@ def logkvs(d): def dumpkvs(): """ Write all of the diagnostics from the current iteration - - level: int. (see logger.py docs) If the global logger level is higher than - the level argument here, don't print to stdout. """ - Logger.CURRENT.dumpkvs() + return get_current().dumpkvs() def getkvs(): - return Logger.CURRENT.name2val + return get_current().name2val def log(*args, level=INFO): """ Write the sequence of args, with no separators, to the console and output files (if you've configured an output file). """ - Logger.CURRENT.log(*args, level=level) + get_current().log(*args, level=level) def debug(*args): log(*args, level=DEBUG) @@ -246,30 +245,29 @@ def set_level(level): """ Set logging threshold on current logger. """ - Logger.CURRENT.set_level(level) + get_current().set_level(level) + +def set_comm(comm): + get_current().set_comm(comm) def get_dir(): """ Get directory that log files are being written to. 
will be None if there is no output directory (i.e., if you didn't call start) """ - return Logger.CURRENT.get_dir() + return get_current().get_dir() record_tabular = logkv dump_tabular = dumpkvs -class ProfileKV: - """ - Usage: - with logger.ProfileKV("interesting_scope"): - code - """ - def __init__(self, n): - self.n = "wait_" + n - def __enter__(self): - self.t1 = time.time() - def __exit__(self ,type, value, traceback): - Logger.CURRENT.name2val[self.n] += time.time() - self.t1 +@contextmanager +def profile_kv(scopename): + logkey = 'wait_' + scopename + tstart = time.time() + try: + yield + finally: + get_current().name2val[logkey] += time.time() - tstart def profile(n): """ @@ -279,7 +277,7 @@ def my_func(): code """ def decorator_with_name(func): def func_wrapper(*args, **kwargs): - with ProfileKV(n): + with profile_kv(n): return func(*args, **kwargs) return func_wrapper return decorator_with_name @@ -289,17 +287,25 @@ def func_wrapper(*args, **kwargs): # Backend # ================================================================ +def get_current(): + if Logger.CURRENT is None: + _configure_default_logger() + + return Logger.CURRENT + + class Logger(object): DEFAULT = None # A logger with no output files. (See right below class definition) # So that you can still log to the terminal without setting up any output files CURRENT = None # Current logger being used by the free functions above - def __init__(self, dir, output_formats): + def __init__(self, dir, output_formats, comm=None): self.name2val = defaultdict(float) # values this iteration self.name2cnt = defaultdict(int) self.level = INFO self.dir = dir self.output_formats = output_formats + self.comm = comm # Logging API, forwarded # ---------------------------------------- @@ -307,20 +313,27 @@ def logkv(self, key, val): self.name2val[key] = val def logkv_mean(self, key, val): - if val is None: - self.name2val[key] = None - return oldval, cnt = self.name2val[key], self.name2cnt[key] self.name2val[key] = oldval*cnt/(cnt+1) + val/(cnt+1) self.name2cnt[key] = cnt + 1 def dumpkvs(self): - if self.level == DISABLED: return + if self.comm is None: + d = self.name2val + else: + from baselines.common import mpi_util + d = mpi_util.mpi_weighted_mean(self.comm, + {name : (val, self.name2cnt.get(name, 1)) + for (name, val) in self.name2val.items()}) + if self.comm.rank != 0: + d['dummy'] = 1 # so we don't get a warning about empty dict + out = d.copy() # Return the dict for unit testing purposes for fmt in self.output_formats: if isinstance(fmt, KVWriter): - fmt.writekvs(self.name2val) + fmt.writekvs(d) self.name2val.clear() self.name2cnt.clear() + return out def log(self, *args, level=INFO): if self.level <= level: @@ -331,6 +344,9 @@ def log(self, *args, level=INFO): def set_level(self, level): self.level = level + def set_comm(self, comm): + self.comm = comm + def get_dir(self): return self.dir @@ -345,7 +361,10 @@ def _do_log(self, args): if isinstance(fmt, SeqWriter): fmt.writeseq(map(str, args)) -def configure(dir=None, format_strs=None): +def configure(dir=None, format_strs=None, comm=None): + """ + If comm is provided, average all numerical stats across that comm + """ if dir is None: dir = os.getenv('OPENAI_LOGDIR') if dir is None: @@ -372,15 +391,11 @@ def configure(dir=None, format_strs=None): format_strs = filter(None, format_strs) output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs] - Logger.CURRENT = Logger(dir=dir, output_formats=output_formats) + Logger.CURRENT = Logger(dir=dir, 
output_formats=output_formats, comm=comm) log('Logging to %s'%dir) def _configure_default_logger(): - format_strs = None - # keep the old default of only writing to stdout - if 'OPENAI_LOG_FORMAT' not in os.environ: - format_strs = ['stdout'] - configure(format_strs=format_strs) + configure() Logger.DEFAULT = Logger.CURRENT def reset(): @@ -389,17 +404,15 @@ def reset(): Logger.CURRENT = Logger.DEFAULT log('Reset logger') -class scoped_configure(object): - def __init__(self, dir=None, format_strs=None): - self.dir = dir - self.format_strs = format_strs - self.prevlogger = None - def __enter__(self): - self.prevlogger = Logger.CURRENT - configure(dir=self.dir, format_strs=self.format_strs) - def __exit__(self, *args): +@contextmanager +def scoped_configure(dir=None, format_strs=None, comm=None): + prevlogger = Logger.CURRENT + configure(dir=dir, format_strs=format_strs, comm=comm) + try: + yield + finally: Logger.CURRENT.close() - Logger.CURRENT = self.prevlogger + Logger.CURRENT = prevlogger # ================================================================ @@ -423,7 +436,7 @@ def _demo(): logkv_mean("b", -44.4) logkv("a", 5.5) dumpkvs() - info("^^^ should see b = 33.3") + info("^^^ should see b = -33.3") logkv("b", -2.5) dumpkvs() @@ -456,7 +469,6 @@ def read_tb(path): import pandas import numpy as np from glob import glob - from collections import defaultdict import tensorflow as tf if osp.isdir(path): fnames = glob(osp.join(path, "events.*")) @@ -482,8 +494,5 @@ def read_tb(path): data[step-1, colidx] = value return pandas.DataFrame(data, columns=tags) -# configure the default logger on import -_configure_default_logger() - if __name__ == "__main__": _demo() diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py index 8019f7b..505b652 100644 --- a/baselines/ppo1/pposgd_simple.py +++ b/baselines/ppo1/pposgd_simple.py @@ -106,7 +106,6 @@ def learn(env, policy_fn, *, ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule - clip_param = clip_param * lrmult # Annealed clipping parameter epsilon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) diff --git a/baselines/ppo1/run_humanoid.py b/baselines/ppo1/run_humanoid.py index 17b42b5..91a6db7 100644 --- a/baselines/ppo1/run_humanoid.py +++ b/baselines/ppo1/run_humanoid.py @@ -19,16 +19,17 @@ def policy_fn(name, ob_space, ac_space): # these are good enough to make humanoid walk, but whether those are # an absolute best or not is not certain env = RewScale(env, 0.1) + logger.log("NOTE: reward will be scaled by a factor of 10 in logged stats. 
Check the monitor for unscaled reward.") pi = pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_actorbatch=2048, - clip_param=0.2, entcoeff=0.0, + clip_param=0.1, entcoeff=0.0, optim_epochs=10, - optim_stepsize=3e-4, + optim_stepsize=1e-4, optim_batchsize=64, gamma=0.99, lam=0.95, - schedule='linear', + schedule='constant', ) env.close() if model_path: @@ -47,7 +48,7 @@ def main(): logger.configure() parser = mujoco_arg_parser() parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) - parser.set_defaults(num_timesteps=int(2e7)) + parser.set_defaults(num_timesteps=int(5e7)) args = parser.parse_args() @@ -68,8 +69,5 @@ def main(): if done: ob = env.reset() - - - if __name__ == '__main__': main() diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py index 9102f27..6445557 100644 --- a/baselines/ppo2/defaults.py +++ b/baselines/ppo2/defaults.py @@ -18,7 +18,7 @@ def atari(): lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, ent_coef=.01, lr=lambda f : f * 2.5e-4, - cliprange=lambda f : f * 0.1, + cliprange=0.1, ) def retro(): return atari() diff --git a/baselines/run.py b/baselines/run.py index 0c50fda..01df981 100644 --- a/baselines/run.py +++ b/baselines/run.py @@ -5,15 +5,13 @@ from collections import defaultdict import tensorflow as tf +from baselines.common.vec_env import VecFrameStack, VecNormalize from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder -from baselines.common.vec_env.vec_frame_stack import VecFrameStack from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env from baselines.common.tf_util import get_session from baselines import logger from importlib import import_module -from baselines.common.vec_env.vec_normalize import VecNormalize - try: from mpi4py import MPI except ImportError: @@ -51,7 +49,7 @@ def train(args, extra_args): - env_type, env_id = get_env_type(args.env) + env_type, env_id = get_env_type(args) print('env_type: {}'.format(env_type)) total_timesteps = int(args.num_timesteps) @@ -63,7 +61,7 @@ def train(args, extra_args): env = build_env(args) if args.save_video_interval != 0: - env = VecVideoRecorder(env, osp.join(logger.Logger.CURRENT.dir, "videos"), record_video_trigger=lambda x: x % args.save_video_interval == 0, video_length=args.save_video_length) + env = VecVideoRecorder(env, osp.join(logger.get_dir(), "videos"), record_video_trigger=lambda x: x % args.save_video_interval == 0, video_length=args.save_video_length) if args.network: alg_kwargs['network'] = args.network @@ -90,7 +88,7 @@ def build_env(args, render=False): alg = args.alg seed = args.seed - env_type, env_id = get_env_type(args.env) + env_type, env_id = get_env_type(args) if env_type in {'atari', 'retro'}: if alg == 'deepq': @@ -103,22 +101,27 @@ def build_env(args, render=False): env = VecFrameStack(env, frame_stack_size) else: - config = tf.ConfigProto(allow_soft_placement=True, + config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) - config.gpu_options.allow_growth = True - get_session(config=config) + config.gpu_options.allow_growth = True + get_session(config=config) - flatten_dict_observations = alg not in {'her'} - env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale, flatten_dict_observations=flatten_dict_observations) + flatten_dict_observations = alg not in {'her'} + env = make_vec_env(env_id, env_type, args.num_env or 1, seed, 
reward_scale=args.reward_scale, flatten_dict_observations=flatten_dict_observations) - if env_type == 'mujoco': - env = VecNormalize(env) + if env_type == 'mujoco': + env = VecNormalize(env) return env -def get_env_type(env_id): +def get_env_type(args): + env_id = args.env + + if args.env_type is not None: + return args.env_type, env_id + # Re-parse the gym registry, since we could have new envs since last time. for env in gym.envs.registry.all(): env_type = env._entry_point.split(':')[0].split('.')[-1] @@ -204,7 +207,6 @@ def main(args): rank = MPI.COMM_WORLD.Get_rank() model, env = train(args, extra_args) - env.close() if args.save_path is not None and rank == 0: save_path = osp.expanduser(args.save_path) @@ -212,24 +214,28 @@ def main(args): if args.play: logger.log("Running trained model") - env = build_env(args, render=True) obs = env.reset() state = model.initial_state if hasattr(model, 'initial_state') else None dones = np.zeros((1,)) + episode_rew = 0 while True: if state is not None: actions, _, state, _ = model.step(obs,S=state, M=dones) else: actions, _, _, _ = model.step(obs) - obs, _, done, _ = env.step(actions) + obs, rew, done, _ = env.step(actions) + episode_rew += rew[0] env.render() + done = done.any() if isinstance(done, np.ndarray) else done if done: + print(f'episode_rew={episode_rew}') + episode_rew = 0 obs = env.reset() - env.close() + env.close() return model From 208a5118a2f4a5a1bad9057ab13da7f9d2a992e5 Mon Sep 17 00:00:00 2001 From: rkojcev Date: Wed, 6 Mar 2019 10:28:09 +0100 Subject: [PATCH 04/37] adding run WIP --- baselines/common/vec_env/dummy_vec_env.py | 8 ++++---- baselines/common/vec_env/vec_env.py | 8 ++++++++ baselines/ppo2/defaults.py | 6 +++--- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index 19decd1..a6f486e 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -61,8 +61,8 @@ def step_wait(self): def step_wait_collisions(self): for e in range(self.num_envs): action = self.actions[e] - if isinstance(self.envs[e].action_space, spaces.Discrete): - action = int(action) + # if isinstance(self.envs[e].action_space, spaces.Discrete): + # action = int(action) obs, self.buf_rews[e], self.buf_dones[e], self.buf_collisions[e], self.buf_infos[e] = self.envs[e].step_collisions(action) if self.buf_dones[e]: @@ -74,8 +74,8 @@ def step_wait_collisions(self): def step_wait_runtime(self): for e in range(self.num_envs): action = self.actions[e] - if isinstance(self.envs[e].action_space, spaces.Discrete): - action = int(action) + # if isinstance(self.envs[e].action_space, spaces.Discrete): + # action = int(action) obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step_runtime(action) self._save_obs(e, obs) diff --git a/baselines/common/vec_env/vec_env.py b/baselines/common/vec_env/vec_env.py index 7aa7878..959d60e 100644 --- a/baselines/common/vec_env/vec_env.py +++ b/baselines/common/vec_env/vec_env.py @@ -106,6 +106,14 @@ def step(self, actions): """ self.step_async(actions) return self.step_wait() + def step_runtime(self, actions): + """ + Step the environments synchronously. + + This is available for backwards compatibility. 
+ """ + self.step_async(actions) + return self.step_wait_runtime() def render(self, mode='human'): imgs = self.get_images() diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py index 6445557..f83c7ff 100644 --- a/baselines/ppo2/defaults.py +++ b/baselines/ppo2/defaults.py @@ -45,12 +45,12 @@ def mara_mlp(): total_timesteps = 1e8, save_interval = 10, # env_name = 'MARA-v0', - env_name = 'MARAOrient-v0', + # env_name = 'MARAOrient-v0', # env_name = 'MARACollision-v0', - # env_name = 'MARACollisionOrient-v0', + env_name = 'MARACollisionOrient-v0', transfer_path = None, # transfer_path = '/tmp/ros2learn/MARACollision-v0/ppo2_mlp/2019-02-19_12h47min/checkpoints/best', - trained_path = '/media/yue/hard_disk/ros2learn/MARAOrient-v0/2019-02-27_18h01min/checkpoints/02420' + trained_path = '/home/rkojcev/MARA_NN/2019-02-25_18h02min/checkpoint/01750' ) def mara_lstm(): From 30fddacbf4a8e5173f182a2f01c7c5545a693d3d Mon Sep 17 00:00:00 2001 From: rkojcev Date: Wed, 6 Mar 2019 12:32:50 +0100 Subject: [PATCH 05/37] remove agent from this branch too --- baselines/agent/scara_arm/__init__.py | 0 baselines/agent/scara_arm/agent_scara.py | 510 ---------------- .../scara_arm/agent_scara_experimental.py | 528 ----------------- baselines/agent/scara_arm/agent_scara_real.py | 493 ---------------- .../agent/scara_arm/agent_scara_real1.py | 434 -------------- .../agent/scara_arm/agent_scara_real_mlsh.py | 545 ------------------ baselines/agent/scara_arm/tree_urdf.py | 134 ----- baselines/agent/utility/error.py | 140 ----- baselines/agent/utility/general_utils.py | 436 -------------- baselines/agent/utility/seeding.py | 95 --- baselines/agent/utility/spaces/__init__.py | 9 - baselines/agent/utility/spaces/box.py | 43 -- baselines/agent/utility/spaces/dict_space.py | 49 -- baselines/agent/utility/spaces/discrete.py | 32 - .../agent/utility/spaces/multi_binary.py | 15 - .../agent/utility/spaces/multi_discrete.py | 47 -- baselines/agent/utility/spaces/prng.py | 20 - .../agent/utility/spaces/tests/__init__.py | 0 .../agent/utility/spaces/tests/test_spaces.py | 32 - baselines/agent/utility/spaces/tuple_space.py | 31 - 20 files changed, 3593 deletions(-) delete mode 100644 baselines/agent/scara_arm/__init__.py delete mode 100644 baselines/agent/scara_arm/agent_scara.py delete mode 100644 baselines/agent/scara_arm/agent_scara_experimental.py delete mode 100644 baselines/agent/scara_arm/agent_scara_real.py delete mode 100644 baselines/agent/scara_arm/agent_scara_real1.py delete mode 100644 baselines/agent/scara_arm/agent_scara_real_mlsh.py delete mode 100644 baselines/agent/scara_arm/tree_urdf.py delete mode 100644 baselines/agent/utility/error.py delete mode 100644 baselines/agent/utility/general_utils.py delete mode 100644 baselines/agent/utility/seeding.py delete mode 100644 baselines/agent/utility/spaces/__init__.py delete mode 100644 baselines/agent/utility/spaces/box.py delete mode 100644 baselines/agent/utility/spaces/dict_space.py delete mode 100644 baselines/agent/utility/spaces/discrete.py delete mode 100644 baselines/agent/utility/spaces/multi_binary.py delete mode 100644 baselines/agent/utility/spaces/multi_discrete.py delete mode 100644 baselines/agent/utility/spaces/prng.py delete mode 100644 baselines/agent/utility/spaces/tests/__init__.py delete mode 100644 baselines/agent/utility/spaces/tests/test_spaces.py delete mode 100644 baselines/agent/utility/spaces/tuple_space.py diff --git a/baselines/agent/scara_arm/__init__.py b/baselines/agent/scara_arm/__init__.py deleted file mode 100644 index 
e69de29..0000000 diff --git a/baselines/agent/scara_arm/agent_scara.py b/baselines/agent/scara_arm/agent_scara.py deleted file mode 100644 index 0e8141b..0000000 --- a/baselines/agent/scara_arm/agent_scara.py +++ /dev/null @@ -1,510 +0,0 @@ -import os -import time -import copy -import json -import numpy as np # Used pretty much everywhere. -import matplotlib.pyplot as plt -import threading # Used for time locks to synchronize position data. -import rclpy -import tensorflow as tf - -from timeit import default_timer as timer -from scipy import stats -from scipy.interpolate import spline -import geometry_msgs.msg - -from os import path -from rclpy.qos import QoSProfile, qos_profile_sensor_data -from baselines.agent.utility.general_utils import forward_kinematics, get_ee_points, rotation_from_matrix, \ - get_rotation_matrix,quaternion_from_matrix# For getting points and velocities. -from trajectory_msgs.msg import JointTrajectory, JointTrajectoryPoint # Used for publishing scara joint angles. -from control_msgs.msg import JointTrajectoryControllerState -from std_msgs.msg import String - -from baselines.agent.scara_arm.tree_urdf import treeFromFile # For KDL Jacobians -from PyKDL import Jacobian, Chain, ChainJntToJacSolver, JntArray # For KDL Jacobians -from collections import namedtuple -import scipy.ndimage as sp_ndimage -from functools import partial - -from baselines.agent.utility import error -from baselines.agent.utility import seeding -from baselines.agent.utility import spaces - -StartEndPoints = namedtuple('StartEndPoints', ['start', 'target']) -class MSG_INVALID_JOINT_NAMES_DIFFER(Exception): - """Error object exclusively raised by _process_observations.""" - pass - -# class ROBOT_MADE_CONTACT_WITH_GAZEBO_GROUND_SO_RESTART_ROSLAUNCH(Exception): -# """Error object exclusively raised by reset.""" -# pass - - -class AgentSCARAROS(object): - """Connects the SCARA actions and Deep Learning algorithms.""" - - def __init__(self, init_node=True): #hyperparams, , urdf_path, init_node=True - """Initialized Agent. - init_node: Whether or not to initialize a new ROS node.""" - - print("I am in init") - self._observation_msg = None - self.scale = None # must be set from elsewhere based on observations - self.bias = None - self.x_idx = None - self.obs = None - self.reward = None - self.done = None - self.reward_dist = None - self.reward_ctrl = None - self.action_space = None - # to work with baselines a2c need this ones - self.num_envs = 1 - self.remotes = [0] - - # Setup the main node. - print("Init ros node") - rclpy.init(args=None) - node = rclpy.create_node('robot_ai_node') - global node - self._pub = node.create_publisher(JointTrajectory,self.agent['joint_publisher']) - self._sub = node.create_subscription(JointTrajectoryControllerState, self.agent['joint_subscriber'], self._observation_callback, qos_profile=qos_profile_sensor_data) - assert self._sub - self._time_lock = threading.RLock() - print("setting time clocks") - - if self.agent['tree_path'].startswith("/"): - fullpath = self.agent['tree_path'] - print(fullpath) - else: - fullpath = os.path.join(os.path.dirname(__file__), "assets", self.agent['tree_path']) - if not path.exists(fullpath): - raise IOError("File %s does not exist" % fullpath) - - print("I am in reading the file path: ", fullpath) - - # Initialize a tree structure from the robot urdf. - # Note that the xacro of the urdf is updated by hand. - # Then the urdf must be compiled. 
- _, self.scara_tree = treeFromFile(self.agent['tree_path']) - # Retrieve a chain structure between the base and the start of the end effector. - self.scara_chain = self.scara_tree.getChain(self.agent['link_names'][0], self.agent['link_names'][-1]) - print("Nr. of jnts: ", self.scara_chain.getNrOfJoints()) - - # # Initialize a KDL Jacobian solver from the chain. - self.jac_solver = ChainJntToJacSolver(self.scara_chain) - print(self.jac_solver) - self._observations_stale = False - print("after observations stale") - - # - self._currently_resetting = [False for _ in range(1)] - self.reset_joint_angles = [None for _ in range(1)] - - # taken from mujoco in OpenAi how to initialize observation space and action space. - observation, _reward, done, _info = self.step(np.zeros(self.scara_chain.getNrOfJoints())) - assert not done - self.obs_dim = observation.size - print(self.obs_dim) - # print(observation, _reward) - # Here idially we should find the control range of the robot. Unfortunatelly in ROS/KDL there is nothing like this. - # I have tested this with the mujoco enviroment and the output is always same low[-1.,-1.], high[1.,1.] - # bounds = self.model.actuator_ctrlrange.copy() - low = -np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 0] - high = np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 1] - # print("Action Spaces:") - # print("low: ", low, "high: ", high) - self.action_space = spaces.Box(low, high) - - high = np.inf*np.ones(self.obs_dim) - low = -high - self.observation_space = spaces.Box(low, high) - print(self.observation_space) - - # self.seed() - def _observation_callback(self, message): - # message: observation from the robot to store each listen.""" - # self._time_lock.acquire(True, -1) - # with self._time_lock: - self._observations_stale = False - self._observation_msg = message - # if self._currently_resetting: - # epsilon = 1e-3 - # reset_action = self.agent['reset_conditions']['initial_positions'] - # now_action = self._observation_msg.actual.positions - # du = np.linalg.norm(reset_action-now_action, float(np.inf)) - # if du < epsilon: - # self._currently_resetting = False - - # self._time_lock.release() - - - def reset(self): - """Reset function should call reset_model. - In OpenAI in reset_model we are setting the robot to initial pose + some random number. - This function returns the observations which are then used again for the policy calculation. - In our case we are going to set the robot to initial pose, for now given by the user.""" - self.obs = None - # self._currently_resetting = True - - # Set the reset position as the initial position from agent hyperparams. - self.reset_joint_angles = self.agent['reset_conditions']['initial_positions'] - # Prepare the present positions to see how far off we are. - now_position = np.asarray(self._observation_msg.actual.positions) - - # Wait until the arm is within epsilon of reset configuration. 
- # action_msg = JointTrajectory() - # self._time_lock.acquire(True, -1) - # with self._time_lock: - # self._currently_resetting = True - - - # if self._currently_resetting: - # epsilon = 1e-3 - # reset_action = self.agent['reset_conditions']['initial_positions'] - # now_action = self._observation_msg.actual.positions - # du = np.linalg.norm(reset_action-now_action, float(np.inf)) - # if du > epsilon: - # self._pub.publish(self._get_trajectory_message(self.agent['reset_conditions']['initial_positions'], self.agent)) - # time.sleep(self.agent['slowness']) - # self._currently_resetting = True - # print("reset is false.") - # else: - # self._currently_resetting = False - # print("reset is true.") - - self.ob, ee_points = self._get_obs() - # self._time_lock.release() - - # print("resetting: ", self.ob) - - return self.ob - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - def _get_obs(self): - observations = None - ee_points = None - if self._observations_stale is False: - # # Acquire the lock to prevent the subscriber thread from - # # updating times or observation messages. - # self._time_lock.acquire(True) - - obs_message = self._observation_msg - # if obs_message is None: - # obs_message = self._observation_msg - # # time.sleep(self.agent['slowness']) - # # self._observations_stale = True - # self._time_lock.release() - - - - # Make it so that subscriber's thread observation callback - # must be called before publishing again. - - # Collect the end effector points and velocities in - # cartesian coordinates for the state. - # Collect the present joint angles and velocities from ROS for the state. - last_observations = self._process_observations(obs_message, self.agent) - - # self._observations_stale = True - - - if last_observations is None: - print("last_observations is empty") - else: - - # # # Get Jacobians from present joint angles and KDL trees - # # # The Jacobians consist of a 6x6 matrix getting its from from - # # # (# joint angles) x (len[x, y, z] + len[roll, pitch, yaw]) - ee_link_jacobians = self._get_jacobians(last_observations) - if self.agent['link_names'][-1] is None: - print("End link is empty!!") - else: - self._observations_stale = False - # print(self.agent['link_names'][-1]) - trans, rot = forward_kinematics(self.scara_chain, - self.agent['link_names'], - last_observations[:self.scara_chain.getNrOfJoints()], - base_link=self.agent['link_names'][0], - end_link=self.agent['link_names'][-1]) - # # - rotation_matrix = np.eye(4) - rotation_matrix[:3, :3] = rot - rotation_matrix[:3, 3] = trans - # angle, dir, _ = rotation_from_matrix(rotation_matrix) - # # - # current_quaternion = np.array([angle]+dir.tolist())# - - # I need this calculations for the new reward function, need to send them back to the run scara or calculate them here - current_quaternion = quaternion_from_matrix(rotation_matrix) - - current_ee_tgt = np.ndarray.flatten(get_ee_points(self.agent['end_effector_points'], - trans, - rot).T) - ee_points = current_ee_tgt - self.agent['ee_points_tgt'] - - ee_points_jac_trans, _ = self._get_ee_points_jacobians(ee_link_jacobians, - self.agent['end_effector_points'], - rot) - ee_velocities = self._get_ee_points_velocities(ee_link_jacobians, - self.agent['end_effector_points'], - rot, - last_observations) - - # - # Concatenate the information that defines the robot state - # vector, typically denoted asrobot_id 'x'. 
- state = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - - observations = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - if observations is None: - #need to handle this properly - print("Observations are none!!!") - # else: - # print("We have observations") - - - return observations, ee_points - # initialize the steps - def step(self, action): - time_step = 0 - ee_points = None - self.ob = None - while time_step < self.agent['T']: #rclpy.ok(): - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. Get the Observations - """ - _, ee_points = self._get_obs() - - if ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - rclpy.spin_once(node) - # time.sleep(self.agent['slowness']) - else: - # self.reward_dist = - self.rmse_func(ee_points) - 0.5 * abs(np.sum(self.log_dist_func(ee_points))) - # print("reward: ", self.reward_dist) - - self.reward_dist = - self.rmse_func(ee_points) - # print("reward_dist :", self.reward_dist) - if(self.reward_dist<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - - print("reward: ", self.reward) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - # self._time_lock.acquire(True) - self._pub.publish(self._get_trajectory_message(action[:self.scara_chain.getNrOfJoints()], self.agent))#rclpy.ok(): - - while self.ob is None or ee_points is None: - self.ob, ee_points = self._get_obs() - rclpy.spin_once(node) - # time.sleep(self.agent['slowness']) - - # self._time_lock.release() - - # self._time_lock.release() - rclpy.spin_once(node) - - time_step += 1 - - # print("time_step: ", time_step) - return self.ob, self.reward, done, {} - - def step(self, action): - """ - Dont know if we need this but just in case set the obs to None, so always takes fresh values - """ - self.obs = None - observations = None - - print("Action: ", action) - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. 
Get the Observations - """ - start = timer() - # Check if ROS2 is ok - if rclpy.ok(): - observations, ee_points = self._get_obs() - if self.ob is None or ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - rclpy.spin_once(node) - time_step += 1 - else: - # self.reward_dist = - abs(self.rmse_func(ee_points)) #np.linalg.norm(ee_points) #self.rmse_func(ee_points) #- 0.5 * abs(np.sum(self.log_dist_func(ee_points))) - # # print("reward: ",self.reward_dist) - - self.reward_dist = -self.rmse_func(ee_points) - # print("reward_dist :", self.reward_dist) - if(self.rmse_func(ee_points)<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - # print("Eucledian distance is: ", np.linalg.norm(ee_points)) - - # print("reward: ", self.reward) - # print("rmse_func: ", self.rmse_func(ee_points)) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - # self._time_lock.acquire(True) - self._pub.publish(self._get_trajectory_message(action[:self.scara_chain.getNrOfJoints()], self.agent))#rclpy.ok(): - # self._time_lock.release() - - rclpy.spin_once(node) - self.ob, ee_points = self._get_obs() - - return self.ob, self.reward, done, {} - - def rmse_func(self, ee_points): - """ - Computes the Residual Mean Square Error of the difference between current and desired end-effector position - """ - rmse = np.sqrt(np.mean(np.square(ee_points), dtype=np.float32)) - return rmse - def log_dist_func(self, ee_points): - """ - Computes the Log of the Eucledian Distance Error between current and desired end-effector position - """ - log_dist = np.log(ee_points, dtype=np.float32) # [:(self.scara_chain.getNrOfJoints())] - log_dist[log_dist == -np.inf] = 0.0 - log_dist = np.nan_to_num(log_dist) - # print("log_distance: ", log_dist) - return log_dist - - def _get_jacobians(self, state): - """Produce a Jacobian from the urdf that maps from joint angles to x, y, z. - This makes a 6x6 matrix from 6 joint angles to x, y, z and 3 angles. - The angles are roll, pitch, and yaw (not Euler angles) and are not needed. - Returns a repackaged Jacobian that is 3x6. - """ - - # Initialize a Jacobian for n joint angles by 3 cartesian coords and 3 orientation angles - jacobian = Jacobian(self.scara_chain.getNrOfJoints()) - - # Initialize a joint array for the present n joint angles. - angles = JntArray(self.scara_chain.getNrOfJoints()) - - # Construct the joint array from the most recent joint angles. - for i in range(self.scara_chain.getNrOfJoints()): - angles[i] = state[i] - - # Update the jacobian by solving for the given angles. - self.jac_solver.JntToJac(angles, jacobian) - - # Initialize a numpy array to store the Jacobian. - J = np.array([[jacobian[i, j] for j in range(jacobian.columns())] for i in range(jacobian.rows())]) - - # Only want the cartesian position, not Roll, Pitch, Yaw (RPY) Angles - ee_jacobians = J - return ee_jacobians - - - def _process_observations(self, message, agent): - """Helper fuinction only called by _run_trial to convert a ROS message - to joint angles and velocities. 
- Check for and handle the case where a message is either malformed - or contains joint values in an order different from that expected - in hyperparams['joint_order']""" - # print(message) - # len(message) - if not message: - print("Message is empty"); - else: - # # Check if joint values are in the expected order and size. - if message.joint_names != agent['joint_order']: - # Check that the message is of same size as the expected message. - if len(message.joint_names) != len(agent['joint_order']): - raise MSG_INVALID_JOINT_NAMES_DIFFER - - # Check that all the expected joint values are present in a message. - if not all(map(lambda x,y: x in y, message.joint_names, - [self._valid_joint_set for _ in range(len(message.joint_names))])): - raise MSG_INVALID_JOINT_NAMES_DIFFER - print("Joints differ") - - return np.array(message.actual.positions) # + message.actual.velocities - - - def _get_trajectory_message(self, action, agent): - """Helper function only called by reset() and run_trial(). - Wraps an action vector of joint angles into a JointTrajectory message. - The velocities, accelerations, and effort do not control the arm motion""" - - # Set up a trajectory message to publish. - action_msg = JointTrajectory() - action_msg.joint_names = agent['joint_order'] - - # Create a point to tell the robot to move to. - target = JointTrajectoryPoint() - action_float = [float(i) for i in action] - target.positions = action_float - - # These times determine the speed at which the robot moves: - # it tries to reach the specified target position in 'slowness' time. - # target.time_from_start.nanosec = agent['slowness'] - target.time_from_start.sec = agent['slowness'] - # Package the single point into a trajectory of points with length 1. - action_msg.points = [target] - - return action_msg - - def _get_ee_points_jacobians(self, ref_jacobian, ee_points, ref_rot): - """ - Get the jacobians of the points on a link given the jacobian for that link's origin - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :return: 3N x 6 Jac_trans, each 3 x 6 numpy array is the Jacobian[:3, :] for that point - 3N x 6 Jac_rot, each 3 x 6 numpy array is the Jacobian[3:, :] for that point - """ - ee_points = np.asarray(ee_points) - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - end_effector_points_rot = np.expand_dims(ref_rot.dot(ee_points.T).T, axis=1) - ee_points_jac_trans = np.tile(ref_jacobians_trans, (ee_points.shape[0], 1)) + \ - np.cross(ref_jacobians_rot.T, end_effector_points_rot).transpose( - (0, 2, 1)).reshape(-1, self.scara_chain.getNrOfJoints()) - ee_points_jac_rot = np.tile(ref_jacobians_rot, (ee_points.shape[0], 1)) - return ee_points_jac_trans, ee_points_jac_rot - - def _get_ee_points_velocities(self, ref_jacobian, ee_points, ref_rot, joint_velocities): - """ - Get the velocities of the points on a link - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :param joint_velocities: 1 x 6 numpy array, joint velocities - :return: 3N numpy array, velocities of each point - """ - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - 
ee_velocities_trans = np.dot(ref_jacobians_trans, joint_velocities) - ee_velocities_rot = np.dot(ref_jacobians_rot, joint_velocities) - ee_velocities = ee_velocities_trans + np.cross(ee_velocities_rot.reshape(1, 3), - ref_rot.dot(ee_points.T).T) - return ee_velocities.reshape(-1) diff --git a/baselines/agent/scara_arm/agent_scara_experimental.py b/baselines/agent/scara_arm/agent_scara_experimental.py deleted file mode 100644 index d1b37a4..0000000 --- a/baselines/agent/scara_arm/agent_scara_experimental.py +++ /dev/null @@ -1,528 +0,0 @@ -import os -import time -import copy -import json -import numpy as np # Used pretty much everywhere. -import matplotlib.pyplot as plt -import threading # Used for time locks to synchronize position data. -import rclpy -import tensorflow as tf - -from timeit import default_timer as timer -from scipy import stats -from scipy.interpolate import spline -import geometry_msgs.msg - -from os import path -from rclpy.qos import QoSProfile, qos_profile_sensor_data -from baselines.agent.utility.general_utils import forward_kinematics, get_ee_points, rotation_from_matrix, \ - get_rotation_matrix,quaternion_from_matrix# For getting points and velocities. -from trajectory_msgs.msg import JointTrajectory, JointTrajectoryPoint # Used for publishing scara joint angles. -from control_msgs.msg import JointTrajectoryControllerState -from std_msgs.msg import String - -from baselines.agent.scara_arm.tree_urdf import treeFromFile # For KDL Jacobians -from PyKDL import Jacobian, Chain, ChainJntToJacSolver, JntArray # For KDL Jacobians -from collections import namedtuple -import scipy.ndimage as sp_ndimage -from functools import partial - -from baselines.agent.utility import error -from baselines.agent.utility import seeding -from baselines.agent.utility import spaces - -StartEndPoints = namedtuple('StartEndPoints', ['start', 'target']) -class MSG_INVALID_JOINT_NAMES_DIFFER(Exception): - """Error object exclusively raised by _process_observations.""" - pass - -# class ROBOT_MADE_CONTACT_WITH_GAZEBO_GROUND_SO_RESTART_ROSLAUNCH(Exception): -# """Error object exclusively raised by reset.""" -# pass - - -class AgentSCARAROS(object): - """Connects the SCARA actions and Deep Learning algorithms.""" - - def __init__(self, init_node=True): #hyperparams, , urdf_path, init_node=True - """Initialized Agent. - init_node: Whether or not to initialize a new ROS node.""" - - print("I am in init") - self._observation_msg = None - self.scale = None # must be set from elsewhere based on observations - self.bias = None - self.x_idx = None - self.obs = None - self.reward = None - self.done = None - self.reward_dist = None - self.reward_ctrl = None - self.action_space = None - # to work with baselines a2c need this ones - self.num_envs = 1 - self.remotes = [0] - - # Setup the main node. 
- print("Init ros node") - rclpy.init(args=None) - node = rclpy.create_node('robot_ai_node') - global node - self._pub = node.create_publisher(JointTrajectory,self.agent['joint_publisher']) - self._sub = node.create_subscription(JointTrajectoryControllerState, self.agent['joint_subscriber'], self._observation_callback, qos_profile=qos_profile_sensor_data) - assert self._sub - self._time_lock = threading.RLock() - print("setting time clocks") - - if self.agent['tree_path'].startswith("/"): - fullpath = self.agent['tree_path'] - print(fullpath) - else: - fullpath = os.path.join(os.path.dirname(__file__), "assets", self.agent['tree_path']) - if not path.exists(fullpath): - raise IOError("File %s does not exist" % fullpath) - - print("I am in reading the file path: ", fullpath) - - # Initialize a tree structure from the robot urdf. - # Note that the xacro of the urdf is updated by hand. - # Then the urdf must be compiled. - _, self.scara_tree = treeFromFile(self.agent['tree_path']) - # Retrieve a chain structure between the base and the start of the end effector. - self.scara_chain = self.scara_tree.getChain(self.agent['link_names'][0], self.agent['link_names'][-1]) - print("Nr. of jnts: ", self.scara_chain.getNrOfJoints()) - - # # Initialize a KDL Jacobian solver from the chain. - self.jac_solver = ChainJntToJacSolver(self.scara_chain) - print(self.jac_solver) - self._observations_stale = False - print("after observations stale") - - # - self._currently_resetting = [False for _ in range(1)] - self.reset_joint_angles = [None for _ in range(1)] - - # taken from mujoco in OpenAi how to initialize observation space and action space. - observation, _reward, done, _info = self.step(np.zeros(self.scara_chain.getNrOfJoints())) - assert not done - self.obs_dim = observation.size - print(self.obs_dim) - # print(observation, _reward) - # Here idially we should find the control range of the robot. Unfortunatelly in ROS/KDL there is nothing like this. - # I have tested this with the mujoco enviroment and the output is always same low[-1.,-1.], high[1.,1.] - # bounds = self.model.actuator_ctrlrange.copy() - low = -np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 0] - high = np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 1] - # print("Action Spaces:") - # print("low: ", low, "high: ", high) - self.action_space = spaces.Box(low, high) - - high = np.inf*np.ones(self.obs_dim) - low = -high - self.observation_space = spaces.Box(low, high) - print(self.observation_space) - - # self.seed() - def _observation_callback(self, message): - # message: observation from the robot to store each listen.""" - # self._time_lock.acquire(True, -1) - # with self._time_lock: - self._observations_stale = False - self._observation_msg = message - # if self._currently_resetting: - # epsilon = 1e-3 - # reset_action = self.agent['reset_conditions']['initial_positions'] - # now_action = self._observation_msg.actual.positions - # du = np.linalg.norm(reset_action-now_action, float(np.inf)) - # if du < epsilon: - # self._currently_resetting = False - - # self._time_lock.release() - - - def reset(self): - """Reset function should call reset_model. - In OpenAI in reset_model we are setting the robot to initial pose + some random number. - This function returns the observations which are then used again for the policy calculation. 
- In our case we are going to set the robot to initial pose, for now given by the user.""" - self.obs = None - # self._currently_resetting = True - - # Set the reset position as the initial position from agent hyperparams. - self.reset_joint_angles = self.agent['reset_conditions']['initial_positions'] - # Prepare the present positions to see how far off we are. - now_position = np.asarray(self._observation_msg.actual.positions) - - # Wait until the arm is within epsilon of reset configuration. - # action_msg = JointTrajectory() - # self._time_lock.acquire(True, -1) - # with self._time_lock: - # self._currently_resetting = True - - - # if self._currently_resetting: - # epsilon = 1e-3 - # reset_action = self.agent['reset_conditions']['initial_positions'] - # now_action = self._observation_msg.actual.positions - # du = np.linalg.norm(reset_action-now_action, float(np.inf)) - # if du > epsilon: - # self._pub.publish(self._get_trajectory_message(self.agent['reset_conditions']['initial_positions'], self.agent)) - # time.sleep(self.agent['slowness']) - # self._currently_resetting = True - # print("reset is false.") - # else: - # self._currently_resetting = False - # print("reset is true.") - - self.ob, ee_points = self._get_obs() - # self._time_lock.release() - - # print("resetting: ", self.ob) - - return self.ob - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - def _get_obs(self): - observations = None - ee_points = None - if self._observations_stale is False: - # # Acquire the lock to prevent the subscriber thread from - # # updating times or observation messages. - # self._time_lock.acquire(True) - - obs_message = self._observation_msg - # time.sleep(self.agent['slowness']) - # if obs_message is None: - # obs_message = self._observation_msg - # # time.sleep(self.agent['slowness']) - # # self._observations_stale = True - # self._time_lock.release() - - - - # Make it so that subscriber's thread observation callback - # must be called before publishing again. - - # Collect the end effector points and velocities in - # cartesian coordinates for the state. - # Collect the present joint angles and velocities from ROS for the state. 
- last_observations = self._process_observations(obs_message, self.agent) - - # self._observations_stale = True - - - if last_observations is None: - print("last_observations is empty") - else: - - # # # Get Jacobians from present joint angles and KDL trees - # # # The Jacobians consist of a 6x6 matrix getting its from from - # # # (# joint angles) x (len[x, y, z] + len[roll, pitch, yaw]) - ee_link_jacobians = self._get_jacobians(last_observations) - if self.agent['link_names'][-1] is None: - print("End link is empty!!") - else: - self._observations_stale = False - # print(self.agent['link_names'][-1]) - trans, rot = forward_kinematics(self.scara_chain, - self.agent['link_names'], - last_observations[:self.scara_chain.getNrOfJoints()], - base_link=self.agent['link_names'][0], - end_link=self.agent['link_names'][-1]) - # # - rotation_matrix = np.eye(4) - rotation_matrix[:3, :3] = rot - rotation_matrix[:3, 3] = trans - # angle, dir, _ = rotation_from_matrix(rotation_matrix) - # # - # current_quaternion = np.array([angle]+dir.tolist())# - - # I need this calculations for the new reward function, need to send them back to the run scara or calculate them here - current_quaternion = quaternion_from_matrix(rotation_matrix) - - current_ee_tgt = np.ndarray.flatten(get_ee_points(self.agent['end_effector_points'], - trans, - rot).T) - ee_points = current_ee_tgt - self.agent['ee_points_tgt'] - - ee_points_jac_trans, _ = self._get_ee_points_jacobians(ee_link_jacobians, - self.agent['end_effector_points'], - rot) - ee_velocities = self._get_ee_points_velocities(ee_link_jacobians, - self.agent['end_effector_points'], - rot, - last_observations) - - # - # Concatenate the information that defines the robot state - # vector, typically denoted asrobot_id 'x'. - state = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - - observations = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - if observations is None: - #need to handle this properly - print("Observations are none!!!") - # else: - # print("We have observations") - - - return observations, ee_points - # initialize the steps - def step(self, action): - time_step = 0 - ee_points = None - self.ob = None - while time_step < self.agent['T']: #rclpy.ok(): - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. 
Get the Observations - """ - _, ee_points = self._get_obs() - - if ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - rclpy.spin_once(node) - time.sleep(self.agent['slowness']) - else: - # self.reward_dist = - self.rmse_func(ee_points) - 0.5 * abs(np.sum(self.log_dist_func(ee_points))) - # print("reward: ", self.reward_dist) - - self.reward_dist = - self.rmse_func(ee_points) - # print("reward_dist :", self.reward_dist) - if(self.reward_dist<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - - print("reward: ", self.reward) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - # self._time_lock.acquire(True) - self._pub.publish(self._get_trajectory_message(action[:self.scara_chain.getNrOfJoints()], self.agent))#rclpy.ok(): - - while self.ob is None or ee_points is None: - self.ob, ee_points = self._get_obs() - rclpy.spin_once(node) - time.sleep(self.agent['slowness']) - - # self._time_lock.release() - - # self._time_lock.release() - rclpy.spin_once(node) - - time_step += 1 - - # print("time_step: ", time_step) - return self.ob, self.reward, done, {} - - def step(self, action): - """ - Dont know if we need this but just in case set the obs to None, so always takes fresh values - """ - self.obs = None - observations = None - - # print("Action: ", action) - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. Get the Observations - """ - start = timer() - # Check if ROS2 is ok - if rclpy.ok(): - observations, ee_points = self._get_obs() - if self.ob is None or ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - rclpy.spin_once(node) - time_step += 1 - else: - # self.reward_dist = - abs(self.rmse_func(ee_points)) #np.linalg.norm(ee_points) #self.rmse_func(ee_points) #- 0.5 * abs(np.sum(self.log_dist_func(ee_points))) - # # print("reward: ",self.reward_dist) - - self.reward_dist = -self.rmse_func(ee_points) - # print("reward_dist :", self.reward_dist) - if(self.rmse_func(ee_points)<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - # print("Eucledian distance is: ", np.linalg.norm(ee_points)) - - # print("reward: ", self.reward) - # print("rmse_func: ", self.rmse_func(ee_points)) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - # self._time_lock.acquire(True) - # self._pub.publish(self._get_trajectory_message(action[:self.scara_chain.getNrOfJoints()], self.agent))#rclpy.ok(): - - epsilon = 1e-3 - current_action = action[:self.scara_chain.getNrOfJoints()]#self._get_trajectory_message(action[:self.scara_chain.getNrOfJoints()], self.agent).points.positions - # print("current_action: ",action[:self.scara_chain.getNrOfJoints()]) - now_position = self._observation_msg.actual.positions - # print("now position: ", now_position) - du = np.linalg.norm(current_action-now_position, float(np.inf)) - if du > epsilon: - self._pub.publish(self._get_trajectory_message(action[:self.scara_chain.getNrOfJoints()], self.agent))#rclpy.ok(): - # time.sleep(self.agent['slowness']) - # self._currently_resetting = True 
- print("positions do not match.") - else: - # self._currently_resetting = False - print("positions match.") - self.ob, ee_points = self._get_obs() - # me.sleep(self.agent['slowness']) - # self._time_lock.release() - - rclpy.spin_once(node) - # self.ob, ee_points = self._get_obs() - - return self.ob, self.reward, done, {} - - def rmse_func(self, ee_points): - """ - Computes the Residual Mean Square Error of the difference between current and desired end-effector position - """ - rmse = np.sqrt(np.mean(np.square(ee_points), dtype=np.float32)) - return rmse - def log_dist_func(self, ee_points): - """ - Computes the Log of the Eucledian Distance Error between current and desired end-effector position - """ - log_dist = np.log(ee_points, dtype=np.float32) # [:(self.scara_chain.getNrOfJoints())] - log_dist[log_dist == -np.inf] = 0.0 - log_dist = np.nan_to_num(log_dist) - # print("log_distance: ", log_dist) - return log_dist - - def _get_jacobians(self, state): - """Produce a Jacobian from the urdf that maps from joint angles to x, y, z. - This makes a 6x6 matrix from 6 joint angles to x, y, z and 3 angles. - The angles are roll, pitch, and yaw (not Euler angles) and are not needed. - Returns a repackaged Jacobian that is 3x6. - """ - - # Initialize a Jacobian for n joint angles by 3 cartesian coords and 3 orientation angles - jacobian = Jacobian(self.scara_chain.getNrOfJoints()) - - # Initialize a joint array for the present n joint angles. - angles = JntArray(self.scara_chain.getNrOfJoints()) - - # Construct the joint array from the most recent joint angles. - for i in range(self.scara_chain.getNrOfJoints()): - angles[i] = state[i] - - # Update the jacobian by solving for the given angles. - self.jac_solver.JntToJac(angles, jacobian) - - # Initialize a numpy array to store the Jacobian. - J = np.array([[jacobian[i, j] for j in range(jacobian.columns())] for i in range(jacobian.rows())]) - - # Only want the cartesian position, not Roll, Pitch, Yaw (RPY) Angles - ee_jacobians = J - return ee_jacobians - - - def _process_observations(self, message, agent): - """Helper fuinction only called by _run_trial to convert a ROS message - to joint angles and velocities. - Check for and handle the case where a message is either malformed - or contains joint values in an order different from that expected - in hyperparams['joint_order']""" - # print(message) - # len(message) - if not message: - print("Message is empty"); - else: - # # Check if joint values are in the expected order and size. - if message.joint_names != agent['joint_order']: - # Check that the message is of same size as the expected message. - if len(message.joint_names) != len(agent['joint_order']): - raise MSG_INVALID_JOINT_NAMES_DIFFER - - # Check that all the expected joint values are present in a message. - if not all(map(lambda x,y: x in y, message.joint_names, - [self._valid_joint_set for _ in range(len(message.joint_names))])): - raise MSG_INVALID_JOINT_NAMES_DIFFER - print("Joints differ") - - return np.array(message.actual.positions) # + message.actual.velocities - - - def _get_trajectory_message(self, action, agent): - """Helper function only called by reset() and run_trial(). - Wraps an action vector of joint angles into a JointTrajectory message. - The velocities, accelerations, and effort do not control the arm motion""" - - # Set up a trajectory message to publish. - action_msg = JointTrajectory() - action_msg.joint_names = agent['joint_order'] - - # Create a point to tell the robot to move to. 
- target = JointTrajectoryPoint() - action_float = [float(i) for i in action] - target.positions = action_float - - # These times determine the speed at which the robot moves: - # it tries to reach the specified target position in 'slowness' time. - # target.time_from_start.nanosec = agent['slowness'] - target.time_from_start.sec = agent['slowness'] - # Package the single point into a trajectory of points with length 1. - action_msg.points = [target] - - return action_msg - - def _get_ee_points_jacobians(self, ref_jacobian, ee_points, ref_rot): - """ - Get the jacobians of the points on a link given the jacobian for that link's origin - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :return: 3N x 6 Jac_trans, each 3 x 6 numpy array is the Jacobian[:3, :] for that point - 3N x 6 Jac_rot, each 3 x 6 numpy array is the Jacobian[3:, :] for that point - """ - ee_points = np.asarray(ee_points) - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - end_effector_points_rot = np.expand_dims(ref_rot.dot(ee_points.T).T, axis=1) - ee_points_jac_trans = np.tile(ref_jacobians_trans, (ee_points.shape[0], 1)) + \ - np.cross(ref_jacobians_rot.T, end_effector_points_rot).transpose( - (0, 2, 1)).reshape(-1, self.scara_chain.getNrOfJoints()) - ee_points_jac_rot = np.tile(ref_jacobians_rot, (ee_points.shape[0], 1)) - return ee_points_jac_trans, ee_points_jac_rot - - def _get_ee_points_velocities(self, ref_jacobian, ee_points, ref_rot, joint_velocities): - """ - Get the velocities of the points on a link - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :param joint_velocities: 1 x 6 numpy array, joint velocities - :return: 3N numpy array, velocities of each point - """ - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - ee_velocities_trans = np.dot(ref_jacobians_trans, joint_velocities) - ee_velocities_rot = np.dot(ref_jacobians_rot, joint_velocities) - ee_velocities = ee_velocities_trans + np.cross(ee_velocities_rot.reshape(1, 3), - ref_rot.dot(ee_points.T).T) - return ee_velocities.reshape(-1) diff --git a/baselines/agent/scara_arm/agent_scara_real.py b/baselines/agent/scara_arm/agent_scara_real.py deleted file mode 100644 index 04ebaaa..0000000 --- a/baselines/agent/scara_arm/agent_scara_real.py +++ /dev/null @@ -1,493 +0,0 @@ -import os -import time -import copy -import json -import numpy as np # Used pretty much everywhere. -import matplotlib.pyplot as plt -import threading # Used for time locks to synchronize position data. -import rclpy -import tensorflow as tf - -from timeit import default_timer as timer -from scipy import stats -from scipy.interpolate import spline -import geometry_msgs.msg - -from os import path - -from rclpy.executors import MultiThreadedExecutor -from rclpy.qos import QoSProfile, qos_profile_sensor_data -from baselines.agent.utility.general_utils import forward_kinematics, get_ee_points, rotation_from_matrix, \ - get_rotation_matrix,quaternion_from_matrix# For getting points and velocities. -from trajectory_msgs.msg import JointTrajectory, JointTrajectoryPoint # Used for publishing scara joint angles. 
-from control_msgs.msg import JointTrajectoryControllerState -from std_msgs.msg import String - -from baselines.agent.scara_arm.tree_urdf import treeFromFile # For KDL Jacobians -from PyKDL import Jacobian, Chain, ChainJntToJacSolver, JntArray # For KDL Jacobians -from collections import namedtuple -import scipy.ndimage as sp_ndimage -from functools import partial - -from baselines.agent.utility import error -from baselines.agent.utility import seeding -from baselines.agent.utility import spaces - -StartEndPoints = namedtuple('StartEndPoints', ['start', 'target']) -class MSG_INVALID_JOINT_NAMES_DIFFER(Exception): - """Error object exclusively raised by _process_observations.""" - pass - -# class ROBOT_MADE_CONTACT_WITH_GAZEBO_GROUND_SO_RESTART_ROSLAUNCH(Exception): -# """Error object exclusively raised by reset.""" -# pass - - -class AgentSCARAROS(object): - """Connects the SCARA actions and Deep Learning algorithms.""" - - def __init__(self, init_node=True): #hyperparams, , urdf_path, init_node=True - """Initialized Agent. - init_node: Whether or not to initialize a new ROS node.""" - - print("I am in init") - self._observation_msg = None - self.scale = None # must be set from elsewhere based on observations - self.bias = None - self.x_idx = None - self.obs = None - self.reward = None - self.done = None - self.reward_dist = None - self.reward_ctrl = None - self.action_space = None - # to work with baselines a2c need this ones - self.num_envs = 1 - self.remotes = [0] - - # Setup the main node. - print("Init ros node") - - rclpy.init(args=None) - - self.node = rclpy.create_node('robot_ai_node') - self.executor = MultiThreadedExecutor() - - self._pub = self.node.create_publisher(JointTrajectory, - self.agent['joint_publisher'], - qos_profile=qos_profile_sensor_data) - - self._sub = self.node.create_subscription(JointTrajectoryControllerState, - self.agent['joint_subscriber'], - self._observation_callback, - qos_profile=qos_profile_sensor_data) - assert self._sub - - self.executor.add_node(self.node) - - if self.agent['tree_path'].startswith("/"): - fullpath = self.agent['tree_path'] - print(fullpath) - else: - fullpath = os.path.join(os.path.dirname(__file__), "assets", self.agent['tree_path']) - if not path.exists(fullpath): - raise IOError("File %s does not exist" % fullpath) - - print("I am in reading the file path: ", fullpath) - - # Initialize a tree structure from the robot urdf. - # Note that the xacro of the urdf is updated by hand. - # Then the urdf must be compiled. - _, self.scara_tree = treeFromFile(self.agent['tree_path']) - # Retrieve a chain structure between the base and the start of the end effector. - self.scara_chain = self.scara_tree.getChain(self.agent['link_names'][0], self.agent['link_names'][-1]) - print("Nr. of jnts: ", self.scara_chain.getNrOfJoints()) - - # Initialize a KDL Jacobian solver from the chain. - self.jac_solver = ChainJntToJacSolver(self.scara_chain) - print(self.jac_solver) - self._observations_stale = False - print("after observations stale") - - self._currently_resetting = [False for _ in range(1)] - self.reset_joint_angles = [None for _ in range(1)] - - # taken from mujoco in OpenAi how to initialize observation space and action space. - observation, _reward, done, _info = self.step(np.zeros(self.scara_chain.getNrOfJoints())) - #assert not done - self.obs_dim = observation.size - print(self.obs_dim) - # print(observation, _reward) - # Here idially we should find the control range of the robot. Unfortunatelly in ROS/KDL there is nothing like this. 
- # I have tested this with the mujoco enviroment and the output is always same low[-1.,-1.], high[1.,1.] - # bounds = self.model.actuator_ctrlrange.copy() - low = -np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 0] - high = np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 1] - # print("Action Spaces:") - # print("low: ", low, "high: ", high) - self.action_space = spaces.Box(low, high) - - high = np.inf*np.ones(self.obs_dim) - low = -high - self.observation_space = spaces.Box(low, high) - print(self.observation_space) - - self.goal_cmd = [0]*self.scara_chain.getNrOfJoints() - self.goal_vel = [ self.agent['goal_vel'] ]*self.scara_chain.getNrOfJoints() - - - self.goal_vel_value = [self.agent['goal_vel']]*self.scara_chain.getNrOfJoints() - self.goal_effort_value = [float('nan')]*self.scara_chain.getNrOfJoints() - self.joint_order = self.agent['joint_order'] - - - def _observation_callback(self, message): - # message: observation from the robot to store each listen.""" - - self._observations_stale = False - self._observation_msg = message - - def reset(self): - """Reset function should call reset_model. - In OpenAI in reset_model we are setting the robot to initial pose + some random number. - This function returns the observations which are then used again for the policy calculation. - In our case we are going to set the robot to initial pose, for now given by the user.""" - self.obs = None - - # Set the reset position as the initial position from agent hyperparams. - self.reset_joint_angles = self.agent['reset_conditions']['initial_positions'] - self.ob, ee_points = self._get_obs() - - return self.ob - - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def _get_obs(self): - observations = None - ee_points = None - if self._observations_stale is False: - # Acquire the lock to prevent the subscriber thread from - # updating times or observation messages. - - obs_message = self._observation_msg - - # Make it so that subscriber's thread observation callback - # must be called before publishing again. - - # Collect the end effector points and velocities in - # cartesian coordinates for the state. - # Collect the present joint angles and velocities from ROS for the state. 
- last_observations = self._process_observations(obs_message, self.agent) - - if last_observations is None: - print("last_observations is empty") - else: - # Get Jacobians from present joint angles and KDL trees - # The Jacobians consist of a 6x6 matrix getting its from from - # (# joint angles) x (len[x, y, z] + len[roll, pitch, yaw]) - ee_link_jacobians = self._get_jacobians(last_observations) - if self.agent['link_names'][-1] is None: - print("End link is empty!!") - else: - self._observations_stale = False - # print(self.agent['link_names'][-1]) - trans, rot = forward_kinematics(self.scara_chain, - self.agent['link_names'], - last_observations[:self.scara_chain.getNrOfJoints()], - base_link=self.agent['link_names'][0], - end_link=self.agent['link_names'][-1]) - rotation_matrix = np.eye(4) - rotation_matrix[:3, :3] = rot - rotation_matrix[:3, 3] = trans - - # I need this calculations for the new reward function, need to send them back to the run scara or calculate them here - current_quaternion = quaternion_from_matrix(rotation_matrix) - - current_ee_tgt = np.ndarray.flatten(get_ee_points(self.agent['end_effector_points'], - trans, - rot).T) - ee_points = current_ee_tgt - self.agent['ee_points_tgt'] - - ee_points_jac_trans, _ = self._get_ee_points_jacobians(ee_link_jacobians, - self.agent['end_effector_points'], - rot) - ee_velocities = self._get_ee_points_velocities(ee_link_jacobians, - self.agent['end_effector_points'], - rot, - last_observations) - - # Concatenate the information that defines the robot state - # vector, typically denoted asrobot_id 'x'. - state = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - - observations = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - if observations is None: - print("Observations are none!!!") - - return observations, ee_points - - # initialize the steps - def step(self, action): - time_step = 0 - ee_points = None - self.ob = None - while time_step < self.agent['T']: - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. 
Get the Observations - """ - self.executor.spin_once() - - self.ob, ee_points = self._get_obs() - - if ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - - else: - # self.reward_dist = - self.rmse_func(ee_points) - 0.5 * abs(np.sum(self.log_dist_func(ee_points))) - # print("reward: ", self.reward_dist) - - self.reward_dist = - self.rmse_func(ee_points) - # print("reward_dist :", self.reward_dist) - if(self.reward_dist<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - - print("reward: ", self.reward) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - while self.ob is None or ee_points is None: - self.ob, ee_points = self._get_obs() - self.executor.spin_once() - time.sleep(self.agent['slowness']) - print("Trying to get an observation") - - time_step += 1 - - self.goal_cmd = self._observation_msg.actual.positions - - self.last_time = time.time() - - print ("Exit step") - - return self.ob, self.reward, done, {} - - def start_steps(self): - self.last_time = time.time() - self.goal_cmd = self._observation_msg.actual.positions - - def step(self, action): - """ - Dont know if we need this but just in case set the obs to None, so always takes fresh values - """ - - self.obs = None - observations = None - - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. Get the Observations - """ - # Check if ROS2 is ok - if rclpy.ok(): - - self.executor.spin_once() - observations, ee_points = self._get_obs() - if self.ob is None or ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - self.goal_cmd = self._observation_msg.actual.positions - done = False - else: - self.reward_dist = -self.rmse_func(ee_points) - - if(self.rmse_func(ee_points)<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - print("*Eucledian distance is: ", np.linalg.norm(ee_points)) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - self.ob, ee_points = self._get_obs() - - dt = time.time() - self.last_time - # print("dt: ", dt) - for i in range(self.scara_chain.getNrOfJoints()): - if(self._observation_msg.actual.positions[i] > action[i]): - self.goal_vel[i] = -self.goal_vel_value[0] - else: - self.goal_vel[i] = self.goal_vel_value[0] - self.goal_cmd[i] += dt*self.goal_vel[i] - - self.last_time = time.time() - - action_msg = JointTrajectory() - action_msg.joint_names = self.joint_order - - # Create a point to tell the robot to move to. - target = JointTrajectoryPoint() - target.positions = self.goal_cmd - - target.velocities = self.goal_vel_value - target.effort = self.goal_effort_value - - # Package the single point into a trajectory of points with length 1. 
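# --- Editor's illustrative sketch (not part of the original file): the
# constant-velocity command integration used by the real-robot step() above.
# Each joint command is advanced toward the requested action at a fixed speed
# per call; the function and argument names are assumptions.
def integrate_goal_cmd(goal_cmd, current_positions, action, goal_vel, dt):
    """Advance each joint command by dt*goal_vel toward the action target."""
    for i in range(len(goal_cmd)):
        direction = -1.0 if current_positions[i] > action[i] else 1.0
        goal_cmd[i] += dt * direction * goal_vel
    return goal_cmd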
- action_msg.points = [target] - - self._pub.publish(action_msg) - - - return self.ob, self.reward, done, {} - - def rmse_func(self, ee_points): - """ - Computes the Residual Mean Square Error of the difference between current and desired end-effector position - """ - rmse = np.sqrt(np.mean(np.square(ee_points), dtype=np.float32)) - return rmse - - def log_dist_func(self, ee_points): - """ - Computes the Log of the Eucledian Distance Error between current and desired end-effector position - """ - log_dist = np.log(ee_points, dtype=np.float32) - log_dist[log_dist == -np.inf] = 0.0 - log_dist = np.nan_to_num(log_dist) - - return log_dist - - def _get_jacobians(self, state): - """Produce a Jacobian from the urdf that maps from joint angles to x, y, z. - This makes a 6x6 matrix from 6 joint angles to x, y, z and 3 angles. - The angles are roll, pitch, and yaw (not Euler angles) and are not needed. - Returns a repackaged Jacobian that is 3x6. - """ - - # Initialize a Jacobian for n joint angles by 3 cartesian coords and 3 orientation angles - jacobian = Jacobian(self.scara_chain.getNrOfJoints()) - - # Initialize a joint array for the present n joint angles. - angles = JntArray(self.scara_chain.getNrOfJoints()) - - # Construct the joint array from the most recent joint angles. - for i in range(self.scara_chain.getNrOfJoints()): - angles[i] = state[i] - - # Update the jacobian by solving for the given angles. - self.jac_solver.JntToJac(angles, jacobian) - - # Initialize a numpy array to store the Jacobian. - J = np.array([[jacobian[i, j] for j in range(jacobian.columns())] for i in range(jacobian.rows())]) - - # Only want the cartesian position, not Roll, Pitch, Yaw (RPY) Angles - ee_jacobians = J - return ee_jacobians - - - def _process_observations(self, message, agent): - """Helper fuinction only called by _run_trial to convert a ROS message - to joint angles and velocities. - Check for and handle the case where a message is either malformed - or contains joint values in an order different from that expected - in hyperparams['joint_order']""" - - if not message: - print("Message is empty"); - else: - # # Check if joint values are in the expected order and size. - if message.joint_names != agent['joint_order']: - # Check that the message is of same size as the expected message. - if len(message.joint_names) != len(agent['joint_order']): - raise MSG_INVALID_JOINT_NAMES_DIFFER - - # Check that all the expected joint values are present in a message. - if not all(map(lambda x,y: x in y, message.joint_names, - [self._valid_joint_set for _ in range(len(message.joint_names))])): - raise MSG_INVALID_JOINT_NAMES_DIFFER - print("Joints differ") - - return np.array(message.actual.positions) # + message.actual.velocities - - - def _get_trajectory_message(self, action, agent): - """Helper function only called by reset() and run_trial(). - Wraps an action vector of joint angles into a JointTrajectory message. - The velocities, accelerations, and effort do not control the arm motion""" - - # Set up a trajectory message to publish. - action_msg = JointTrajectory() - action_msg.joint_names = agent['joint_order'] - - # Create a point to tell the robot to move to. - target = JointTrajectoryPoint() - action_float = [float(i) for i in action] - target.positions = action_float - - # These times determine the speed at which the robot moves: - # it tries to reach the specified target position in 'slowness' time. 
- # target.time_from_start.nanosec = agent['slowness'] - - i, d = divmod(agent['slowness'], 1) - target.time_from_start.sec = int(i) - target.time_from_start.nanosec = int(d*1000000000); - # Package the single point into a trajectory of points with length 1. - action_msg.points = [target] - - return action_msg - - def _get_ee_points_jacobians(self, ref_jacobian, ee_points, ref_rot): - """ - Get the jacobians of the points on a link given the jacobian for that link's origin - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :return: 3N x 6 Jac_trans, each 3 x 6 numpy array is the Jacobian[:3, :] for that point - 3N x 6 Jac_rot, each 3 x 6 numpy array is the Jacobian[3:, :] for that point - """ - ee_points = np.asarray(ee_points) - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - end_effector_points_rot = np.expand_dims(ref_rot.dot(ee_points.T).T, axis=1) - ee_points_jac_trans = np.tile(ref_jacobians_trans, (ee_points.shape[0], 1)) + \ - np.cross(ref_jacobians_rot.T, end_effector_points_rot).transpose( - (0, 2, 1)).reshape(-1, self.scara_chain.getNrOfJoints()) - ee_points_jac_rot = np.tile(ref_jacobians_rot, (ee_points.shape[0], 1)) - return ee_points_jac_trans, ee_points_jac_rot - - def _get_ee_points_velocities(self, ref_jacobian, ee_points, ref_rot, joint_velocities): - """ - Get the velocities of the points on a link - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :param joint_velocities: 1 x 6 numpy array, joint velocities - :return: 3N numpy array, velocities of each point - """ - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - ee_velocities_trans = np.dot(ref_jacobians_trans, joint_velocities) - ee_velocities_rot = np.dot(ref_jacobians_rot, joint_velocities) - ee_velocities = ee_velocities_trans + np.cross(ee_velocities_rot.reshape(1, 3), - ref_rot.dot(ee_points.T).T) - return ee_velocities.reshape(-1) diff --git a/baselines/agent/scara_arm/agent_scara_real1.py b/baselines/agent/scara_arm/agent_scara_real1.py deleted file mode 100644 index f3dd1df..0000000 --- a/baselines/agent/scara_arm/agent_scara_real1.py +++ /dev/null @@ -1,434 +0,0 @@ -import os -import time -import copy -import json -import numpy as np # Used pretty much everywhere. -import matplotlib.pyplot as plt -import threading # Used for time locks to synchronize position data. -import rclpy -import tensorflow as tf - -from timeit import default_timer as timer -from scipy import stats -from scipy.interpolate import spline -import geometry_msgs.msg - -from os import path - -from rclpy.qos import QoSProfile, qos_profile_sensor_data -from rclpy.executors import MultiThreadedExecutor - -from baselines.agent.utility.general_utils import forward_kinematics, get_ee_points, rotation_from_matrix, \ - get_rotation_matrix,quaternion_from_matrix# For getting points and velocities. -from trajectory_msgs.msg import JointTrajectory, JointTrajectoryPoint # Used for publishing scara joint angles. 
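# --- Editor's illustrative sketch (not part of the original file): splitting
# the float 'slowness' duration (in seconds) into the integer sec/nanosec
# fields of time_from_start, as done with divmod in _get_trajectory_message
# above. The helper name is an assumption.
def to_sec_nanosec(duration_s):
    whole, frac = divmod(duration_s, 1)
    return int(whole), int(frac * 1000000000)

# e.g. to_sec_nanosec(1.5) -> (1, 500000000)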
-from control_msgs.msg import JointTrajectoryControllerState -from std_msgs.msg import String - -from baselines.agent.scara_arm.tree_urdf import treeFromFile # For KDL Jacobians -from PyKDL import Jacobian, Chain, ChainJntToJacSolver, JntArray # For KDL Jacobians -from collections import namedtuple -import scipy.ndimage as sp_ndimage -from functools import partial - -from baselines.agent.utility import error -from baselines.agent.utility import seeding -from baselines.agent.utility import spaces - -StartEndPoints = namedtuple('StartEndPoints', ['start', 'target']) -class MSG_INVALID_JOINT_NAMES_DIFFER(Exception): - """Error object exclusively raised by _process_observations.""" - pass - -# class ROBOT_MADE_CONTACT_WITH_GAZEBO_GROUND_SO_RESTART_ROSLAUNCH(Exception): -# """Error object exclusively raised by reset.""" -# pass - - -class AgentSCARAROS(object): - """Connects the SCARA actions and Deep Learning algorithms.""" - - def __init__(self, init_node=True, observation=None): #hyperparams, , urdf_path, init_node=True - """Initialized Agent. - init_node: Whether or not to initialize a new ROS node.""" - - print("I am in init") - self._observation_msg = observation - self.scale = None # must be set from elsewhere based on observations - self.bias = None - self.x_idx = None - self.obs = None - self.reward = None - self.done = None - self.reward_dist = None - self.reward_ctrl = None - self.action_space = None - # to work with baselines a2c need this ones - self.num_envs = 1 - self.remotes = [0] - - # Setup the main node. - print("Init ros node") - - if self.agent['tree_path'].startswith("/"): - fullpath = self.agent['tree_path'] - print(fullpath) - else: - fullpath = os.path.join(os.path.dirname(__file__), "assets", self.agent['tree_path']) - if not path.exists(fullpath): - raise IOError("File %s does not exist" % fullpath) - - print("I am in reading the file path: ", fullpath) - - # Initialize a tree structure from the robot urdf. - # Note that the xacro of the urdf is updated by hand. - # Then the urdf must be compiled. - _, self.scara_tree = treeFromFile(self.agent['tree_path']) - # Retrieve a chain structure between the base and the start of the end effector. - self.scara_chain = self.scara_tree.getChain(self.agent['link_names'][0], self.agent['link_names'][-1]) - print("Nr. of jnts: ", self.scara_chain.getNrOfJoints()) - - # Initialize a KDL Jacobian solver from the chain. - self.jac_solver = ChainJntToJacSolver(self.scara_chain) - print(self.jac_solver) - self._observations_stale = False - print("after observations stale") - - self._currently_resetting = [False for _ in range(1)] - self.reset_joint_angles = [None for _ in range(1)] - - # taken from mujoco in OpenAi how to initialize observation space and action space. - observation, _reward, done, _info = self.step(np.zeros(self.scara_chain.getNrOfJoints())) - #assert not done - self.obs_dim = observation.size - print(self.obs_dim) - - # Here idially we should find the control range of the robot. Unfortunatelly in ROS/KDL there is nothing like this. - # I have tested this with the mujoco enviroment and the output is always same low[-1.,-1.], high[1.,1.] 
- # bounds = self.model.actuator_ctrlrange.copy() - low = -np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 0] - high = np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 1] - - self.action_space = spaces.Box(low, high) - - high = np.inf*np.ones(self.obs_dim) - low = -high - self.observation_space = spaces.Box(low, high) - print(self.observation_space) - - # self.seed() - - def reset(self): - """Reset function should call reset_model. - In OpenAI in reset_model we are setting the robot to initial pose + some random number. - This function returns the observations which are then used again for the policy calculation. - In our case we are going to set the robot to initial pose, for now given by the user.""" - self.obs = None - # self._currently_resetting = True - - # Set the reset position as the initial position from agent hyperparams. - self.reset_joint_angles = self.agent['reset_conditions']['initial_positions'] - # Prepare the present positions to see how far off we are. - now_position = np.asarray(self._observation_msg.actual.positions) - - # Wait until the arm is within epsilon of reset configuration. - # action_msg = JointTrajectory() - self._currently_resetting = True - self.ob, ee_points = self._get_obs() - - # print("resetting: ", self.ob) - - return self.ob - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - - def _get_obs(self): - observations = None - ee_points = None - if self._observations_stale is False: - # # Acquire the lock to prevent the subscriber thread from - # # updating times or observation messages. - obs_message = self._observation_msg - - # Make it so that subscriber's thread observation callback - # must be called before publishing again. - - # Collect the end effector points and velocities in - # cartesian coordinates for the state. - # Collect the present joint angles and velocities from ROS for the state. - last_observations = self._process_observations(obs_message, self.agent) - - if last_observations is None: - print("last_observations is empty") - else: - - # # # Get Jacobians from present joint angles and KDL trees - # # # The Jacobians consist of a 6x6 matrix getting its from from - # # # (# joint angles) x (len[x, y, z] + len[roll, pitch, yaw]) - ee_link_jacobians = self._get_jacobians(last_observations) - if self.agent['link_names'][-1] is None: - print("End link is empty!!") - else: - self._observations_stale = False - # print(self.agent['link_names'][-1]) - trans, rot = forward_kinematics(self.scara_chain, - self.agent['link_names'], - last_observations[:self.scara_chain.getNrOfJoints()], - base_link=self.agent['link_names'][0], - end_link=self.agent['link_names'][-1]) - rotation_matrix = np.eye(4) - rotation_matrix[:3, :3] = rot - rotation_matrix[:3, 3] = trans - - # I need this calculations for the new reward function, need to send them back to the run scara or calculate them here - current_quaternion = quaternion_from_matrix(rotation_matrix) - - current_ee_tgt = np.ndarray.flatten(get_ee_points(self.agent['end_effector_points'], - trans, - rot).T) - ee_points = current_ee_tgt - self.agent['ee_points_tgt'] - - ee_points_jac_trans, _ = self._get_ee_points_jacobians(ee_link_jacobians, - self.agent['end_effector_points'], - rot) - ee_velocities = self._get_ee_points_velocities(ee_link_jacobians, - self.agent['end_effector_points'], - rot, - last_observations) - - # Concatenate the information that defines the robot state - # vector, typically denoted asrobot_id 'x'. 
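# --- Editor's illustrative sketch (not part of the original file): how the
# action and observation spaces above could be built with gym.spaces instead
# of the local baselines.agent.utility.spaces module, assuming the two expose
# a compatible Box interface. Function name and dtype are assumptions.
import numpy as np
from gym import spaces

def make_spaces(n_joints, obs_dim):
    act_high = np.pi / 2.0 * np.ones(n_joints)
    action_space = spaces.Box(low=-act_high, high=act_high, dtype=np.float64)
    obs_high = np.inf * np.ones(obs_dim)
    observation_space = spaces.Box(low=-obs_high, high=obs_high, dtype=np.float64)
    return action_space, observation_space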
- state = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - - observations = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - if observations is None: - #need to handle this properly - print("Observations are none!!!") - # else: - # print("We have observations") - - - return observations, ee_points - # initialize the steps - def step(self, action): - time_step = 0 - ee_points = None - self.ob = None - while time_step < self.agent['T']: #rclpy.ok(): - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. Get the Observations - """ - self.ob, ee_points = self._get_obs() - - if ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - #rclpy.spin_once(node) - #time.sleep(self.agent['slowness']) - else: - # self.reward_dist = - self.rmse_func(ee_points) - 0.5 * abs(np.sum(self.log_dist_func(ee_points))) - # print("reward: ", self.reward_dist) - - self.reward_dist = - self.rmse_func(ee_points) - # print("reward_dist :", self.reward_dist) - if(self.reward_dist<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - - print("reward: ", self.reward) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - while self.ob is None or ee_points is None: - self.ob, ee_points = self._get_obs() - time.sleep(self.agent['slowness']) - - time_step += 1 - - return self.ob, self.reward, done, {} - - def step(self, action): - """ - Dont know if we need this but just in case set the obs to None, so always takes fresh values - """ - self.obs = None - observations = None - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. 
Get the Observations - """ - start = timer() - # Check if ROS2 is ok - if rclpy.ok(): - observations, ee_points = self._get_obs() - if self.ob is None or ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - #rclpy.spin_once(node) - time_step += 1 - else: - - self.reward_dist = -self.rmse_func(ee_points) - - if(self.rmse_func(ee_points)<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - print("*Eucledian distance is: ", np.linalg.norm(ee_points)) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - self.ob, ee_points = self._get_obs() - - return self.ob, self.reward, done, {} - - def rmse_func(self, ee_points): - """ - Computes the Residual Mean Square Error of the difference between current and desired end-effector position - """ - rmse = np.sqrt(np.mean(np.square(ee_points), dtype=np.float32)) - return rmse - - def log_dist_func(self, ee_points): - """ - Computes the Log of the Eucledian Distance Error between current and desired end-effector position - """ - log_dist = np.log(ee_points, dtype=np.float32) # [:(self.scara_chain.getNrOfJoints())] - log_dist[log_dist == -np.inf] = 0.0 - log_dist = np.nan_to_num(log_dist) - # print("log_distance: ", log_dist) - return log_dist - - def _get_jacobians(self, state): - """Produce a Jacobian from the urdf that maps from joint angles to x, y, z. - This makes a 6x6 matrix from 6 joint angles to x, y, z and 3 angles. - The angles are roll, pitch, and yaw (not Euler angles) and are not needed. - Returns a repackaged Jacobian that is 3x6. - """ - - # Initialize a Jacobian for n joint angles by 3 cartesian coords and 3 orientation angles - jacobian = Jacobian(self.scara_chain.getNrOfJoints()) - - # Initialize a joint array for the present n joint angles. - angles = JntArray(self.scara_chain.getNrOfJoints()) - - # Construct the joint array from the most recent joint angles. - for i in range(self.scara_chain.getNrOfJoints()): - angles[i] = state[i] - - # Update the jacobian by solving for the given angles. - self.jac_solver.JntToJac(angles, jacobian) - - # Initialize a numpy array to store the Jacobian. - J = np.array([[jacobian[i, j] for j in range(jacobian.columns())] for i in range(jacobian.rows())]) - - # Only want the cartesian position, not Roll, Pitch, Yaw (RPY) Angles - ee_jacobians = J - return ee_jacobians - - - def _process_observations(self, message, agent): - """Helper fuinction only called by _run_trial to convert a ROS message - to joint angles and velocities. - Check for and handle the case where a message is either malformed - or contains joint values in an order different from that expected - in hyperparams['joint_order']""" - # print(message) - # len(message) - if not message: - print("Message is empty"); - else: - # # Check if joint values are in the expected order and size. - if message.joint_names != agent['joint_order']: - # Check that the message is of same size as the expected message. - if len(message.joint_names) != len(agent['joint_order']): - raise MSG_INVALID_JOINT_NAMES_DIFFER - - # Check that all the expected joint values are present in a message. 
- if not all(map(lambda x,y: x in y, message.joint_names, - [self._valid_joint_set for _ in range(len(message.joint_names))])): - raise MSG_INVALID_JOINT_NAMES_DIFFER - print("Joints differ") - - return np.array(message.actual.positions) # + message.actual.velocities - - - def _get_trajectory_message(self, action, agent): - """Helper function only called by reset() and run_trial(). - Wraps an action vector of joint angles into a JointTrajectory message. - The velocities, accelerations, and effort do not control the arm motion""" - - # Set up a trajectory message to publish. - action_msg = JointTrajectory() - action_msg.joint_names = agent['joint_order'] - - # Create a point to tell the robot to move to. - target = JointTrajectoryPoint() - action_float = [float(i) for i in action] - target.positions = action_float - - # These times determine the speed at which the robot moves: - # it tries to reach the specified target position in 'slowness' time. - # target.time_from_start.nanosec = agent['slowness'] - - i, d = divmod(agent['slowness'], 1) - target.time_from_start.sec = int(i) - target.time_from_start.nanosec = int(d*1000000000); - # Package the single point into a trajectory of points with length 1. - action_msg.points = [target] - - return action_msg - - def _get_ee_points_jacobians(self, ref_jacobian, ee_points, ref_rot): - """ - Get the jacobians of the points on a link given the jacobian for that link's origin - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :return: 3N x 6 Jac_trans, each 3 x 6 numpy array is the Jacobian[:3, :] for that point - 3N x 6 Jac_rot, each 3 x 6 numpy array is the Jacobian[3:, :] for that point - """ - ee_points = np.asarray(ee_points) - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - end_effector_points_rot = np.expand_dims(ref_rot.dot(ee_points.T).T, axis=1) - ee_points_jac_trans = np.tile(ref_jacobians_trans, (ee_points.shape[0], 1)) + \ - np.cross(ref_jacobians_rot.T, end_effector_points_rot).transpose( - (0, 2, 1)).reshape(-1, self.scara_chain.getNrOfJoints()) - ee_points_jac_rot = np.tile(ref_jacobians_rot, (ee_points.shape[0], 1)) - return ee_points_jac_trans, ee_points_jac_rot - - def _get_ee_points_velocities(self, ref_jacobian, ee_points, ref_rot, joint_velocities): - """ - Get the velocities of the points on a link - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :param joint_velocities: 1 x 6 numpy array, joint velocities - :return: 3N numpy array, velocities of each point - """ - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - ee_velocities_trans = np.dot(ref_jacobians_trans, joint_velocities) - ee_velocities_rot = np.dot(ref_jacobians_rot, joint_velocities) - ee_velocities = ee_velocities_trans + np.cross(ee_velocities_rot.reshape(1, 3), - ref_rot.dot(ee_points.T).T) - return ee_velocities.reshape(-1) diff --git a/baselines/agent/scara_arm/agent_scara_real_mlsh.py b/baselines/agent/scara_arm/agent_scara_real_mlsh.py deleted file mode 100644 index 5defa70..0000000 --- a/baselines/agent/scara_arm/agent_scara_real_mlsh.py +++ /dev/null @@ -1,545 +0,0 @@ -import os 
-import time -import copy -import json -import numpy as np # Used pretty much everywhere. -import matplotlib.pyplot as plt -import threading # Used for time locks to synchronize position data. -import rclpy -import tensorflow as tf - -from timeit import default_timer as timer -from scipy import stats -from scipy.interpolate import spline -import geometry_msgs.msg - -from os import path -from rclpy.qos import QoSProfile, qos_profile_sensor_data -from baselines.agent.utility.general_utils import forward_kinematics, get_ee_points, rotation_from_matrix, \ - get_rotation_matrix,quaternion_from_matrix# For getting points and velocities. -from trajectory_msgs.msg import JointTrajectory, JointTrajectoryPoint # Used for publishing scara joint angles. -from control_msgs.msg import JointTrajectoryControllerState -from std_msgs.msg import String - -from baselines.agent.scara_arm.tree_urdf import treeFromFile # For KDL Jacobians -from PyKDL import Jacobian, Chain, ChainJntToJacSolver, JntArray # For KDL Jacobians -from collections import namedtuple -import scipy.ndimage as sp_ndimage -from functools import partial - -from baselines.agent.utility import error -from baselines.agent.utility import seeding -from baselines.agent.utility import spaces - -StartEndPoints = namedtuple('StartEndPoints', ['start', 'target']) -class MSG_INVALID_JOINT_NAMES_DIFFER(Exception): - """Error object exclusively raised by _process_observations.""" - pass - -# class ROBOT_MADE_CONTACT_WITH_GAZEBO_GROUND_SO_RESTART_ROSLAUNCH(Exception): -# """Error object exclusively raised by reset.""" -# pass - - -class AgentSCARAROS(object): - """Connects the SCARA actions and Deep Learning algorithms.""" - - def __init__(self, init_node=True): #hyperparams, , urdf_path, init_node=True - """Initialized Agent. - init_node: Whether or not to initialize a new ROS node.""" - - print("I am in init") - self._observation_msg = None - self.scale = None # must be set from elsewhere based on observations - self.bias = None - self.x_idx = None - self.obs = None - self.reward = None - self.done = None - self.reward_dist = None - self.reward_ctrl = None - self.action_space = None - # to work with baselines a2c need this ones - self.num_envs = 1 - self.remotes = [0] - - # Setup the main node. - print("Init ros node") - rclpy.init(args=None) - self.node = rclpy.create_node('robot_ai_node') - self._pub = self.node.create_publisher(JointTrajectory,self.agent['joint_publisher']) - self._sub = self.node.create_subscription(JointTrajectoryControllerState, - self.agent['joint_subscriber'], - self._observation_callback, - qos_profile=qos_profile_sensor_data) - assert self._sub - self._time_lock = threading.RLock() - print("setting time clocks") - - if self.agent['tree_path'].startswith("/"): - fullpath = self.agent['tree_path'] - print(fullpath) - else: - fullpath = os.path.join(os.path.dirname(__file__), "assets", self.agent['tree_path']) - if not path.exists(fullpath): - raise IOError("File %s does not exist" % fullpath) - - print("I am in reading the file path: ", fullpath) - - # Initialize a tree structure from the robot urdf. - # Note that the xacro of the urdf is updated by hand. - # Then the urdf must be compiled. - _, self.scara_tree = treeFromFile(self.agent['tree_path']) - # Retrieve a chain structure between the base and the start of the end effector. - self.scara_chain = self.scara_tree.getChain(self.agent['link_names'][0], self.agent['link_names'][-1]) - print("Nr. 
of jnts: ", self.scara_chain.getNrOfJoints()) - - # # Initialize a KDL Jacobian solver from the chain. - self.jac_solver = ChainJntToJacSolver(self.scara_chain) - print(self.jac_solver) - self._observations_stale = False - print("after observations stale") - - # - self._currently_resetting = [False for _ in range(1)] - self.reset_joint_angles = [None for _ in range(1)] - - # taken from mujoco in OpenAi how to initialize observation space and action space. - observation, _reward, done, _info = self.step(np.zeros(self.scara_chain.getNrOfJoints())) - assert not done - self.obs_dim = observation.size - print(self.obs_dim) - # print(observation, _reward) - # Here idially we should find the control range of the robot. Unfortunatelly in ROS/KDL there is nothing like this. - # I have tested this with the mujoco enviroment and the output is always same low[-1.,-1.], high[1.,1.] - # bounds = self.model.actuator_ctrlrange.copy() - low = -np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 0] - high = np.pi/2.0 * np.ones(self.scara_chain.getNrOfJoints()) #bounds[:, 1] - # print("Action Spaces:") - # print("low: ", low, "high: ", high) - self.action_space = spaces.Box(low, high) - - high = np.inf*np.ones(self.obs_dim) - low = -high - self.observation_space = spaces.Box(low, high) - print(self.observation_space) - - self.goal_cmd = [0]*self.scara_chain.getNrOfJoints() - self.goal_vel = [ self.agent['goal_vel'] ]*self.scara_chain.getNrOfJoints() - - self.goal_vel_value = [self.agent['goal_vel']]*self.scara_chain.getNrOfJoints() - self.goal_effort_value = [float('nan')]*self.scara_chain.getNrOfJoints() - self.joint_order = self.agent['joint_order'] - - - # self.seed() - def _observation_callback(self, message): - # message: observation from the robot to store each listen.""" - # self._time_lock.acquire(True, -1) - # with self._time_lock: - self._observations_stale = False - self._observation_msg = message - # if self._currently_resetting: - # epsilon = 1e-3 - # reset_action = self.agent['reset_conditions']['initial_positions'] - # now_action = self._observation_msg.actual.positions - # du = np.linalg.norm(reset_action-now_action, float(np.inf)) - # if du < epsilon: - # self._currently_resetting = False - - # self._time_lock.release() - - - def reset(self): - """Reset function should call reset_model. - In OpenAI in reset_model we are setting the robot to initial pose + some random number. - This function returns the observations which are then used again for the policy calculation. - In our case we are going to set the robot to initial pose, for now given by the user.""" - self.obs = None - # self._currently_resetting = True - - # Set the reset position as the initial position from agent hyperparams. - self.reset_joint_angles = self.agent['reset_conditions']['initial_positions'] - # Prepare the present positions to see how far off we are. - now_position = np.asarray(self._observation_msg.actual.positions) - - # Wait until the arm is within epsilon of reset configuration. 
- # action_msg = JointTrajectory() - # self._time_lock.acquire(True, -1) - # with self._time_lock: - # self._currently_resetting = True - - - # if self._currently_resetting: - # epsilon = 1e-3 - # reset_action = self.agent['reset_conditions']['initial_positions'] - # now_action = self._observation_msg.actual.positions - # du = np.linalg.norm(reset_action-now_action, float(np.inf)) - # if du > epsilon: - # self._pub.publish(self._get_trajectory_message(self.agent['reset_conditions']['initial_positions'], self.agent)) - # time.sleep(self.agent['slowness']) - # self._currently_resetting = True - # print("reset is false.") - # else: - # self._currently_resetting = False - # print("reset is true.") - - self.ob, ee_points = self._get_obs() - # self._time_lock.release() - - # print("resetting: ", self.ob) - self.last_time = time.time() - - return self.ob - def seed(self, seed=None): - self.np_random, seed = seeding.np_random(seed) - return [seed] - def _get_obs(self): - observations = None - ee_points = None - if self._observations_stale is False: - # # Acquire the lock to prevent the subscriber thread from - # # updating times or observation messages. - # self._time_lock.acquire(True) - - obs_message = self._observation_msg - # if obs_message is None: - # obs_message = self._observation_msg - # # time.sleep(self.agent['slowness']) - # # self._observations_stale = True - # self._time_lock.release() - - - - # Make it so that subscriber's thread observation callback - # must be called before publishing again. - - # Collect the end effector points and velocities in - # cartesian coordinates for the state. - # Collect the present joint angles and velocities from ROS for the state. - last_observations = self._process_observations(obs_message, self.agent) - - # self._observations_stale = True - - - if last_observations is None: - print("last_observations is empty") - else: - - # # # Get Jacobians from present joint angles and KDL trees - # # # The Jacobians consist of a 6x6 matrix getting its from from - # # # (# joint angles) x (len[x, y, z] + len[roll, pitch, yaw]) - ee_link_jacobians = self._get_jacobians(last_observations) - if self.agent['link_names'][-1] is None: - print("End link is empty!!") - else: - self._observations_stale = False - # print(self.agent['link_names'][-1]) - trans, rot = forward_kinematics(self.scara_chain, - self.agent['link_names'], - last_observations[:self.scara_chain.getNrOfJoints()], - base_link=self.agent['link_names'][0], - end_link=self.agent['link_names'][-1]) - # # - rotation_matrix = np.eye(4) - rotation_matrix[:3, :3] = rot - rotation_matrix[:3, 3] = trans - # angle, dir, _ = rotation_from_matrix(rotation_matrix) - # # - # current_quaternion = np.array([angle]+dir.tolist())# - - # I need this calculations for the new reward function, need to send them back to the run scara or calculate them here - current_quaternion = quaternion_from_matrix(rotation_matrix) - - current_ee_tgt = np.ndarray.flatten(get_ee_points(self.agent['end_effector_points'], - trans, - rot).T) - ee_points = current_ee_tgt - self.agent['ee_points_tgt'] - - ee_points_jac_trans, _ = self._get_ee_points_jacobians(ee_link_jacobians, - self.agent['end_effector_points'], - rot) - ee_velocities = self._get_ee_points_velocities(ee_link_jacobians, - self.agent['end_effector_points'], - rot, - last_observations) - - # - # Concatenate the information that defines the robot state - # vector, typically denoted asrobot_id 'x'. 
- state = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - - observations = np.r_[np.reshape(last_observations, -1), - np.reshape(ee_points, -1), - np.reshape(ee_velocities, -1),] - if observations is None: - #need to handle this properly - print("Observations are none!!!") - # else: - # print("We have observations") - - - return observations, ee_points - # initialize the steps - def step(self, action): - time_step = 0 - ee_points = None - self.ob = None - while time_step < self.agent['T']: #rclpy.ok(): - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. Get the Observations - """ - _, ee_points = self._get_obs() - - if ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - rclpy.spin_once(self.node) - # time.sleep(self.agent['slowness']) - else: - # self.reward_dist = - self.rmse_func(ee_points) - 0.5 * abs(np.sum(self.log_dist_func(ee_points))) - # print("reward: ", self.reward_dist) - - self.reward_dist = - self.rmse_func(ee_points) - # print("reward_dist :", self.reward_dist) - if(self.reward_dist<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - - print("reward: ", self.reward) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - # self._time_lock.acquire(True) - # self._pub.publish(self._get_trajectory_message(action[:self.scara_chain.getNrOfJoints()], self.agent))#rclpy.ok(): - - while self.ob is None or ee_points is None: - self.ob, ee_points = self._get_obs() - rclpy.spin_once(self.node) - # time.sleep(self.agent['slowness']) - - # self._time_lock.release() - - # self._time_lock.release() - rclpy.spin_once(self.node) - - time_step += 1 - - # print("time_step: ", time_step) - return self.ob, self.reward, done, {} - - def step(self, action): - """ - Dont know if we need this but just in case set the obs to None, so always takes fresh values - """ - self.obs = None - observations = None - - print("Action: ", action) - """ - How they do in OpenAI: - 1. Calculate the reward - 2. Perform action (do_simulation) - 3. 
Get the Observations - """ - start = timer() - # Check if ROS2 is ok - if rclpy.ok(): - observations, ee_points = self._get_obs() - if self.ob is None or ee_points is None: - print("self.ob: ", self.ob) - print("ee_points: ", ee_points) - done = False - rclpy.spin_once(self.node) - time_step += 1 - else: - # self.reward_dist = - abs(self.rmse_func(ee_points)) #np.linalg.norm(ee_points) #self.rmse_func(ee_points) #- 0.5 * abs(np.sum(self.log_dist_func(ee_points))) - # # print("reward: ",self.reward_dist) - - self.reward_dist = -self.rmse_func(ee_points) - # print("reward_dist :", self.reward_dist) - if(self.rmse_func(ee_points)<0.005): - self.reward = 1 - self.rmse_func(ee_points) # Make the reward increase as the distance decreases - print("Reward is: ", self.reward) - print("Eucledian distance is: ", np.linalg.norm(ee_points)) - else: - self.reward = self.reward_dist - # print("Eucledian distance is: ", np.linalg.norm(ee_points)) - - # print("reward: ", self.reward) - # print("rmse_func: ", self.rmse_func(ee_points)) - - # Calculate if the env has been solved - done = bool(abs(self.reward_dist) < 0.005) - - self.ob, ee_points = self._get_obs() - - dt = time.time() - self.last_time - for i in range(self.scara_chain.getNrOfJoints()): - if(self._observation_msg.actual.positions[i] > action[i]): - self.goal_vel[i] = -self.goal_vel_value[0] - else: - self.goal_vel[i] = self.goal_vel_value[0] - self.goal_cmd[i] += dt*self.goal_vel[i] - - self.last_time = time.time() - - action_msg = JointTrajectory() - action_msg.joint_names = self.joint_order - - # Create a point to tell the robot to move to. - target = JointTrajectoryPoint() - target.positions = self.goal_cmd - - target.velocities = self.goal_vel - target.effort = self.goal_effort_value - - # Package the single point into a trajectory of points with length 1. - action_msg.points = [target] - - self._pub.publish(action_msg) - rclpy.spin_once(self.node) - - - return self.ob, self.reward, done, {} - - def rmse_func(self, ee_points): - """ - Computes the Residual Mean Square Error of the difference between current and desired end-effector position - """ - rmse = np.sqrt(np.mean(np.square(ee_points), dtype=np.float32)) - return rmse - def log_dist_func(self, ee_points): - """ - Computes the Log of the Eucledian Distance Error between current and desired end-effector position - """ - log_dist = np.log(ee_points, dtype=np.float32) # [:(self.scara_chain.getNrOfJoints())] - log_dist[log_dist == -np.inf] = 0.0 - log_dist = np.nan_to_num(log_dist) - # print("log_distance: ", log_dist) - return log_dist - - def _get_jacobians(self, state): - """Produce a Jacobian from the urdf that maps from joint angles to x, y, z. - This makes a 6x6 matrix from 6 joint angles to x, y, z and 3 angles. - The angles are roll, pitch, and yaw (not Euler angles) and are not needed. - Returns a repackaged Jacobian that is 3x6. - """ - - # Initialize a Jacobian for n joint angles by 3 cartesian coords and 3 orientation angles - jacobian = Jacobian(self.scara_chain.getNrOfJoints()) - - # Initialize a joint array for the present n joint angles. - angles = JntArray(self.scara_chain.getNrOfJoints()) - - # Construct the joint array from the most recent joint angles. - for i in range(self.scara_chain.getNrOfJoints()): - angles[i] = state[i] - - # Update the jacobian by solving for the given angles. - self.jac_solver.JntToJac(angles, jacobian) - - # Initialize a numpy array to store the Jacobian. 
- J = np.array([[jacobian[i, j] for j in range(jacobian.columns())] for i in range(jacobian.rows())]) - - # Only want the cartesian position, not Roll, Pitch, Yaw (RPY) Angles - ee_jacobians = J - return ee_jacobians - - - def _process_observations(self, message, agent): - """Helper fuinction only called by _run_trial to convert a ROS message - to joint angles and velocities. - Check for and handle the case where a message is either malformed - or contains joint values in an order different from that expected - in hyperparams['joint_order']""" - # print(message) - # len(message) - if not message: - print("Message is empty"); - else: - # # Check if joint values are in the expected order and size. - if message.joint_names != agent['joint_order']: - # Check that the message is of same size as the expected message. - if len(message.joint_names) != len(agent['joint_order']): - raise MSG_INVALID_JOINT_NAMES_DIFFER - - # Check that all the expected joint values are present in a message. - if not all(map(lambda x,y: x in y, message.joint_names, - [self._valid_joint_set for _ in range(len(message.joint_names))])): - raise MSG_INVALID_JOINT_NAMES_DIFFER - print("Joints differ") - - return np.array(message.actual.positions) # + message.actual.velocities - - - def _get_trajectory_message(self, action, agent): - """Helper function only called by reset() and run_trial(). - Wraps an action vector of joint angles into a JointTrajectory message. - The velocities, accelerations, and effort do not control the arm motion""" - - # Set up a trajectory message to publish. - action_msg = JointTrajectory() - action_msg.joint_names = agent['joint_order'] - - # Create a point to tell the robot to move to. - target = JointTrajectoryPoint() - action_float = [float(i) for i in action] - target.positions = action_float - - # These times determine the speed at which the robot moves: - # it tries to reach the specified target position in 'slowness' time. - # target.time_from_start.nanosec = agent['slowness'] - target.time_from_start.sec = agent['slowness'] - # Package the single point into a trajectory of points with length 1. 
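# --- Editor's illustrative sketch (not part of the original file): the KDL
# Jacobian computation in _get_jacobians above, as a standalone helper. It
# assumes a PyKDL chain built the same way as in __init__; unlike the method
# above it constructs its own solver rather than reusing self.jac_solver.
import numpy as np
from PyKDL import Jacobian, JntArray, ChainJntToJacSolver

def kdl_jacobian(chain, joint_angles):
    """Return the 6 x n Jacobian for the given joint angles as a numpy array."""
    n = chain.getNrOfJoints()
    solver = ChainJntToJacSolver(chain)
    jacobian = Jacobian(n)
    angles = JntArray(n)
    for i in range(n):
        angles[i] = joint_angles[i]
    solver.JntToJac(angles, jacobian)
    return np.array([[jacobian[i, j] for j in range(jacobian.columns())]
                     for i in range(jacobian.rows())])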
- action_msg.points = [target] - - print(action_msg) - - return action_msg - - def _get_ee_points_jacobians(self, ref_jacobian, ee_points, ref_rot): - """ - Get the jacobians of the points on a link given the jacobian for that link's origin - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :return: 3N x 6 Jac_trans, each 3 x 6 numpy array is the Jacobian[:3, :] for that point - 3N x 6 Jac_rot, each 3 x 6 numpy array is the Jacobian[3:, :] for that point - """ - ee_points = np.asarray(ee_points) - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - end_effector_points_rot = np.expand_dims(ref_rot.dot(ee_points.T).T, axis=1) - ee_points_jac_trans = np.tile(ref_jacobians_trans, (ee_points.shape[0], 1)) + \ - np.cross(ref_jacobians_rot.T, end_effector_points_rot).transpose( - (0, 2, 1)).reshape(-1, self.scara_chain.getNrOfJoints()) - ee_points_jac_rot = np.tile(ref_jacobians_rot, (ee_points.shape[0], 1)) - return ee_points_jac_trans, ee_points_jac_rot - - def _get_ee_points_velocities(self, ref_jacobian, ee_points, ref_rot, joint_velocities): - """ - Get the velocities of the points on a link - :param ref_jacobian: 6 x 6 numpy array, jacobian for the link's origin - :param ee_points: N x 3 numpy array, points' coordinates on the link's coordinate system - :param ref_rot: 3 x 3 numpy array, rotational matrix for the link's coordinate system - :param joint_velocities: 1 x 6 numpy array, joint velocities - :return: 3N numpy array, velocities of each point - """ - ref_jacobians_trans = ref_jacobian[:3, :] - ref_jacobians_rot = ref_jacobian[3:, :] - ee_velocities_trans = np.dot(ref_jacobians_trans, joint_velocities) - ee_velocities_rot = np.dot(ref_jacobians_rot, joint_velocities) - ee_velocities = ee_velocities_trans + np.cross(ee_velocities_rot.reshape(1, 3), - ref_rot.dot(ee_points.T).T) - return ee_velocities.reshape(-1) diff --git a/baselines/agent/scara_arm/tree_urdf.py b/baselines/agent/scara_arm/tree_urdf.py deleted file mode 100644 index 9fba8f5..0000000 --- a/baselines/agent/scara_arm/tree_urdf.py +++ /dev/null @@ -1,134 +0,0 @@ -from __future__ import print_function - -import urdf_parser_py.urdf as urdf - -# import sys -# sys.path.append('/home/rkojcev/catkin_ws/devel/lib/python3/dist-packages') -# sys.path.append('/home/rkojcev/catkin_ws/devel/lib') - -import PyKDL as kdl - -def treeFromFile(filename): - """ - Construct a PyKDL.Tree from an URDF file. - :param filename: URDF file path - """ - - with open(filename) as urdf_file: - return treeFromUrdfModel(urdf.URDF.from_xml_string(urdf_file.read())) - -def treeFromParam(param): - """ - Construct a PyKDL.Tree from an URDF in a ROS parameter. - :param param: Parameter name, ``str`` - """ - - return treeFromUrdfModel(urdf.URDF.from_parameter_server()) - -def treeFromString(xml): - """ - Construct a PyKDL.Tree from an URDF xml string. - :param xml: URDF xml string, ``str`` - """ - - return treeFromUrdfModel(urdf.URDF.from_xml_string(xml)) - -def _toKdlPose(pose): - """ - Helper function that packages a pose structure containing orientation values (roll, pitch, yaw) - and position values (x, y, z) into a KDL Frame. 
- """ - if pose and pose.rpy and len(pose.rpy) == 3 and pose.xyz and len(pose.xyz) == 3: - return kdl.Frame( - kdl.Rotation.RPY(*pose.rpy), - kdl.Vector(*pose.xyz)) - else: - return kdl.Frame.Identity() - -def _toKdlInertia(i): - # kdl specifies the inertia in the reference frame of the link, the urdf - # specifies the inertia in the inertia reference frame - origin = _toKdlPose(i.origin) - inertia = i.inertia - return origin.M * kdl.RigidBodyInertia( - i.mass, origin.p, - kdl.RotationalInertia(inertia.ixx, inertia.iyy, inertia.izz, inertia.ixy, inertia.ixz, inertia.iyz)); - -def _toKdlJoint(jnt): - # define a mapping for joints and kdl - fixed = lambda j,F: kdl.Joint(j.name) - rotational = lambda j,F: kdl.Joint(j.name, F.p, F.M * kdl.Vector(*j.axis), kdl.Joint.RotAxis) - translational = lambda j,F: kdl.Joint(j.name, F.p, F.M * kdl.Vector(*j.axis), kdl.Joint.TransAxis) - - type_map = { - 'fixed': fixed, - 'revolute': rotational, - 'continuous': rotational, - 'prismatic': translational, - 'floating': fixed, - 'planar': fixed, - 'unknown': fixed, - } - - return type_map[jnt.type](jnt, _toKdlPose(jnt.origin)) - -def _add_children_to_tree(robot_model, root, tree): - """ - Helper function that adds children to a KDL tree. - """ - - # constructs the optional inertia - inert = kdl.RigidBodyInertia(0) - if root.inertial: - inert = _toKdlInertia(root.inertial) - - # constructs the kdl joint - parent_joint_name, parent_link_name = robot_model.parent_map[root.name] - parent_joint = robot_model.joint_map[parent_joint_name] - - # construct the kdl segment - sgm = kdl.Segment( - root.name, - _toKdlJoint(parent_joint), - _toKdlPose(parent_joint.origin), - inert) - - # add segment to tree - if not tree.addSegment(sgm, parent_link_name): - return False - - if root.name not in robot_model.child_map: - return True - - children = [robot_model.link_map[l] for (j,l) in robot_model.child_map[root.name]] - - # recurslively add all children - for child in children: - if not _add_children_to_tree(robot_model, child, tree): - return False - - return True; - -def treeFromUrdfModel(robot_model, quiet=False): - """ - Construct a PyKDL.Tree from an URDF model from urdf_parser_python. - - :param robot_model: URDF xml string, ``str`` - :param quiet: If true suppress messages to stdout, ``bool`` - """ - - root = robot_model.link_map[robot_model.get_root()] - - if root.inertial and not quiet: - print("The root link %s has an inertia specified in the URDF, but KDL does not support a root link with an inertia. As a workaround, you can add an extra dummy link to your URDF." % root.name); - - ok = True - tree = kdl.Tree(root.name) - - # add all children - for joint,child in robot_model.child_map[root.name]: - if not _add_children_to_tree(robot_model, robot_model.link_map[child], tree): - ok = False - break - - return (ok, tree) diff --git a/baselines/agent/utility/error.py b/baselines/agent/utility/error.py deleted file mode 100644 index 2a72b2d..0000000 --- a/baselines/agent/utility/error.py +++ /dev/null @@ -1,140 +0,0 @@ -import sys - -class Error(Exception): - pass - -# Local errors - -class Unregistered(Error): - """Raised when the user requests an item from the registry that does - not actually exist. - """ - pass - -class UnregisteredEnv(Unregistered): - """Raised when the user requests an env from the registry that does - not actually exist. - """ - pass - -class UnregisteredBenchmark(Unregistered): - """Raised when the user requests an env from the registry that does - not actually exist. 
- """ - pass - -class DeprecatedEnv(Error): - """Raised when the user requests an env from the registry with an - older version number than the latest env with the same name. - """ - pass - -class UnseedableEnv(Error): - """Raised when the user tries to seed an env that does not support - seeding. - """ - pass - -class DependencyNotInstalled(Error): - pass - -class UnsupportedMode(Exception): - """Raised when the user requests a rendering mode not supported by the - environment. - """ - pass - -class ResetNeeded(Exception): - """When the monitor is active, raised when the user tries to step an - environment that's already done. - """ - pass - -class ResetNotAllowed(Exception): - """When the monitor is active, raised when the user tries to step an - environment that's not yet done. - """ - pass - -class InvalidAction(Exception): - """Raised when the user performs an action not contained within the - action space - """ - pass - -# API errors - -class APIError(Error): - def __init__(self, message=None, http_body=None, http_status=None, - json_body=None, headers=None): - super(APIError, self).__init__(message) - - if http_body and hasattr(http_body, 'decode'): - try: - http_body = http_body.decode('utf-8') - except: - http_body = ('') - - self._message = message - self.http_body = http_body - self.http_status = http_status - self.json_body = json_body - self.headers = headers or {} - self.request_id = self.headers.get('request-id', None) - - def __unicode__(self): - if self.request_id is not None: - msg = self._message or "" - return u"Request {0}: {1}".format(self.request_id, msg) - else: - return self._message - - if sys.version_info > (3, 0): - def __str__(self): - return self.__unicode__() - else: - def __str__(self): - return unicode(self).encode('utf-8') - - -class APIConnectionError(APIError): - pass - - -class InvalidRequestError(APIError): - - def __init__(self, message, param, http_body=None, - http_status=None, json_body=None, headers=None): - super(InvalidRequestError, self).__init__( - message, http_body, http_status, json_body, - headers) - self.param = param - - -class AuthenticationError(APIError): - pass - -class RateLimitError(APIError): - pass - -# Video errors - -class VideoRecorderError(Error): - pass - -class InvalidFrame(Error): - pass - -# Wrapper errors - -class DoubleWrapperError(Error): - pass - - -class WrapAfterConfigureError(Error): - pass - - -class RetriesExceededError(Error): - pass diff --git a/baselines/agent/utility/general_utils.py b/baselines/agent/utility/general_utils.py deleted file mode 100644 index 9bf11e9..0000000 --- a/baselines/agent/utility/general_utils.py +++ /dev/null @@ -1,436 +0,0 @@ -""" This file defines general utility functions and classes. """ -import numpy as np -import math - -# import sys -# -# sys.path.append('/home/rkojcev/ros_python3/devel/lib') -import PyKDL as kdl - -class BundleType(object): - """ - This class bundles many fields, similar to a record or a mutable - namedtuple. - """ - def __init__(self, variables): - for var, val in variables.items(): - object.__setattr__(self, var, val) - - # Freeze fields so new ones cannot be set. - def __setattr__(self, key, value): - if not hasattr(self, key): - raise AttributeError("%r has no attribute %s" % (self, key)) - object.__setattr__(self, key, value) - - -def check_shape(value, expected_shape, name=''): - """ - Throws a ValueError if value.shape != expected_shape. - Args: - value: Matrix to shape check. - expected_shape: A tuple or list of integers. 
- name: An optional name to add to the exception message. - """ - if value.shape != tuple(expected_shape): - raise ValueError('Shape mismatch %s: Expected %s, got %s' % - (name, str(expected_shape), str(value.shape))) - - -def finite_differences(func, inputs, func_output_shape=(), epsilon=1e-5): - """ - Computes gradients via finite differences. - derivative = (func(x+epsilon) - func(x-epsilon)) / (2*epsilon) - Args: - func: Function to compute gradient of. Inputs and outputs can be - arbitrary dimension. - inputs: Vector value to compute gradient at. - func_output_shape: Shape of the output of func. Default is - empty-tuple, which works for scalar-valued functions. - epsilon: Difference to use for computing gradient. - Returns: - Gradient vector of each dimension of func with respect to each - dimension of input. - """ - gradient = np.zeros(inputs.shape+func_output_shape) - for idx, _ in np.ndenumerate(inputs): - test_input = np.copy(inputs) - test_input[idx] += epsilon - obj_d1 = func(test_input) - assert obj_d1.shape == func_output_shape - test_input = np.copy(inputs) - test_input[idx] -= epsilon - obj_d2 = func(test_input) - assert obj_d2.shape == func_output_shape - diff = (obj_d1 - obj_d2) / (2 * epsilon) - gradient[idx] += diff - return gradient - - -def approx_equal(a, b, threshold=1e-5): - """ - Return whether two numbers are equal within an absolute threshold. - Returns: - True if a and b are equal within threshold. - """ - return np.all(np.abs(a - b) < threshold) - - -def extract_condition(hyperparams, m): - """ - Pull the relevant hyperparameters corresponding to the specified - condition, and return a new hyperparameter dictionary. - """ - return {var: val[m] if isinstance(val, list) else val - for var, val in hyperparams.items()} - - -def get_ee_points(offsets, ee_pos, ee_rot): - """ - Helper method for computing the end effector points given a - position, rotation matrix, and offsets for each of the ee points. - - Args: - offsets: N x 3 array where N is the number of points. - ee_pos: 1 x 3 array of the end effector position. - ee_rot: 3 x 3 rotation matrix of the end effector. - Returns: - 3 x N array of end effector points. - """ - return np.asarray(ee_rot.dot(offsets.T) + ee_pos.T) - - -def get_position(tf, target, source, time): - """ - Utility function that uses tf to return the position of target - relative to source at time - tf: Object that implements TransformListener - target: Valid label corresponding to target link - source: Valid label corresponding to source link - time: Time given in TF's time structure of secs and nsecs - """ - - # Calculate the quaternion data for the relative position - # between the target and source. - # translation, rot = tf.lookupTransform(target, source, time) - position, _ = tf.lookupTransform(source, target, time) - position = np.asarray(position) - - return position - -def unit_vector(data, axis=None, out=None): - """Return ndarray normalized by length, i.e. Euclidean norm, along axis. 
- - >>> v0 = np.random.random(3) - >>> v1 = unit_vector(v0) - >>> np.allclose(v1, v0 / np.linalg.norm(v0)) - True - >>> v0 = np.random.rand(5, 4, 3) - >>> v1 = unit_vector(v0, axis=-1) - >>> v2 = v0 / np.expand_dims(np.sqrt(np.sum(v0*v0, axis=2)), 2) - >>> np.allclose(v1, v2) - True - >>> v1 = unit_vector(v0, axis=1) - >>> v2 = v0 / np.expand_dims(np.sqrt(np.sum(v0*v0, axis=1)), 1) - >>> np.allclose(v1, v2) - True - >>> v1 = np.empty((5, 4, 3)) - >>> unit_vector(v0, axis=1, out=v1) - >>> np.allclose(v1, v2) - True - >>> list(unit_vector([])) - [] - >>> list(unit_vector([1])) - [1.0] - - """ - if out is None: - data = np.array(data, dtype=np.float64, copy=True) - if data.ndim == 1: - data /= math.sqrt(np.dot(data, data)) - return data - else: - if out is not data: - out[:] = np.array(data, copy=False) - data = out - length = np.atleast_1d(np.sum(data*data, axis)) - np.sqrt(length, length) - if axis is not None: - length = np.expand_dims(length, axis) - data /= length - if out is None: - return data - - -def get_rotation_matrix(angle, direction, point=None): - """Return matrix to rotate about axis defined by point and direction. - - >>> R = rotation_matrix(math.pi/2, [0, 0, 1], [1, 0, 0]) - >>> np.allclose(np.dot(R, [0, 0, 0, 1]), [1, -1, 0, 1]) - True - >>> angle = (random.random() - 0.5) * (2*math.pi) - >>> direc = np.random.random(3) - 0.5 - >>> point = np.random.random(3) - 0.5 - >>> R0 = rotation_matrix(angle, direc, point) - >>> R1 = rotation_matrix(angle-2*math.pi, direc, point) - >>> is_same_transform(R0, R1) - True - >>> R0 = rotation_matrix(angle, direc, point) - >>> R1 = rotation_matrix(-angle, -direc, point) - >>> is_same_transform(R0, R1) - True - >>> I = np.identity(4, np.float64) - >>> np.allclose(I, rotation_matrix(math.pi*2, direc)) - True - >>> np.allclose(2, np.trace(rotation_matrix(math.pi/2, - ... direc, point))) - True - - """ - sina = math.sin(angle) - cosa = math.cos(angle) - direction = unit_vector(direction[:3]) - # rotation matrix around unit vector - R = np.diag([cosa, cosa, cosa]) - R += np.outer(direction, direction) * (1.0 - cosa) - direction *= sina - R += np.array([[ 0.0, -direction[2], direction[1]], - [ direction[2], 0.0, -direction[0]], - [-direction[1], direction[0], 0.0]]) - M = np.identity(4) - M[:3, :3] = R - if point is not None: - # rotation not around origin - point = np.array(point[:3], dtype=np.float64, copy=False) - M[:3, 3] = point - np.dot(R, point) - return M - - -def rotation_from_matrix(matrix): - """Return rotation angle and axis from rotation matrix. 
- - >>> angle = (random.random() - 0.5) * (2*math.pi) - >>> direc = np.random.random(3) - 0.5 - >>> point = np.random.random(3) - 0.5 - >>> R0 = rotation_matrix(angle, direc, point) - >>> angle, direc, point = rotation_from_matrix(R0) - >>> R1 = rotation_matrix(angle, direc, point) - >>> is_same_transform(R0, R1) - True - - """ - R = np.array(matrix, dtype=np.float64, copy=False) - R33 = R[:3, :3] - # direction: unit eigenvector of R33 corresponding to eigenvalue of 1 - w, W = np.linalg.eig(R33.T) - i = np.where(abs(np.real(w) - 1.0) < 1e-8)[0] - if not len(i): - raise ValueError("no unit eigenvector corresponding to eigenvalue 1") - direction = np.real(W[:, i[-1]]).squeeze() - # point: unit eigenvector of R33 corresponding to eigenvalue of 1 - w, Q = np.linalg.eig(R) - i = np.where(abs(np.real(w) - 1.0) < 1e-8)[0] - if not len(i): - raise ValueError("no unit eigenvector corresponding to eigenvalue 1") - point = np.real(Q[:, i[-1]]).squeeze() - point /= point[3] - # rotation angle depending on direction - cosa = (np.trace(R33) - 1.0) / 2.0 - if abs(direction[2]) > 1e-8: - sina = (R[1, 0] + (cosa-1.0)*direction[0]*direction[1]) / direction[2] - elif abs(direction[1]) > 1e-8: - sina = (R[0, 2] + (cosa-1.0)*direction[0]*direction[2]) / direction[1] - else: - sina = (R[2, 1] + (cosa-1.0)*direction[1]*direction[2]) / direction[0] - angle = math.atan2(sina, cosa) - return angle, direction, point - -def quaternion_from_matrix(matrix, isprecise=False): - """Return quaternion from rotation matrix. - - If isprecise is True, the input matrix is assumed to be a precise rotation - matrix and a faster algorithm is used. - - >>> q = quaternion_from_matrix(np.identity(4), True) - >>> np.allclose(q, [1, 0, 0, 0]) - True - >>> q = quaternion_from_matrix(np.diag([1, -1, -1, 1])) - >>> np.allclose(q, [0, 1, 0, 0]) or np.allclose(q, [0, -1, 0, 0]) - True - >>> R = rotation_matrix(0.123, (1, 2, 3)) - >>> q = quaternion_from_matrix(R, True) - >>> np.allclose(q, [0.9981095, 0.0164262, 0.0328524, 0.0492786]) - True - >>> R = [[-0.545, 0.797, 0.260, 0], [0.733, 0.603, -0.313, 0], - ... [-0.407, 0.021, -0.913, 0], [0, 0, 0, 1]] - >>> q = quaternion_from_matrix(R) - >>> np.allclose(q, [0.19069, 0.43736, 0.87485, -0.083611]) - True - >>> R = [[0.395, 0.362, 0.843, 0], [-0.626, 0.796, -0.056, 0], - ... [-0.677, -0.498, 0.529, 0], [0, 0, 0, 1]] - >>> q = quaternion_from_matrix(R) - >>> np.allclose(q, [0.82336615, -0.13610694, 0.46344705, -0.29792603]) - True - >>> R = random_rotation_matrix() - >>> q = quaternion_from_matrix(R) - >>> is_same_transform(R, quaternion_matrix(q)) - True - >>> is_same_quaternion(quaternion_from_matrix(R, isprecise=False), - ... quaternion_from_matrix(R, isprecise=True)) - True - >>> R = euler_matrix(0.0, 0.0, np.pi/2.0) - >>> is_same_quaternion(quaternion_from_matrix(R, isprecise=False), - ... 
quaternion_from_matrix(R, isprecise=True)) - True - - """ - M = np.array(matrix, dtype=np.float64, copy=False)[:4, :4] - if isprecise: - q = np.empty((4, )) - t = np.trace(M) - if t > M[3, 3]: - q[0] = t - q[3] = M[1, 0] - M[0, 1] - q[2] = M[0, 2] - M[2, 0] - q[1] = M[2, 1] - M[1, 2] - else: - i, j, k = 0, 1, 2 - if M[1, 1] > M[0, 0]: - i, j, k = 1, 2, 0 - if M[2, 2] > M[i, i]: - i, j, k = 2, 0, 1 - t = M[i, i] - (M[j, j] + M[k, k]) + M[3, 3] - q[i] = t - q[j] = M[i, j] + M[j, i] - q[k] = M[k, i] + M[i, k] - q[3] = M[k, j] - M[j, k] - q = q[[3, 0, 1, 2]] - q *= 0.5 / math.sqrt(t * M[3, 3]) - else: - m00 = M[0, 0] - m01 = M[0, 1] - m02 = M[0, 2] - m10 = M[1, 0] - m11 = M[1, 1] - m12 = M[1, 2] - m20 = M[2, 0] - m21 = M[2, 1] - m22 = M[2, 2] - # symmetric matrix K - K = np.array([[m00-m11-m22, 0.0, 0.0, 0.0], - [m01+m10, m11-m00-m22, 0.0, 0.0], - [m02+m20, m12+m21, m22-m00-m11, 0.0], - [m21-m12, m02-m20, m10-m01, m00+m11+m22]]) - K /= 3.0 - # quaternion is eigenvector of K that corresponds to largest eigenvalue - w, V = np.linalg.eigh(K) - q = V[[3, 0, 1, 2], np.argmax(w)] - if q[0] < 0.0: - np.negative(q, q) - - # exchange (w, x, y, z) to (x, y, z, w) - q_new = np.empty(4) - q_new[:3] = q[1:] - q_new[3] = q[0] - return q_new - -def joint_list_to_kdl(q): - """ Return KDL JntArray converted from list q """ - if q is None: - return None - if type(q) == np.matrix and q.shape[1] == 0: - q = q.T.tolist()[0] - q_kdl = kdl.JntArray(len(q)) - for i, q_i in enumerate(q): - q_kdl[i] = q_i - return q_kdl - -def joint_kdl_to_list(q): - """ Return list converted from KDL JntArray""" - if q == None: - return None - return [q[i] for i in range(q.rows())] - - -def forward_kinematics(robot_chain, link_names, q, base_link='base', end_link='ee_link'): - """ - Perform forward kinematics - Args: - robot_chain: robot's chain object - link_names: list of robot link names - q: list of joint positions - base_link: name of the link regarded as origin - end_link: name of the link regarded as target - Returns: - translation vector and rotation matrix from end_link to base_link - - """ - base_trans = do_kdl_fk(robot_chain, q, link_names.index(base_link)) - if base_trans is None: - print("FK KDL failure on base transformation.") - end_trans = do_kdl_fk(robot_chain, q, link_names.index(end_link)) - if end_trans is None: - print("FK KDL failure on end transformation.") - pose = np.dot(np.linalg.inv(base_trans), end_trans) - pos = pose[:3, 3].reshape(1, 3) - rot = pose[:3, :3] - return pos, rot - -def do_kdl_fk(robot_chain, q, link_number): - endeffec_frame = kdl.Frame() - fk_kdl = kdl.ChainFkSolverPos_recursive(robot_chain) - kinematics_status = fk_kdl.JntToCart(joint_list_to_kdl(q), - endeffec_frame, - link_number) - if kinematics_status >= 0: - p = endeffec_frame.p - M = endeffec_frame.M - return np.array([[M[0, 0], M[0, 1], M[0, 2], p.x()], - [M[1, 0], M[1, 1], M[1, 2], p.y()], - [M[2, 0], M[2, 1], M[2, 2], p.z()], - [ 0, 0, 0, 1]]) - else: - return None - - - -def inverse_kinematics(robot_chain, pos, rot, q_guess=None, min_joints=None, max_joints=None): - """ - Perform inverse kinematics - Args: - robot_chain: robot's chain object - pos: 1 x 3 or 3 x 1 array of the end effector position. 
- rot: 3 x 3 array of the end effector rotation - q_guess: guess values for the joint positions - min_joints: minimum value of the position for each joint - max_joints: maximum value of the position for each joint - Returns: - list of joint positions or None (no solution) - """ - pos_kdl = kdl.Vector(pos[0], pos[1], pos[2]) - rot_kdl = kdl.Rotation(rot[0, 0], rot[0, 1], rot[0, 2], - rot[1, 0], rot[1, 1], rot[1, 2], - rot[2, 0], rot[2, 1], rot[2, 2]) - frame_kdl = kdl.Frame(rot_kdl, pos_kdl) - num_joints = robot_chain.getNrOfJoints() - min_joints = -np.pi * np.ones(num_joints) if min_joints is None else min_joints - max_joints = np.pi * np.ones(num_joints) if max_joints is None else max_joints - mins_kdl = joint_list_to_kdl(min_joints) - maxs_kdl = joint_list_to_kdl(max_joints) - fk_kdl = kdl.ChainFkSolverPos_recursive(robot_chain) - ik_v_kdl = kdl.ChainIkSolverVel_pinv(robot_chain) - ik_p_kdl = kdl.ChainIkSolverPos_NR_JL(robot_chain, mins_kdl, maxs_kdl, - fk_kdl, ik_v_kdl) - - if q_guess == None: - # use the midpoint of the joint limits as the guess - lower_lim = np.where(np.isfinite(min_joints), min_joints, 0.) - upper_lim = np.where(np.isfinite(max_joints), max_joints, 0.) - q_guess = (lower_lim + upper_lim) / 2.0 - q_guess = np.where(np.isnan(q_guess), [0.]*len(q_guess), q_guess) - - q_kdl = kdl.JntArray(num_joints) - q_guess_kdl = joint_list_to_kdl(q_guess) - if ik_p_kdl.CartToJnt(q_guess_kdl, frame_kdl, q_kdl) >= 0: - return joint_kdl_to_list(q_kdl) - else: - return None diff --git a/baselines/agent/utility/seeding.py b/baselines/agent/utility/seeding.py deleted file mode 100644 index 360afc4..0000000 --- a/baselines/agent/utility/seeding.py +++ /dev/null @@ -1,95 +0,0 @@ -import hashlib -import numpy as np -import os -import random as _random -import struct -import sys - -from baselines.agent.utility import error - -if sys.version_info < (3,): - integer_types = (int, long) -else: - integer_types = (int,) - -def np_random(seed=None): - if seed is not None and not (isinstance(seed, integer_types) and 0 <= seed): - raise error.Error('Seed must be a non-negative integer or omitted, not {}'.format(seed)) - - seed = _seed(seed) - - rng = np.random.RandomState() - rng.seed(_int_list_from_bigint(hash_seed(seed))) - return rng, seed - -def hash_seed(seed=None, max_bytes=8): - """Any given evaluation is likely to have many PRNG's active at - once. (Most commonly, because the environment is running in - multiple processes.) There's literature indicating that having - linear correlations between seeds of multiple PRNG's can correlate - the outputs: - - http://blogs.unity3d.com/2015/01/07/a-primer-on-repeatable-random-numbers/ - http://stackoverflow.com/questions/1554958/how-different-do-random-seeds-need-to-be - http://dl.acm.org/citation.cfm?id=1276928 - - Thus, for sanity we hash the seeds before using them. (This scheme - is likely not crypto-strength, but it should be good enough to get - rid of simple correlations.) - - Args: - seed (Optional[int]): None seeds from an operating system specific randomness source. - max_bytes: Maximum number of bytes to use in the hashed seed. - """ - if seed is None: - seed = _seed(max_bytes=max_bytes) - hash = hashlib.sha512(str(seed).encode('utf8')).digest() - return _bigint_from_bytes(hash[:max_bytes]) - -def _seed(a=None, max_bytes=8): - """Create a strong random seed. Otherwise, Python 2 would seed using - the system time, which might be non-robust especially in the - presence of concurrency. 
- - Args: - a (Optional[int, str]): None seeds from an operating system specific randomness source. - max_bytes: Maximum number of bytes to use in the seed. - """ - # Adapted from https://svn.python.org/projects/python/tags/r32/Lib/random.py - if a is None: - a = _bigint_from_bytes(os.urandom(max_bytes)) - elif isinstance(a, str): - a = a.encode('utf8') - a += hashlib.sha512(a).digest() - a = _bigint_from_bytes(a[:max_bytes]) - elif isinstance(a, integer_types): - a = a % 2**(8 * max_bytes) - else: - raise error.Error('Invalid type for seed: {} ({})'.format(type(a), a)) - - return a - -# TODO: don't hardcode sizeof_int here -def _bigint_from_bytes(bytes): - sizeof_int = 4 - padding = sizeof_int - len(bytes) % sizeof_int - bytes += b'\0' * padding - int_count = int(len(bytes) / sizeof_int) - unpacked = struct.unpack("{}I".format(int_count), bytes) - accum = 0 - for i, val in enumerate(unpacked): - accum += 2 ** (sizeof_int * 8 * i) * val - return accum - -def _int_list_from_bigint(bigint): - # Special case 0 - if bigint < 0: - raise error.Error('Seed must be non-negative, not {}'.format(bigint)) - elif bigint == 0: - return [0] - - ints = [] - while bigint > 0: - bigint, mod = divmod(bigint, 2 ** 32) - ints.append(mod) - return ints diff --git a/baselines/agent/utility/spaces/__init__.py b/baselines/agent/utility/spaces/__init__.py deleted file mode 100644 index ac310c9..0000000 --- a/baselines/agent/utility/spaces/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from gym.spaces.box import Box -from gym.spaces.discrete import Discrete -from gym.spaces.multi_discrete import MultiDiscrete -from gym.spaces.multi_binary import MultiBinary -from gym.spaces.prng import seed -from gym.spaces.tuple_space import Tuple -from gym.spaces.dict_space import Dict - -__all__ = ["Box", "Discrete", "MultiDiscrete", "MultiBinary", "Tuple", "Dict"] diff --git a/baselines/agent/utility/spaces/box.py b/baselines/agent/utility/spaces/box.py deleted file mode 100644 index 637c83b..0000000 --- a/baselines/agent/utility/spaces/box.py +++ /dev/null @@ -1,43 +0,0 @@ -import numpy as np - -from baselines.spaces import prng - -class Box(gym.Space): - """ - A box in R^n. - I.e., each coordinate is bounded. 
- - Example usage: - self.action_space = spaces.Box(low=-10, high=10, shape=(1,)) - """ - def __init__(self, low, high, shape=None): - """ - Two kinds of valid input: - Box(-1.0, 1.0, (3,4)) # low and high are scalars, and shape is provided - Box(np.array([-1.0,-2.0]), np.array([2.0,4.0])) # low and high are arrays of the same shape - """ - if shape is None: - assert low.shape == high.shape - self.low = low - self.high = high - else: - assert np.isscalar(low) and np.isscalar(high) - self.low = low + np.zeros(shape) - self.high = high + np.zeros(shape) - def sample(self): - return prng.np_random.uniform(low=self.low, high=self.high, size=self.low.shape) - def contains(self, x): - return x.shape == self.shape and (x >= self.low).all() and (x <= self.high).all() - - def to_jsonable(self, sample_n): - return np.array(sample_n).tolist() - def from_jsonable(self, sample_n): - return [np.asarray(sample) for sample in sample_n] - - @property - def shape(self): - return self.low.shape - def __repr__(self): - return "Box" + str(self.shape) - def __eq__(self, other): - return np.allclose(self.low, other.low) and np.allclose(self.high, other.high) diff --git a/baselines/agent/utility/spaces/dict_space.py b/baselines/agent/utility/spaces/dict_space.py deleted file mode 100644 index c552657..0000000 --- a/baselines/agent/utility/spaces/dict_space.py +++ /dev/null @@ -1,49 +0,0 @@ -from gym import Space -from collections import OrderedDict - -class Dict(Space): - """ - A dictionary of simpler spaces - - Example usage: - self.observation_space = spaces.Dict({"position": spaces.Discrete(2), "velocity": spaces.Discrete(3)}) - """ - def __init__(self, spaces): - if isinstance(spaces, dict): - spaces = OrderedDict(sorted(list(spaces.items()))) - if isinstance(spaces, list): - spaces = OrderedDict(spaces) - self.spaces = spaces - - def sample(self): - return OrderedDict([(k, space.sample()) for k, space in self.spaces.items()]) - - def contains(self, x): - if not isinstance(x, dict) or len(x) != len(self.spaces): - return False - for k, space in self.spaces.items(): - if k not in x: - return False - if not space.contains(x[k]): - return False - return True - - def __repr__(self): - return "Dict(" + ", ". 
join([k + ":" + str(s) for k, s in self.spaces.items()]) + ")" - - def to_jsonable(self, sample_n): - # serialize as dict-repr of vectors - return {key: space.to_jsonable([sample[key] for sample in sample_n]) \ - for key, space in self.spaces.items()} - - def from_jsonable(self, sample_n): - dict_of_list = {} - for key, space in self.spaces.items(): - dict_of_list[key] = space.from_jsonable(sample_n[key]) - ret = [] - for i, _ in enumerate(dict_of_list[key]): - entry = {} - for key, value in dict_of_list.items(): - entry[key] = value[i] - ret.append(entry) - return ret diff --git a/baselines/agent/utility/spaces/discrete.py b/baselines/agent/utility/spaces/discrete.py deleted file mode 100644 index 30e6a48..0000000 --- a/baselines/agent/utility/spaces/discrete.py +++ /dev/null @@ -1,32 +0,0 @@ -import numpy as np - -import gym, time -from gym.spaces import prng - -class Discrete(gym.Space): - """ - {0,1,...,n-1} - - Example usage: - self.observation_space = spaces.Discrete(2) - """ - def __init__(self, n): - self.n = n - def sample(self): - return prng.np_random.randint(self.n) - def contains(self, x): - if isinstance(x, int): - as_int = x - elif isinstance(x, (np.generic, np.ndarray)) and (x.dtype.kind in np.typecodes['AllInteger'] and x.shape == ()): - as_int = int(x) - else: - return False - return as_int >= 0 and as_int < self.n - - @property - def shape(self): - return () - def __repr__(self): - return "Discrete(%d)" % self.n - def __eq__(self, other): - return self.n == other.n diff --git a/baselines/agent/utility/spaces/multi_binary.py b/baselines/agent/utility/spaces/multi_binary.py deleted file mode 100644 index 4769902..0000000 --- a/baselines/agent/utility/spaces/multi_binary.py +++ /dev/null @@ -1,15 +0,0 @@ -import gym -from gym.spaces import prng -import numpy as np - -class MultiBinary(gym.Space): - def __init__(self, n): - self.n = n - def sample(self): - return prng.np_random.randint(low=0, high=2, size=self.n) - def contains(self, x): - return ((x==0) | (x==1)).all() - def to_jsonable(self, sample_n): - return sample_n.tolist() - def from_jsonable(self, sample_n): - return np.array(sample_n) \ No newline at end of file diff --git a/baselines/agent/utility/spaces/multi_discrete.py b/baselines/agent/utility/spaces/multi_discrete.py deleted file mode 100644 index 7be63f4..0000000 --- a/baselines/agent/utility/spaces/multi_discrete.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np - -import gym -from gym.spaces import prng - -class MultiDiscrete(gym.Space): - """ - - The multi-discrete action space consists of a series of discrete action spaces with different parameters - - It can be adapted to both a Discrete action space or a continuous (Box) action space - - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space - - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space - where the discrete action space can take any integers from `min` to `max` (both inclusive) - - Note: A value of 0 always need to represent the NOOP action. - - e.g. 
Nintendo Game Controller - - Can be conceptualized as 3 discrete action spaces: - - 1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4 - 2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 - 3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1 - - - Can be initialized as - - MultiDiscrete([ [0,4], [0,1], [0,1] ]) - - """ - def __init__(self, array_of_param_array): - self.low = np.array([x[0] for x in array_of_param_array]) - self.high = np.array([x[1] for x in array_of_param_array]) - self.num_discrete_space = self.low.shape[0] - - def sample(self): - """ Returns a array with one sample from each discrete action space """ - # For each row: round(random .* (max - min) + min, 0) - random_array = prng.np_random.rand(self.num_discrete_space) - return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)] - def contains(self, x): - return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (np.array(x) <= self.high).all() - - @property - def shape(self): - return self.num_discrete_space - def __repr__(self): - return "MultiDiscrete" + str(self.num_discrete_space) - def __eq__(self, other): - return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high) diff --git a/baselines/agent/utility/spaces/prng.py b/baselines/agent/utility/spaces/prng.py deleted file mode 100644 index ffca680..0000000 --- a/baselines/agent/utility/spaces/prng.py +++ /dev/null @@ -1,20 +0,0 @@ -import numpy - -np_random = numpy.random.RandomState() - -def seed(seed=None): - """Seed the common numpy.random.RandomState used in spaces - - CF - https://github.com/openai/gym/commit/58e6aa95e5af2c738557431f812abb81c505a7cf#commitcomment-17669277 - for some details about why we seed the spaces separately from the - envs, but tl;dr is that it's pretty uncommon for them to be used - within an actual algorithm, and the code becomes simpler to just - use this common numpy.random.RandomState. - """ - np_random.seed(seed) - -# This numpy.random.RandomState gets used in all spaces for their -# 'sample' method. It's not really expected that people will be using -# these in their algorithms. 
-seed(0) diff --git a/baselines/agent/utility/spaces/tests/__init__.py b/baselines/agent/utility/spaces/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/baselines/agent/utility/spaces/tests/test_spaces.py b/baselines/agent/utility/spaces/tests/test_spaces.py deleted file mode 100644 index ece6181..0000000 --- a/baselines/agent/utility/spaces/tests/test_spaces.py +++ /dev/null @@ -1,32 +0,0 @@ -import json # note: ujson fails this test due to float equality -import numpy as np -import pytest -from gym.spaces import Tuple, Box, Discrete, MultiDiscrete, Dict - - -@pytest.mark.parametrize("space", [ - Discrete(3), - Tuple([Discrete(5), Discrete(10)]), - Tuple([Discrete(5), Box(np.array([0,0]),np.array([1,5]))]), - Tuple((Discrete(5), Discrete(2), Discrete(2))), - MultiDiscrete([ [0, 1], [0, 1], [0, 100] ]), - Dict({"position": Discrete(5), "velocity": Box(np.array([0,0]),np.array([1,5]))}), - ]) -def test_roundtripping(space): - sample_1 = space.sample() - sample_2 = space.sample() - assert space.contains(sample_1) - assert space.contains(sample_2) - json_rep = space.to_jsonable([sample_1, sample_2]) - - json_roundtripped = json.loads(json.dumps(json_rep)) - - samples_after_roundtrip = space.from_jsonable(json_roundtripped) - sample_1_prime, sample_2_prime = samples_after_roundtrip - - s1 = space.to_jsonable([sample_1]) - s1p = space.to_jsonable([sample_1_prime]) - s2 = space.to_jsonable([sample_2]) - s2p = space.to_jsonable([sample_2_prime]) - assert s1 == s1p, "Expected {} to equal {}".format(s1, s1p) - assert s2 == s2p, "Expected {} to equal {}".format(s2, s2p) diff --git a/baselines/agent/utility/spaces/tuple_space.py b/baselines/agent/utility/spaces/tuple_space.py deleted file mode 100644 index 3985a6c..0000000 --- a/baselines/agent/utility/spaces/tuple_space.py +++ /dev/null @@ -1,31 +0,0 @@ -from gym import Space - -class Tuple(Space): - """ - A tuple (i.e., product) of simpler spaces - - Example usage: - self.observation_space = spaces.Tuple((spaces.Discrete(2), spaces.Discrete(3))) - """ - def __init__(self, spaces): - self.spaces = spaces - - def sample(self): - return tuple([space.sample() for space in self.spaces]) - - def contains(self, x): - if isinstance(x, list): - x = tuple(x) # Promote list to tuple for contains check - return isinstance(x, tuple) and len(x) == len(self.spaces) and all( - space.contains(part) for (space,part) in zip(self.spaces,x)) - - def __repr__(self): - return "Tuple(" + ", ". join([str(s) for s in self.spaces]) + ")" - - def to_jsonable(self, sample_n): - # serialize as list-repr of tuple of vectors - return [space.to_jsonable([sample[i] for sample in sample_n]) \ - for i, space in enumerate(self.spaces)] - - def from_jsonable(self, sample_n): - return zip(*[space.from_jsonable(sample_n[i]) for i, space in enumerate(self.spaces)]) From 94c650de62cc35474848e08f93932690bf5f4422 Mon Sep 17 00:00:00 2001 From: JongGyun Kim Date: Thu, 7 Mar 2019 08:13:01 +0900 Subject: [PATCH 06/37] remove one of duplicated lines. 
(#813) --- baselines/ppo2/ppo2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index 0eced51..f3d1837 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -164,7 +164,6 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) - envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): From ad3914786a2fac6cb9e8c06b9f42d67640313c53 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Mon, 11 Mar 2019 17:28:51 -0700 Subject: [PATCH 07/37] fix freeze of ppo2 (#849) * fix freeze of ppo2 * unit test for freeze, updated docstring * more docstring update * set number of threads to 1 in the test --- baselines/common/mpi_adam_optimizer.py | 54 ++++++++++++++++++++------ 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/baselines/common/mpi_adam_optimizer.py b/baselines/common/mpi_adam_optimizer.py index af381e7..acff294 100644 --- a/baselines/common/mpi_adam_optimizer.py +++ b/baselines/common/mpi_adam_optimizer.py @@ -1,6 +1,11 @@ import numpy as np import tensorflow as tf -from mpi4py import MPI +from baselines.common import tf_util as U +from baselines.common.tests.test_with_mpi import with_mpi +try: + from mpi4py import MPI +except ImportError: + MPI = None class MpiAdamOptimizer(tf.train.AdamOptimizer): """Adam optimizer that averages gradients across mpi processes.""" @@ -13,34 +18,61 @@ def compute_gradients(self, loss, var_list, **kwargs): flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) shapes = [v.shape.as_list() for g, v in grads_and_vars] sizes = [int(np.prod(s)) for s in shapes] - num_tasks = self.comm.Get_size() buf = np.zeros(sum(sizes), np.float32) - - sess = tf.get_default_session() - assert sess is not None countholder = [0] # Counts how many times _collect_grads has been called stat = tf.reduce_sum(grads_and_vars[0][1]) # sum of first variable - def _collect_grads(flat_grad): + def _collect_grads(flat_grad, np_stat): self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) np.divide(buf, float(num_tasks), out=buf) if countholder[0] % 100 == 0: - check_synced(sess, self.comm, stat) + check_synced(np_stat, self.comm) countholder[0] += 1 return buf - avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) + avg_flat_grad = tf.py_func(_collect_grads, [flat_grad, stat], tf.float32) avg_flat_grad.set_shape(flat_grad.shape) avg_grads = tf.split(avg_flat_grad, sizes, axis=0) avg_grads_and_vars = [(tf.reshape(g, v.shape), v) for g, (_, v) in zip(avg_grads, grads_and_vars)] return avg_grads_and_vars -def check_synced(sess, comm, tfstat): +def check_synced(localval, comm=None): """ - Check that 'tfstat' evaluates to the same thing on every MPI worker + It's common to forget to initialize your variables to the same values, or + (less commonly) if you update them in some other way than adam, to get them out of sync. 
+ This function checks that variables on all MPI workers are the same, and raises + an AssertionError otherwise + + Arguments: + comm: MPI communicator + localval: list of local variables (list of variables on current worker to be compared with the other workers) """ - localval = sess.run(tfstat) + comm = comm or MPI.COMM_WORLD vals = comm.gather(localval) if comm.rank == 0: assert all(val==vals[0] for val in vals[1:]) + + +@with_mpi(timeout=5) +def test_nonfreeze(): + np.random.seed(0) + tf.set_random_seed(0) + + a = tf.Variable(np.random.randn(3).astype('float32')) + b = tf.Variable(np.random.randn(2,5).astype('float32')) + loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) + + stepsize = 1e-2 + # for some reason the session config with inter_op_parallelism_threads was causing + # nested sess.run calls to freeze + config = tf.ConfigProto(inter_op_parallelism_threads=1) + sess = U.get_session(config=config) + update_op = MpiAdamOptimizer(comm=MPI.COMM_WORLD, learning_rate=stepsize).minimize(loss) + sess.run(tf.global_variables_initializer()) + losslist_ref = [] + for i in range(100): + l,_ = sess.run([loss, update_op]) + print(i, l) + losslist_ref.append(l) + From 75cbed608d39e79051e3fad064fd2809aa141327 Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Mon, 11 Mar 2019 17:44:03 -0700 Subject: [PATCH 08/37] check for environment being vectorized in the play logic in run.py --- baselines/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baselines/run.py b/baselines/run.py index 01df981..78fec2f 100644 --- a/baselines/run.py +++ b/baselines/run.py @@ -5,7 +5,7 @@ from collections import defaultdict import tensorflow as tf -from baselines.common.vec_env import VecFrameStack, VecNormalize +from baselines.common.vec_env import VecFrameStack, VecNormalize, VecEnv from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env from baselines.common.tf_util import get_session @@ -227,7 +227,7 @@ def main(args): actions, _, _, _ = model.step(obs) obs, rew, done, _ = env.step(actions) - episode_rew += rew[0] + episode_rew += rew[0] if isinstance(env, VecEnv) else rew env.render() done = done.any() if isinstance(done, np.ndarray) else done if done: From 8e58277e814365db58d83a26f53935d4e2c3e501 Mon Sep 17 00:00:00 2001 From: rkojcev Date: Tue, 12 Mar 2019 10:21:51 +0100 Subject: [PATCH 09/37] add defaults experimental --- baselines/ppo2/defaults.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py index f83c7ff..a9625ed 100644 --- a/baselines/ppo2/defaults.py +++ b/baselines/ppo2/defaults.py @@ -28,7 +28,7 @@ def mara_mlp(): num_layers = 2, num_hidden = 64, layer_norm = False, - nsteps = 2048, + nsteps = 1024, nminibatches = 32, #batchsize = nevn * nsteps // nminibatches lam = 0.95, gamma = 0.99, @@ -42,15 +42,15 @@ def mara_mlp(): seed = 0, value_network = 'copy', network = 'mlp', - total_timesteps = 1e8, + total_timesteps = 1e6, save_interval = 10, # env_name = 'MARA-v0', # env_name = 'MARAOrient-v0', - # env_name = 'MARACollision-v0', - env_name = 'MARACollisionOrient-v0', + env_name = 'MARACollision-v0', + # env_name = 'MARACollisionOrient-v0', transfer_path = None, - # transfer_path = '/tmp/ros2learn/MARACollision-v0/ppo2_mlp/2019-02-19_12h47min/checkpoints/best', - trained_path = '/home/rkojcev/MARA_NN/2019-02-25_18h02min/checkpoint/01750' + # transfer_path = 
'/tmp/ros2learn/MARACollision-v0/ppo2_mlp/2019-03-08_21h50min/checkpoints/best', + trained_path = '/tmp/ros2learn/MARACollision-v0/ppo2_mlp/2019-03-08_21h50min/checkpoints/best' ) def mara_lstm(): From a0e31baf5a4bec0e2ec3a75a69502fd9cb0ac10d Mon Sep 17 00:00:00 2001 From: YueErro Date: Wed, 20 Mar 2019 12:52:23 +0100 Subject: [PATCH 10/37] add new env to defaults --- baselines/ppo2/defaults.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py index a21b3e5..2398e91 100644 --- a/baselines/ppo2/defaults.py +++ b/baselines/ppo2/defaults.py @@ -44,7 +44,8 @@ def mara_mlp(): network = 'mlp', total_timesteps = 1e8, save_interval = 10, - env_name = 'MARA-v0', + # env_name = 'MARA-v0', + env_name = 'MARAReal-v0', # env_name = 'MARAOrient-v0', # env_name = 'MARACollision-v0', # env_name = 'MARACollisionOrient-v0', From 6e6bc7405fa1cfcdf9a089facf1a956f00e0a4f5 Mon Sep 17 00:00:00 2001 From: nzlz Date: Wed, 27 Mar 2019 16:28:33 +0700 Subject: [PATCH 11/37] Update ppo2 hyperparams --- baselines/ppo2/defaults.py | 16 ++++++++-------- baselines/ppo2/ppo2.py | 5 ++++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py index 2398e91..fc996cb 100644 --- a/baselines/ppo2/defaults.py +++ b/baselines/ppo2/defaults.py @@ -26,16 +26,16 @@ def retro(): def mara_mlp(): return dict( num_layers = 2, - num_hidden = 64, + num_hidden = 16, layer_norm = False, - nsteps = 2048, - nminibatches = 32, #batchsize = nevn * nsteps // nminibatches + nsteps = 1024,#2048 + nminibatches = 8, #batchsize = nevn * nsteps // nminibatches lam = 0.95, gamma = 0.99, noptepochs = 10, log_interval = 1, ent_coef = 0.0, - lr = lambda f: 3e-4 * f, + lr = lambda f: 3e-3 * f, cliprange = 0.2, vf_coef = 0.5, max_grad_norm = 0.5, @@ -44,14 +44,14 @@ def mara_mlp(): network = 'mlp', total_timesteps = 1e8, save_interval = 10, - # env_name = 'MARA-v0', - env_name = 'MARAReal-v0', - # env_name = 'MARAOrient-v0', + env_name = 'MARA-v0', + #env_name = 'MARAReal-v0', + #env_name = 'MARAOrient-v0', # env_name = 'MARACollision-v0', # env_name = 'MARACollisionOrient-v0', transfer_path = None, # transfer_path = '/tmp/ros2learn/MARA-v0/ppo2_mlp/2019-02-19_12h47min/checkpoints/best', - trained_path = '/tmp/ros2learn/MARA-v0/ppo2_mlp/2019-03-12_18h29min/checkpoints/02700' + trained_path = '/tmp/ros2learn/MARAOrient-v0/ppo2_mlp/2019-03-26_14h27min/checkpoints/best' ) def mara_lstm(): diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index 3257f3f..8d04566 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -6,6 +6,7 @@ from collections import deque from baselines.common import explained_variance, set_global_seeds from baselines.common.policies import build_policy +import math try: from mpi4py import MPI except ImportError: @@ -130,7 +131,9 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate - lrnow = lr(frac) + f = math.e**(-0.001918*update) + lrnow = lr(f) + #lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch From 592f15118788b477578fdcb1b6a2784fa107022b Mon Sep 17 00:00:00 2001 From: YueErro Date: Wed, 27 Mar 2019 17:48:08 +0100 Subject: [PATCH 12/37] add necessary functions to call gg2.close() --- baselines/bench/monitor.py | 3 +++ baselines/common/vec_env/dummy_vec_env.py | 3 +++ 2 files changed, 6 insertions(+) diff --git 
a/baselines/bench/monitor.py b/baselines/bench/monitor.py index bb34a29..c196d3a 100644 --- a/baselines/bench/monitor.py +++ b/baselines/bench/monitor.py @@ -91,6 +91,9 @@ def close(self): if self.f is not None: self.f.close() + def gg2(self): + return self.env + def get_total_steps(self): return self.total_steps diff --git a/baselines/common/vec_env/dummy_vec_env.py b/baselines/common/vec_env/dummy_vec_env.py index 3ff3246..5b75aa2 100644 --- a/baselines/common/vec_env/dummy_vec_env.py +++ b/baselines/common/vec_env/dummy_vec_env.py @@ -83,6 +83,9 @@ def step_wait_runtime(self): return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones), self.buf_infos.copy()) + def dummy(self): + return self.envs[0] + def reset(self): for e in range(self.num_envs): obs = self.envs[e].reset() From e702be9fa56b8dd933cdcc3f55dd1ee8053a0cca Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Sat, 16 Mar 2019 11:54:47 -0700 Subject: [PATCH 13/37] remove f-strings for python 3.5 compatibility (#854) --- baselines/common/mpi_util.py | 2 +- baselines/common/running_mean_std.py | 2 +- baselines/common/test_mpi_util.py | 2 +- baselines/run.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/baselines/common/mpi_util.py b/baselines/common/mpi_util.py index d05d9cf..ca7044e 100644 --- a/baselines/common/mpi_util.py +++ b/baselines/common/mpi_util.py @@ -123,7 +123,7 @@ def mpi_weighted_mean(comm, local_name2valcount): val = float(val) except ValueError: if comm.rank == 0: - warnings.warn(f'WARNING: tried to compute mean on non-float {name}={val}') + warnings.warn('WARNING: tried to compute mean on non-float {}={}'.format(name, val)) else: name2sum[name] += val * count name2count[name] += count diff --git a/baselines/common/running_mean_std.py b/baselines/common/running_mean_std.py index 6b42e56..963a57f 100644 --- a/baselines/common/running_mean_std.py +++ b/baselines/common/running_mean_std.py @@ -177,7 +177,7 @@ def profile_tf_runningmeanstd(): outfile = '/tmp/timeline.json' with open(outfile, 'wt') as f: f.write(chrome_trace) - print(f'Successfully saved profile to {outfile}. Exiting.') + print('Successfully saved profile to {}. 
Exiting.'.format(outfile)) exit(0) ''' diff --git a/baselines/common/test_mpi_util.py b/baselines/common/test_mpi_util.py index dcabf2e..b1146ab 100644 --- a/baselines/common/test_mpi_util.py +++ b/baselines/common/test_mpi_util.py @@ -17,7 +17,7 @@ def test_mpi_weighted_mean(): d = mpi_util.mpi_weighted_mean(comm, name2valcount) correctval = {'a' : (10 * 2 + 19) / 3.0, 'b' : 20, 'c' : 42} if comm.rank == 0: - assert d == correctval, f'{d} != {correctval}' + assert d == correctval, '{} != {}'.format(d, correctval) for name, (val, count) in name2valcount.items(): for _ in range(count): diff --git a/baselines/run.py b/baselines/run.py index 78fec2f..3fd2e60 100644 --- a/baselines/run.py +++ b/baselines/run.py @@ -231,7 +231,7 @@ def main(args): env.render() done = done.any() if isinstance(done, np.ndarray) else done if done: - print(f'episode_rew={episode_rew}') + print('episode_rew={}'.format(episode_rew)) episode_rew = 0 obs = env.reset() From 77d26aba3ab8061d9e45da59800f8b32fb6f8dd5 Mon Sep 17 00:00:00 2001 From: Jacob Hilton Date: Sun, 24 Mar 2019 12:27:14 -0700 Subject: [PATCH 14/37] build_env now doesn't apply frame stack to retro games twice --- baselines/common/cmd_util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 66c8db8..0990ddd 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -75,6 +75,8 @@ def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1. if env_type == 'atari': env = wrap_deepmind(env, **wrapper_kwargs) elif env_type == 'retro': + if 'frame_stack' not in wrapper_kwargs: + wrapper_kwargs['frame_stack'] = 1 env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs) if reward_scale != 1: From 5bccc3394974f5fd70bfade89a4ce969ed2a5be9 Mon Sep 17 00:00:00 2001 From: Jacob Hilton Date: Mon, 25 Mar 2019 14:33:15 -0700 Subject: [PATCH 15/37] Merge pull request #860 from openai/build-retro-env-framestack-fix run.py framestack bug fix From dd68e1b05ce6f9866d89a02d57774d2088d8bbb8 Mon Sep 17 00:00:00 2001 From: Brett Daley Date: Thu, 28 Mar 2019 12:21:48 -0400 Subject: [PATCH 16/37] Report episode rewards/length in A2C and ACKTR (#856) --- baselines/a2c/a2c.py | 8 +++++++- baselines/a2c/runner.py | 8 ++++++-- baselines/acktr/acktr.py | 6 +++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py index 2d0088e..33b8a21 100644 --- a/baselines/a2c/a2c.py +++ b/baselines/a2c/a2c.py @@ -11,6 +11,8 @@ from baselines.a2c.utils import Scheduler, find_trainable_variables from baselines.a2c.runner import Runner +from baselines.ppo2.ppo2 import safemean +from collections import deque from tensorflow import losses @@ -195,6 +197,7 @@ def learn( # Instantiate the runner object runner = Runner(env, model, nsteps=nsteps, gamma=gamma) + epinfobuf = deque(maxlen=100) # Calculate the batch_size nbatch = nenvs*nsteps @@ -204,7 +207,8 @@ def learn( for update in range(1, total_timesteps//nbatch+1): # Get mini batch of experiences - obs, states, rewards, masks, actions, values = runner.run() + obs, states, rewards, masks, actions, values, epinfos = runner.run() + epinfobuf.extend(epinfos) policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) nseconds = time.time()-tstart @@ -221,5 +225,7 @@ def learn( logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) 
+ logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) + logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model diff --git a/baselines/a2c/runner.py b/baselines/a2c/runner.py index 0181472..c9d610a 100644 --- a/baselines/a2c/runner.py +++ b/baselines/a2c/runner.py @@ -22,6 +22,7 @@ def run(self): # We initialize the lists that will contain the mb of experiences mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] mb_states = self.states + epinfos = [] for n in range(self.nsteps): # Given observations, take action and value (V(s)) # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init @@ -34,7 +35,10 @@ def run(self): mb_dones.append(self.dones) # Take actions in env and look the results - obs, rewards, dones, _ = self.env.step(actions) + obs, rewards, dones, infos = self.env.step(actions) + for info in infos: + maybeepinfo = info.get('episode') + if maybeepinfo: epinfos.append(maybeepinfo) self.states = states self.dones = dones self.obs = obs @@ -69,4 +73,4 @@ def run(self): mb_rewards = mb_rewards.flatten() mb_values = mb_values.flatten() mb_masks = mb_masks.flatten() - return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values + return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos diff --git a/baselines/acktr/acktr.py b/baselines/acktr/acktr.py index a387786..4f6ea4b 100644 --- a/baselines/acktr/acktr.py +++ b/baselines/acktr/acktr.py @@ -14,6 +14,8 @@ from baselines.acktr.runner import Runner from baselines.a2c.utils import Scheduler, find_trainable_variables from baselines.acktr import kfac +from baselines.ppo2.ppo2 import safemean +from collections import deque class Model(object): @@ -119,6 +121,7 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) + epinfobuf = deque(maxlen=100) nbatch = nenvs*nsteps tstart = time.time() coord = tf.train.Coordinator() @@ -135,7 +138,6 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva best_savepath = os.path.join(checkdir, "best") for update in range(1, total_timesteps//nbatch+1): - # obs, states, rewards, masks, actions, values = runner.run() obs, states, rewards, masks, actions, values, epinfos = runner.run() epinfobuf.extend(epinfos) policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) @@ -153,6 +155,8 @@ def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interva logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) + logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) + logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() if save_interval and logger.get_dir(): From a3d2ca3c90fa06ea07976d25fcc1284a753c2591 Mon Sep 17 00:00:00 2001 From: YueErro Date: Sat, 30 Mar 2019 11:13:08 +0100 Subject: [PATCH 17/37] fix joblib dependency after the last rebase --- baselines/common/tf_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index 4643438..12c1934 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -350,6 +350,7 @@ def save_variables(save_path, variables=None, sess=None): 
joblib.dump(save_dict, save_path) def save_trpo_variables(save_path, variables=None, sess=None): + import joblib sess = sess or get_session() variables = variables or tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) From ded3073c7d2edf370596ba6ec4fba54d51af0bdf Mon Sep 17 00:00:00 2001 From: Pastafarianist Date: Fri, 29 Mar 2019 23:25:56 +0300 Subject: [PATCH 18/37] Avoid using default config while requesting available GPUs (#863) --- baselines/common/tf_util.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index 4643438..02d43bf 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -305,12 +305,17 @@ def display_var_info(vars): logger.info("Total model parameters: %0.2f million" % (count_params*1e-6)) -def get_available_gpus(): - # recipe from here: - # https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa +def get_available_gpus(session_config=None): + # based on recipe from https://stackoverflow.com/a/38580201 + + # Unless we allocate a session here, subsequent attempts to create one + # will ignore our custom config (in particular, allow_growth=True will have + # no effect). + if session_config is None: + session_config = get_session()._config from tensorflow.python.client import device_lib - local_device_protos = device_lib.list_local_devices() + local_device_protos = device_lib.list_local_devices(session_config) return [x.name for x in local_device_protos if x.device_type == 'GPU'] # ================================================================ From cd3c6845491408f2b8dd1638d5f4174d33ae6152 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Tue, 2 Apr 2019 06:37:32 +0800 Subject: [PATCH 19/37] ppo2: use time.perf_counter() instead of time.time() for time measurement (#847) --- baselines/ppo2/ppo2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index d26c957..48db7c8 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -122,13 +122,13 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 best_savepath = osp.join(checkdir, 'best') # Start total timer - tfirststart = time.time() + tfirststart = time.perf_counter() nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 # Start timer - tstart = time.time() + tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate f = math.e**(-0.001918*update) @@ -180,7 +180,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer - tnow = time.time() + tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) From 066a6b1d235db7cbd1ead884231e0fd4de00d999 Mon Sep 17 00:00:00 2001 From: Yu Feng Date: Mon, 1 Apr 2019 15:38:45 -0700 Subject: [PATCH 20/37] MPI refer to workers as ranks, not threads. 
(#833) --- baselines/gail/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baselines/gail/README.md b/baselines/gail/README.md index 2a8941f..1c1c1bb 100644 --- a/baselines/gail/README.md +++ b/baselines/gail/README.md @@ -12,13 +12,13 @@ Download the expert data into `./data`, [download link](https://drive.google.com ### Step 2: Run GAIL -Run with single thread: +Run with single rank: ```bash python -m baselines.gail.run_mujoco ``` -Run with multiple threads: +Run with multiple ranks: ```bash mpirun -np 16 python -m baselines.gail.run_mujoco From 3547d0dc47b6ab30db52bdfeace19bf4acfde78d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dar=C3=ADo=20Here=C3=B1=C3=BA?= Date: Mon, 1 Apr 2019 19:41:52 -0300 Subject: [PATCH 21/37] Fixed typo on #092 (#824) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5330c20..a1ff65f 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ python -m baselines.run --alg=ppo2 --env=Humanoid-v2 --network=mlp --num_timeste will set entropy coefficient to 0.1, and construct fully connected network with 3 layers with 32 hidden units in each, and create a separate network for value function estimation (so that its parameters are not shared with the policy network, but the structure is the same) See docstrings in [common/models.py](baselines/common/models.py) for description of network parameters for each type of model, and -docstring for [baselines/ppo2/ppo2.py/learn()](baselines/ppo2/ppo2.py#L152) for the description of the ppo2 hyperparamters. +docstring for [baselines/ppo2/ppo2.py/learn()](baselines/ppo2/ppo2.py#L152) for the description of the ppo2 hyperparameters. ### Example 2. DQN on Atari DQN with Atari is at this point a classics of benchmarks. To run the baselines implementation of DQN on Atari Pong: From 60510c4717862e99b4a6e34863dc8820a6291a0d Mon Sep 17 00:00:00 2001 From: Mingfei Date: Tue, 2 Apr 2019 06:44:31 +0800 Subject: [PATCH 22/37] fix bugs: obs_ph normalization in adversary.py (#823) * fix bugs: obs_ph normalization in adversary.py * fix bug in reshape obs and acs in Mujobo_Dset --- baselines/gail/dataset/mujoco_dset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/baselines/gail/dataset/mujoco_dset.py b/baselines/gail/dataset/mujoco_dset.py index 0693262..2ada872 100644 --- a/baselines/gail/dataset/mujoco_dset.py +++ b/baselines/gail/dataset/mujoco_dset.py @@ -50,8 +50,12 @@ def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomiz # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length # and S is the environment observation/action space. 
# Flatten to (N * L, prod(S)) - self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])]) - self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])]) + if len(obs.shape[2:]) != 0: + self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])]) + self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])]) + else: + self.obs = np.vstack(obs) + self.acs = np.vstack(acs) self.rets = traj_data['ep_rets'][:traj_limitation] self.avg_ret = sum(self.rets)/len(self.rets) From 922249a804614bfebddc6db64b1cc70120166123 Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Mon, 1 Apr 2019 15:47:13 -0700 Subject: [PATCH 23/37] neaten up stacking logic in mujoco_dset in gail --- baselines/gail/dataset/mujoco_dset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/gail/dataset/mujoco_dset.py b/baselines/gail/dataset/mujoco_dset.py index 2ada872..f5e9c27 100644 --- a/baselines/gail/dataset/mujoco_dset.py +++ b/baselines/gail/dataset/mujoco_dset.py @@ -50,7 +50,7 @@ def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomiz # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length # and S is the environment observation/action space. # Flatten to (N * L, prod(S)) - if len(obs.shape[2:]) != 0: + if len(obs.shape) > 2: self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])]) self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])]) else: From 00f09d8e3ee585c7efc06a14cadb17d8fb1fb280 Mon Sep 17 00:00:00 2001 From: "Hao-Chih, Lin" <8316182+haochihlin@users.noreply.github.com> Date: Tue, 2 Apr 2019 00:48:35 +0200 Subject: [PATCH 24/37] fix small bug in plot_results() (#864) Remove the comma behind the last input argument --- baselines/common/plot_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/common/plot_util.py b/baselines/common/plot_util.py index 6a7d3d1..efd2e38 100644 --- a/baselines/common/plot_util.py +++ b/baselines/common/plot_util.py @@ -248,7 +248,7 @@ def plot_results( figsize=None, legend_outside=False, resample=0, - smooth_step=1.0, + smooth_step=1.0 ): ''' Plot multiple Results objects From d452993a772d42445e0c37a53aafff298b702915 Mon Sep 17 00:00:00 2001 From: JongGyun Kim Date: Tue, 2 Apr 2019 07:49:25 +0900 Subject: [PATCH 25/37] fix the definition of `TfInput.make_feed_dict`. 
(#812) --- baselines/deepq/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/deepq/utils.py b/baselines/deepq/utils.py index faf3c5e..6b3449e 100644 --- a/baselines/deepq/utils.py +++ b/baselines/deepq/utils.py @@ -20,7 +20,7 @@ def get(self): """ raise NotImplementedError - def make_feed_dict(data): + def make_feed_dict(self, data): """Given data input it to the placeholder(s).""" raise NotImplementedError From 1be03a66590dec4e581b8846b544d4a89a3fde30 Mon Sep 17 00:00:00 2001 From: Sridhar Thiagarajan Date: Mon, 1 Apr 2019 16:24:02 -0700 Subject: [PATCH 26/37] Interface for U.make_session changed (#865) --- baselines/deepq/experiments/custom_cartpole.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/deepq/experiments/custom_cartpole.py b/baselines/deepq/experiments/custom_cartpole.py index b5a381a..fb3d5f8 100644 --- a/baselines/deepq/experiments/custom_cartpole.py +++ b/baselines/deepq/experiments/custom_cartpole.py @@ -23,7 +23,7 @@ def model(inpt, num_actions, scope, reuse=False): if __name__ == '__main__': - with U.make_session(8): + with U.make_session(num_cpu=8): # Create the environment env = gym.make("CartPole-v0") # Create all the functions necessary to train the model From 5205590f3b11c3c75d38de097cb0aced7c05001b Mon Sep 17 00:00:00 2001 From: EliasBarba Date: Fri, 5 Apr 2019 17:25:28 +0200 Subject: [PATCH 27/37] add posibility to add stats --- baselines/ppo2/defaults.py | 10 +++++----- baselines/ppo2/ppo2.py | 8 +++++++- baselines/ppo2/runner.py | 8 ++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py index 54b224b..0014363 100644 --- a/baselines/ppo2/defaults.py +++ b/baselines/ppo2/defaults.py @@ -36,8 +36,8 @@ def mara_mlp(): log_interval = 1, ent_coef = 0.0, lr = lambda f: 3e-3 * f, - cliprange = 0.2, - vf_coef = 0.5, + cliprange = 0.25, + vf_coef = 1, max_grad_norm = 0.5, seed = 0, value_network = 'copy', @@ -51,7 +51,7 @@ def mara_mlp(): # env_name = 'MARACollisionOrient-v0', transfer_path = None, # transfer_path = '/tmp/ros2learn/MARA-v0/ppo2_mlp/2019-02-19_12h47min/checkpoints/best', - trained_path = '/tmp/ros2learn/MARAOrient-v0/ppo2_mlp/2019-03-26_14h27min/checkpoints/best' + trained_path = '/tmp/ros2learn/MARA-v0/ppo2_mlp/2019-04-02_13h18min/checkpoints/best' ) def mara_lstm(): @@ -79,8 +79,8 @@ def mara_lstm(): network = 'lstm', total_timesteps = 1e8, save_interval = 10, - env_name = 'MARACollisionOrientRandomTarget-v0', - num_envs = 2, + env_name = 'MARA-v0', + num_envs = 4, transfer_path = None, # transfer_path = '/tmp/ros2learn/MARACollisionOrientRandomTarget-v0/ppo2_lstm/checkpoints/00090', trained_path = '/tmp/ros2learn/MARACollisionOrientRandomTarget-v0/ppo2_lstm/checkpoints/00090' diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index d26c957..3848fff 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -138,6 +138,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 + if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 @@ -194,7 +195,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) 
mean_rewbuffer = safemean([epinfo['r'] for epinfo in epinfobuf]) - logger.logkv('eprewmean', mean_rewbuffer) + logger.logkv('eprewmean_smooth', mean_rewbuffer) logger.logkv('eprewsem', np.std([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: @@ -203,6 +204,11 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) + + key_set = [key for key in list(epinfobuf)[0].keys() if key not in ["r", "l", "t"]] + for key in key_set: + logger.logkv(key, list(epinfobuf)[-1][key]) + if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() diff --git a/baselines/ppo2/runner.py b/baselines/ppo2/runner.py index 5a30505..2f26987 100644 --- a/baselines/ppo2/runner.py +++ b/baselines/ppo2/runner.py @@ -22,6 +22,7 @@ def run(self): mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[] mb_states = self.states epinfos = [] + maybeepinfo = [] # For n in range number of steps for _ in range(self.nsteps): # Given observations, get action value and neglopacs @@ -37,8 +38,9 @@ def run(self): # Infos contains a ton of useful informations self.obs[:], rewards, self.dones, infos = self.env.step(actions) for info in infos: - maybeepinfo = info.get('episode') - if maybeepinfo: epinfos.append(maybeepinfo) + for key in info.keys(): + maybeepinfo.append(info.get(key)) + if maybeepinfo: epinfos.append({key:dict[key] for dict in maybeepinfo for key in dict}) mb_rewards.append(rewards) #batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype) @@ -72,5 +74,3 @@ def sf01(arr): """ s = arr.shape return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:]) - - From a47368cc1520aefc12984acbdf50d3cfd7e3a69f Mon Sep 17 00:00:00 2001 From: EliasBarba Date: Mon, 8 Apr 2019 10:41:37 +0200 Subject: [PATCH 28/37] refactor stats addition --- baselines/ppo2/ppo2.py | 2 +- baselines/ppo2/runner.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py index 3848fff..380ae9a 100644 --- a/baselines/ppo2/ppo2.py +++ b/baselines/ppo2/ppo2.py @@ -205,7 +205,7 @@ def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2 for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) - key_set = [key for key in list(epinfobuf)[0].keys() if key not in ["r", "l", "t"]] + key_set = [key for key in list(epinfobuf)[-1].keys() if key not in ["r", "l", "t"]] for key in key_set: logger.logkv(key, list(epinfobuf)[-1][key]) diff --git a/baselines/ppo2/runner.py b/baselines/ppo2/runner.py index 2f26987..deb266c 100644 --- a/baselines/ppo2/runner.py +++ b/baselines/ppo2/runner.py @@ -38,9 +38,8 @@ def run(self): # Infos contains a ton of useful informations self.obs[:], rewards, self.dones, infos = self.env.step(actions) for info in infos: - for key in info.keys(): - maybeepinfo.append(info.get(key)) - if maybeepinfo: epinfos.append({key:dict[key] for dict in maybeepinfo for key in dict}) + maybeepinfo = {sub_key:info[key][sub_key] for key in info.keys() for sub_key in info[key]} + if maybeepinfo: epinfos.append(maybeepinfo) mb_rewards.append(rewards) #batch of steps to batch of rollouts mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype) From e4a4b21f97af549d5cd268c8c1d72fb8bc862e99 Mon Sep 17 00:00:00 2001 From: EliasBarba Date: 
Mon, 8 Apr 2019 10:46:26 +0200 Subject: [PATCH 29/37] remove unused initialization --- baselines/ppo2/runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/baselines/ppo2/runner.py b/baselines/ppo2/runner.py index deb266c..0529da3 100644 --- a/baselines/ppo2/runner.py +++ b/baselines/ppo2/runner.py @@ -22,7 +22,6 @@ def run(self): mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[] mb_states = self.states epinfos = [] - maybeepinfo = [] # For n in range number of steps for _ in range(self.nsteps): # Given observations, get action value and neglopacs From 0052c1551b11e305846163b97e9cb0c67886ed1d Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Fri, 5 Apr 2019 14:43:09 -0700 Subject: [PATCH 30/37] parse colon-separated env_id's --- baselines/common/cmd_util.py | 1 - baselines/run.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/baselines/common/cmd_util.py b/baselines/common/cmd_util.py index 0990ddd..60ce2b6 100644 --- a/baselines/common/cmd_util.py +++ b/baselines/common/cmd_util.py @@ -149,7 +149,6 @@ def common_arg_parser(): parser.add_argument('--save_video_interval', help='Save video every x steps (0 = disabled)', default=0, type=int) parser.add_argument('--save_video_length', help='Length of recorded video. Default: 200', default=200, type=int) parser.add_argument('--play', default=False, action='store_true') - parser.add_argument('--extra_import', help='Extra module to import to access external environments', type=str, default=None) return parser def robotics_arg_parser(): diff --git a/baselines/run.py b/baselines/run.py index 3fd2e60..297df69 100644 --- a/baselines/run.py +++ b/baselines/run.py @@ -1,4 +1,5 @@ import sys +import re import multiprocessing import os.path as osp import gym @@ -136,6 +137,8 @@ def get_env_type(args): if env_id in e: env_type = g break + if ':' in env_id: + env_type = re.sub(r':.*', '', env_id) assert env_type is not None, 'env_id {} is not recognized in env types'.format(env_id, _game_envs.keys()) return env_type, env_id @@ -196,9 +199,6 @@ def main(args): args, unknown_args = arg_parser.parse_known_args(args) extra_args = parse_cmdline_kwargs(unknown_args) - if args.extra_import is not None: - import_module(args.extra_import) - if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: rank = 0 logger.configure() From b9550547c0066860489835e57bdc12fc84108913 Mon Sep 17 00:00:00 2001 From: Tim Zaman Date: Fri, 5 Apr 2019 14:46:01 -0700 Subject: [PATCH 31/37] Add eps to normalization (#797) --- baselines/ddpg/ddpg_learner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/ddpg/ddpg_learner.py b/baselines/ddpg/ddpg_learner.py index 3fc8a77..9058334 100755 --- a/baselines/ddpg/ddpg_learner.py +++ b/baselines/ddpg/ddpg_learner.py @@ -17,7 +17,7 @@ def normalize(x, stats): if stats is None: return x - return (x - stats.mean) / stats.std + return (x - stats.mean) / (stats.std + 1e-8) def denormalize(x, stats): From 60137f472b7571b73a5c68db8113cbe5922dd5ef Mon Sep 17 00:00:00 2001 From: Xingdong Zuo Date: Sat, 6 Apr 2019 00:16:26 +0200 Subject: [PATCH 32/37] [Update misc_util.py]: clean up unused helper functions (#751) * Update misc_util.py * Update misc_util.py --- baselines/common/misc_util.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/baselines/common/misc_util.py b/baselines/common/misc_util.py index 6a296d4..48bc3de 100644 --- a/baselines/common/misc_util.py +++ b/baselines/common/misc_util.py @@ -13,27 +13,6 @@ def zipsame(*seqs): return 
zip(*seqs) -def unpack(seq, sizes): - """ - Unpack 'seq' into a sequence of lists, with lengths specified by 'sizes'. - None = just one bare element, not a list - - Example: - unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) - """ - seq = list(seq) - it = iter(seq) - assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) - for size in sizes: - if size is None: - yield it.__next__() - else: - li = [] - for _ in range(size): - li.append(it.__next__()) - yield li - - class EzPickle(object): """Objects that are pickled and unpickled via their constructor arguments. From 4385c198b165a234fb0b4f0eecc6d28d44cc5c62 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Fri, 5 Apr 2019 15:18:15 -0700 Subject: [PATCH 33/37] short-circuit framestack wrapper with size 1 (#871) --- baselines/common/retro_wrappers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/baselines/common/retro_wrappers.py b/baselines/common/retro_wrappers.py index 2c42926..badbbdd 100644 --- a/baselines/common/retro_wrappers.py +++ b/baselines/common/retro_wrappers.py @@ -215,7 +215,8 @@ def wrap_deepmind_retro(env, scale=True, frame_stack=4): """ env = WarpFrame(env) env = ClipRewardEnv(env) - env = FrameStack(env, frame_stack) + if frame_stack > 1: + env = FrameStack(env, frame_stack) if scale: env = ScaledFloatFrame(env) return env From 5d85341151e0f186fb6f58c5422c6cb876b76514 Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Fri, 5 Apr 2019 15:23:46 -0700 Subject: [PATCH 34/37] fix shuffling bug in ppo1 --- baselines/ppo1/pposgd_simple.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/ppo1/pposgd_simple.py b/baselines/ppo1/pposgd_simple.py index 505b652..ffb52c0 100644 --- a/baselines/ppo1/pposgd_simple.py +++ b/baselines/ppo1/pposgd_simple.py @@ -195,7 +195,7 @@ def learn(env, policy_fn, *, ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate - d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) + d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), deterministic=pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy From f1457bdb3322fe158d06c5209e46002739ffa843 Mon Sep 17 00:00:00 2001 From: Peter Zhokhov Date: Sat, 6 Apr 2019 20:03:32 -0700 Subject: [PATCH 35/37] fix commit on atari bms page to point to a public commit --- benchmarks_atari10M.htm | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks_atari10M.htm b/benchmarks_atari10M.htm index 9228953..626fc0b 100644 --- a/benchmarks_atari10M.htm +++ b/benchmarks_atari10M.htm @@ -120,7 +120,7 @@

 Atari10M Comparison
 114.26
-cbd21ef
+7bfbcf1
 [the same one-line change — cbd21ef replaced by 7bfbcf1 — repeats in the hunks at lines 152, 184, 216, 248, 280 and 312, i.e. in the Atari10M Comparison rows scoring 131.46, 113.58, 82.94, 81.61, 59.72 and 14.98]

From 2072d7ec7dd6bd769ad4d042c1647102d3be6ae2 Mon Sep 17 00:00:00 2001
From: Nestor Gonzalez
Date: Thu, 11 Apr 2019 11:20:54 +0700
Subject: [PATCH 36/37] mara_mlp nminibatches = 4

---
 baselines/ppo2/defaults.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py
index 0014363..cb34da7 100644
--- a/baselines/ppo2/defaults.py
+++ b/baselines/ppo2/defaults.py
@@ -29,7 +29,7 @@ def mara_mlp():
        num_hidden = 16,
        layer_norm = False,
        nsteps = 1024,#2048
-       nminibatches = 8, #batchsize = nevn * nsteps // nminibatches
+       nminibatches = 4, #batchsize = nevn * nsteps // nminibatches
        lam = 0.95,
        gamma = 0.99,
        noptepochs = 10,

From 6cf3d43a68404827e522b4fc7717a7c362ab383d Mon Sep 17 00:00:00 2001
From: Nestor Gonzalez
Date: Thu, 11 Apr 2019 14:06:45 +0700
Subject: [PATCH 37/37] Increase total_timesteps in mara_mlp

---
 baselines/ppo2/defaults.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/baselines/ppo2/defaults.py b/baselines/ppo2/defaults.py
index cb34da7..bd2c59c 100644
--- a/baselines/ppo2/defaults.py
+++ b/baselines/ppo2/defaults.py
@@ -42,7 +42,7 @@ def mara_mlp():
        seed = 0,
        value_network = 'copy',
        network = 'mlp',
-       total_timesteps = 1e6,
+       total_timesteps = 1e8,
        save_interval = 10,
        env_name = 'MARA-v0',
        #env_name = 'MARAReal-v0',
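For orientation, a minimal sketch (not part of any patch) of what the final mara_mlp defaults imply for PPO2's batch sizes and update count. nsteps, nminibatches and total_timesteps are taken from the last two diffs; nenvs is an assumed example value here (the real number of parallel environments comes from the launcher configuration), and the arithmetic simply follows the `batchsize = nenv * nsteps // nminibatches` comment in defaults.py.

```python
# Sketch only: nenvs is an assumed example value; the other numbers are the
# mara_mlp defaults after the last two patches.
nenvs = 8                                # assumption: number of parallel envs
nsteps = 1024                            # steps collected per env per update
nminibatches = 4                         # minibatches per optimization epoch
total_timesteps = int(1e8)               # total environment steps to train for

nbatch = nenvs * nsteps                  # samples gathered per update: 8192
nbatch_train = nbatch // nminibatches    # SGD minibatch size: 2048
nupdates = total_timesteps // nbatch     # number of PPO updates: 12207

print(nbatch, nbatch_train, nupdates)    # -> 8192 2048 12207
```

With these numbers, halving nminibatches (8 to 4) doubles the minibatch size per gradient step, and raising total_timesteps from 1e6 to 1e8 raises the number of updates by two orders of magnitude (122 to 12207).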