From addafb1a9ec9f3ebb659fd4eed0417bbecd4fa6b Mon Sep 17 00:00:00 2001
From: Gaiejj <524339208@qq.com>
Date: Wed, 23 Aug 2023 02:12:33 +0800
Subject: [PATCH] docs: refine typo in docs

---
 Makefile                                | 35 ++++++++++++---
 docs/requirements.txt                   |  1 +
 docs/source/algorithms/comparision.rst  |  2 +-
 docs/source/algorithms/curve.rst        |  6 +--
 docs/source/algorithms/first_order.rst  | 14 +++---
 docs/source/algorithms/lag.rst          |  6 +--
 docs/source/api/buffer.rst              |  2 +-
 docs/source/api/env.rst                 |  2 +-
 docs/source/api/logger.rst              |  2 +-
 docs/source/conf.py                     |  5 +++
 docs/source/index.rst                   |  2 +-
 docs/{ => source}/spelling_wordlist.txt | 22 ++++++++-
 docs/source/usage/eval.rst              |  4 +-
 docs/source/usage/implement.rst         | 10 ++---
 docs/source/usage/make.rst              | 10 ++---
 docs/source/usage/train.rst             | 60 ++++++++++++-------------
 16 files changed, 116 insertions(+), 67 deletions(-)
 rename docs/{ => source}/spelling_wordlist.txt (94%)

diff --git a/Makefile b/Makefile
index 5e8b7bf..51dc08e 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,8 @@ default: install
 check_pip_install = $(PYTHON) -m pip show $(1) &>/dev/null || (cd && $(PYTHON) -m pip install $(1) --upgrade)
 check_pip_install_extra = $(PYTHON) -m pip show $(1) &>/dev/null || (cd && $(PYTHON) -m pip install $(2) --upgrade)

+# Installations
+
 install:
	$(PYTHON) -m pip install -vvv .

@@ -26,6 +28,24 @@ install-editable:

 install-e: install-editable  # alias

+docs-install:
+	$(call check_pip_install_extra,pydocstyle,pydocstyle[toml])
+	$(call check_pip_install,doc8)
+	$(call check_pip_install,sphinx)
+	$(call check_pip_install,sphinx-autoapi)
+	$(call check_pip_install,sphinx-autobuild)
+	$(call check_pip_install,sphinx-copybutton)
+	$(call check_pip_install,sphinx-autodoc-typehints)
+	$(call check_pip_install_extra,sphinxcontrib-spelling,sphinxcontrib-spelling pyenchant)
+	$(PYTHON) -m pip install -r docs/requirements.txt
+
+pytest-install:
+	$(call check_pip_install,pytest)
+	$(call check_pip_install,pytest-cov)
+	$(call check_pip_install,pytest-xdist)
+
+# Benchmark
+
 multi-benchmark:
	cd safepo/multi_agent && $(PYTHON) benchmark.py --total-steps 10000000 --experiment benchmark

@@ -64,13 +84,16 @@ test-benchmark: install-editable multi-test-benchmark single-test-benchmark plot

 benchmark: install-editable multi-benchmark single-benchmark plot eval

-pytest-install:
-	$(call check_pip_install,pytest)
-	$(call check_pip_install,pytest-cov)
-	$(call check_pip_install,pytest-xdist)
-
 pytest: pytest-install
	cd tests && \
	$(PYTHON) -m pytest --verbose --color=yes --durations=0 \
		--cov="../safepo" --cov-config=.coveragerc --cov-report=xml --cov-report=term-missing \
		$(PYTESTOPTS) .
+
+# Documentation
+
+docs: docs-install
+	$(PYTHON) -m sphinx_autobuild --watch $(PROJECT_PATH) --open-browser docs/source docs/build
+
+spelling: docs-install
+	$(PYTHON) -m sphinx_autobuild -b spelling docs/source docs/build
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
index a33cd38..efaf544 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -7,3 +7,4 @@ sphinx-design
 moviepy
 pygame
 sphinx_github_changelog
+sphinxcontrib-spelling
\ No newline at end of file
diff --git a/docs/source/algorithms/comparision.rst b/docs/source/algorithms/comparision.rst
index 952927b..4e12261 100644
--- a/docs/source/algorithms/comparision.rst
+++ b/docs/source/algorithms/comparision.rst
@@ -13,7 +13,7 @@ We have compared the following algorithms:
 - ``CPO``: `OpenAI Baselines: Safety Starter Agents `_, `RL Safety Algorithms `_
 - ``FOCOPS``: `Original Implementation `_

-We compared those alforithms in 12 tasks from `Safety-Gymnasium `_,
+We compared those algorithms in 12 tasks from `Safety-Gymnasium `_,
 they are:

 - ``SafetyPointButton1-v0``
diff --git a/docs/source/algorithms/curve.rst b/docs/source/algorithms/curve.rst
index 1cd785f..c35da71 100644
--- a/docs/source/algorithms/curve.rst
+++ b/docs/source/algorithms/curve.rst
@@ -4,7 +4,7 @@ Training Curves
 Safe reinforcement learning algorithms are designed to achieve high reward while satisfying the safety constraint.
 In this section, we evaluate the performance of SafePO's algorithms on the various environments in `Safety-Gymnasium `_.

-Single Agent
+Single-Agent
 ------------

 First order
@@ -97,8 +97,8 @@ Second order



-Muilti-Agent
-------------
+Multi-Agent
+-----------

 .. tab-set::
diff --git a/docs/source/algorithms/first_order.rst b/docs/source/algorithms/first_order.rst
index 49f058a..5c9b4e0 100644
--- a/docs/source/algorithms/first_order.rst
+++ b/docs/source/algorithms/first_order.rst
@@ -32,7 +32,7 @@ Implementation Details

 .. note::

-    All experiemnts are ran under total 1e7 steps, while in the `Doggo `_ agent, 1e8 steps are used.
+    All experiments are run for a total of 1e7 steps, while for the `Doggo `_ agent, 1e8 steps are used.
     This setting is the same as `Safety-Gym `_

 Environment Wrapper
@@ -81,8 +81,8 @@ of observations, rewards and costs:
 Lagrangian Multiplier
 ~~~~~~~~~~~~~~~~~~~~~

-Lagreangian-based alforithms use ``Lagrangian Multiplier`` to control the safety
-constraint. The ``Lagrangian Multiplier`` is an intergrated part of
+Lagrangian-based algorithms use the ``Lagrangian Multiplier`` to control the safety
+constraint. The ``Lagrangian Multiplier`` is an integral part of
 SafePO.

 Some key points:
@@ -132,9 +132,9 @@ We provide how ``SafePO`` implements the two stage projection:

     .. tab-item:: CUP

-        CUP first make a PPO update to imporve the policy reward.
+        CUP first makes a PPO update to improve the policy reward.
         Then it projects the policy back to the safe set.
-        We will foccus on the projection part.
+        We will focus on the projection part.

         - Get the cost advantage from buffer and prepare training data.

@@ -149,7 +149,7 @@ We provide how ``SafePO`` implements the two stage projection:
                shuffle=True,
            )

-        - Update the policy by using cost adavantage and kl divergence.
+        - Update the policy by using the cost advantage and KL divergence.

         .. code:: python

@@ -177,7 +177,7 @@ We provide how ``SafePO`` implements the two stage projection:
                distribution, old_distribution_b
            ).sum(-1, keepdim=True)

-        - Then, update the policy by using cost adavantage and kl divergence.
+        - Then, update the policy by using the cost advantage and KL divergence.

         .. code:: python
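
For readers following the two-stage description above, the projection stage can be
sketched in a few lines. This is a self-contained toy illustration rather than the
SafePO implementation: the tensors are random stand-ins for buffer data, and names
such as ``actor`` and ``lagrangian_multiplier`` are placeholders.

.. code:: python

    import torch
    from torch.distributions import Normal, kl_divergence

    torch.manual_seed(0)

    # Toy stand-ins for quantities that would come out of the buffer.
    batch = 64
    obs = torch.randn(batch, 8)
    act = torch.randn(batch, 2)
    cost_adv = torch.randn(batch)              # cost advantage estimates
    old_dist = Normal(torch.zeros(batch, 2), torch.ones(batch, 2))
    old_logp = old_dist.log_prob(act).sum(-1)
    lagrangian_multiplier = torch.tensor(0.5)  # assumed to be tracked elsewhere

    actor = torch.nn.Linear(8, 2)              # toy policy head (Gaussian mean)
    optimizer = torch.optim.Adam(actor.parameters(), lr=3e-4)

    for _ in range(10):
        dist = Normal(actor(obs), torch.ones(batch, 2))
        logp = dist.log_prob(act).sum(-1)
        ratio = torch.exp(logp - old_logp)
        kl = kl_divergence(old_dist, dist).sum(-1)
        # Push the expected cost down while staying close (in KL) to the policy
        # produced by the preceding reward-improvement step.
        loss = (lagrangian_multiplier * ratio * cost_adv + kl).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

The essential point is the shape of the loss: a cost surrogate weighted by the
multiplier, plus a KL term that keeps the projected policy close to the one obtained
from the reward update.
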
diff --git a/docs/source/algorithms/lag.rst b/docs/source/algorithms/lag.rst
index 7bdde4c..c4e4746 100644
--- a/docs/source/algorithms/lag.rst
+++ b/docs/source/algorithms/lag.rst
@@ -32,7 +32,7 @@ Implement Details

 .. note::

-    All experiemnts are ran under total 1e7 steps, while in the `Doggo `_ agent, 1e8 steps are used.
+    All experiments are run for a total of 1e7 steps, while for the `Doggo `_ agent, 1e8 steps are used.
     This setting is the same as `Safety-Gym `_

 Environment Wrapper
@@ -81,8 +81,8 @@ of observations, rewards and costs:
 Lagrangian Multiplier
 ~~~~~~~~~~~~~~~~~~~~~

-Lagreangian-based alforithms use ``Lagrangian Multiplier`` to control the safety
-constraint. The ``Lagrangian Multiplier`` is an intergrated part of
+Lagrangian-based algorithms use the ``Lagrangian Multiplier`` to control the safety
+constraint. The ``Lagrangian Multiplier`` is an integral part of
 SafePO.

 Some key points:
diff --git a/docs/source/api/buffer.rst b/docs/source/api/buffer.rst
index 25ed0de..b4b7081 100644
--- a/docs/source/api/buffer.rst
+++ b/docs/source/api/buffer.rst
@@ -3,7 +3,7 @@ Buffer

 .. currentmodule:: safepo.common.buffer

-Single Agent Buffer
+Single-Agent Buffer
 -------------------

 .. autoclass:: VectorizedOnPolicyBuffer
diff --git a/docs/source/api/env.rst b/docs/source/api/env.rst
index fb56160..183e06a 100644
--- a/docs/source/api/env.rst
+++ b/docs/source/api/env.rst
@@ -3,7 +3,7 @@ Environment Maker

 .. currentmodule:: safepo.common.env

-Single Agent Environment
+Single-Agent Environment
 ------------------------

 MuJoCo Environment
diff --git a/docs/source/api/logger.rst b/docs/source/api/logger.rst
index ca139b0..6071b71 100644
--- a/docs/source/api/logger.rst
+++ b/docs/source/api/logger.rst
@@ -1,7 +1,7 @@
 Logger
 ======

-Simple usage
+Simple Usage
 ------------

 .. code-block:: python
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e3682cc..4c7b37a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -6,6 +6,7 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 import pathlib
+import os
 import sys


@@ -33,6 +34,10 @@
     'sphinx_design',
 ]

+if not os.getenv('READTHEDOCS', None):
+    extensions.append('sphinxcontrib.spelling')
+
+source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'}
 templates_path = ['_templates']
 exclude_patterns = []

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 59c6511..4aa9bc2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -41,7 +41,7 @@ One line to run SafePO benchmark:
     make benchmark

 Then you can check the runs in ``safepo/runs``. After that, you can check the
-results (eavluation outcomes, training curves) in ``safepo/results``.
+results (evaluation outcomes, training curves) in ``safepo/results``.

 .. toctree::
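
The ``Lagrangian Multiplier`` sections above describe a multiplier that is kept
non-negative and grows when the observed episode cost exceeds the cost limit. A
minimal, self-contained sketch of that idea is shown below; it is not the
``safepo.common.lagrange`` implementation, and the class name, learning rate, and
initial value are illustrative only.

.. code:: python

    import torch


    class ToyLagrange:
        """Toy Lagrangian multiplier updated by gradient ascent on the violation."""

        def __init__(self, cost_limit: float, lr: float = 0.035, init: float = 0.001) -> None:
            self.cost_limit = cost_limit
            self.lam = torch.nn.Parameter(torch.tensor(init))
            self.optimizer = torch.optim.Adam([self.lam], lr=lr)

        @property
        def multiplier(self) -> float:
            # Read the multiplier out projected back to the feasible region (lambda >= 0).
            return max(self.lam.item(), 0.0)

        def update(self, mean_ep_cost: float) -> None:
            # Gradient ascent on lambda * (J_c - d), written as a loss to minimize.
            loss = -self.lam * (mean_ep_cost - self.cost_limit)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()


    lagrange = ToyLagrange(cost_limit=25.0)
    lagrange.update(mean_ep_cost=30.0)  # cost above the limit, so the multiplier grows
    print(lagrange.multiplier)

The multiplier produced this way is then used to weight the cost term in the policy
update, so the penalty strengthens automatically whenever the constraint is violated.
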
diff --git a/docs/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt
similarity index 94%
rename from docs/spelling_wordlist.txt
rename to docs/source/spelling_wordlist.txt
index dd53fd5..1236177 100644
--- a/docs/spelling_wordlist.txt
+++ b/docs/source/spelling_wordlist.txt
@@ -18,7 +18,7 @@ algos
 config
 configs
 timestep
-timesteps
+steps
 rollout
 GAE
 PPO
@@ -405,3 +405,23 @@ Unsqueeze
 rescales
 affinely
 rescales
+eval
+dir
+cpu
+tensorboard
+rollout
+benchmarking
+conda
+num
+rnn
+probs
+randn
+csv
+hyperparameters
+reproducibility
+Dimensionality
+Normalizer
+Stooke
+pkl
+serializable
+subclasses
diff --git a/docs/source/usage/eval.rst b/docs/source/usage/eval.rst
index e77e3ca..c4477aa 100644
--- a/docs/source/usage/eval.rst
+++ b/docs/source/usage/eval.rst
@@ -16,7 +16,7 @@ This will evaluate the model in the last checkpoint of the training, and save th
 Training Curve Plotter
 ----------------------

-Training curves reveal the episodic reward and cost overtime, which is usefull to evaluate the performance of the algorithms.
+Training curves reveal the episodic reward and cost over time, which is useful to evaluate the performance of the algorithms.
 suppose you have ran the training script in `algorithms training <./train.html>`_ and saved the training log in `safepo/runs/ppo_lag_exp`, then you can plot
 the training curve by running:
@@ -27,7 +27,7 @@ suppose you have ran the training script in `algorithms training <./train.html>`
 .. note::

-    This plotter is also suitable for mmulti-agent algorithms plotting. However, in experiment we found that
+    This plotter is also suitable for multi-agent algorithms plotting. However, in experiment we found that
     the cost value training curve of multi-agent safe and unsafe algorithms are largely different, which makes
     the plot not very clear. So we recommend to plot the multi-agent training curve by running the plotter in
     ``safepo/multi_agent/plot_for_benchmark``.
diff --git a/docs/source/usage/implement.rst b/docs/source/usage/implement.rst
index ba1ebb1..496c812 100644
--- a/docs/source/usage/implement.rst
+++ b/docs/source/usage/implement.rst
@@ -18,20 +18,20 @@ To verify the correctness of the classic RL algorithms, we provide the performan



-Intergrated Safe RL Pipeline
+Integrated Safe RL Pipeline
 ----------------------------

 SafePO's classic RL algorithms are integrated with the Safe RL pipeline, though they make no use of the constraint.
 You can customize the Safe RL algorithms based on the classic RL algorithms.
-Breifly, the ``PPO`` in SafePO has the following characteristics, which are also suitable for other customization of safe RL algorithms.
+Briefly, the ``PPO`` in SafePO has the following characteristics, which are also suitable for other customizations of safe RL algorithms.

-- ``VectorizedOnPolicyBuffer``: A vectorized buffer supporting cost adavantage estimation.
+- ``VectorizedOnPolicyBuffer``: A vectorized buffer supporting cost advantage estimation.
 - ``ActorVCritic``: A actor-critic network supporting cost value estimation.
 - ``Lagrange``: A lagrangian multiplier for constraint violation control.

 Beyond the above characteristics, the ``PPO`` in SafePO also provides a training pipeline for data collection and training.
-You can customize new alforithms based on it.
+You can customize new algorithms based on it.

 Next we will provide a detailed example to show how to customize the ``PPO`` algorithm to ``PPO-Lag`` algorithm.
@@ -39,7 +39,7 @@ Example: PPO-Lag
 ----------------

 The Lagrangian multiplier is a useful tool to control the constraint violation in the Safe RL algorithms.
-Classic RL algorithms combined with the Lagrangian multiplier are exellent baselines for Safe RL algorithms.
+Classic RL algorithms combined with the Lagrangian multiplier are excellent baselines for Safe RL algorithms.

 .. note::
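
Before the detailed walk-through that the section above introduces, the core of the
``PPO`` to ``PPO-Lag`` customization can be summarized in a few lines: the Lagrangian
multiplier re-weights the cost advantage inside the usual clipped surrogate. The sketch
below is illustrative only; the function name and the exact normalization are
assumptions, not code taken from the SafePO source.

.. code:: python

    import torch


    def ppo_lag_policy_loss(
        logp: torch.Tensor,      # log-probabilities under the current policy
        old_logp: torch.Tensor,  # log-probabilities recorded at rollout time
        adv_r: torch.Tensor,     # reward advantages
        adv_c: torch.Tensor,     # cost advantages
        lam: float,              # current Lagrangian multiplier
        clip: float = 0.2,
    ) -> torch.Tensor:
        """Clipped PPO surrogate computed on a cost-penalized advantage."""
        # One common way to fold the multiplier into the objective.
        advantage = (adv_r - lam * adv_c) / (1.0 + lam)
        ratio = torch.exp(logp - old_logp)
        clipped = torch.clamp(ratio, 1.0 - clip, 1.0 + clip)
        return -torch.min(ratio * advantage, clipped * advantage).mean()


    # Usage with toy tensors.
    n = 32
    loss = ppo_lag_policy_loss(
        logp=torch.randn(n),
        old_logp=torch.randn(n),
        adv_r=torch.randn(n),
        adv_c=torch.randn(n),
        lam=0.5,
    )

Everything else, including the buffer, the actor-critic networks, and the training
loop, can largely stay as in the classic ``PPO`` pipeline described above.
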
diff --git a/docs/source/usage/make.rst b/docs/source/usage/make.rst
index 4fd6d67..c1c48a0 100644
--- a/docs/source/usage/make.rst
+++ b/docs/source/usage/make.rst
@@ -1,10 +1,10 @@
 Efficient Commands
 ==================

-To help users quickly reporduce our results,
+To help users quickly reproduce our results,
 we provide a command line tool for easy installation, benchmarking, and evaluation.

-One line benchmark running
+One Line Benchmark Running
 --------------------------

 First, create a conda environment with Python 3.8.
@@ -20,16 +20,16 @@ Then, run the following command to install SafePO and run the full benchmark:

     make benchmark

-This command will install SafePO in editable mode and excute the training process parallelly.
+This command will install SafePO in editable mode and execute the training process of all algorithms on all environments.
 After the training process is finished, it will evaluate the trained policies and generate the benchmark results,
 including training curves and evaluation rewards and costs.

-Simple benchmark running
+Simple Benchmark Running
 ------------------------

 The full benchmark is time-consuming.
 To verify the performance of SafePO, we provide a simple benchmark command,
-which runs all alforithms on sampled environments and evaluate the trained policies.
+which runs all algorithms on sampled environments and evaluates the trained policies.

 .. code-block:: bash
diff --git a/docs/source/usage/train.rst b/docs/source/usage/train.rst
index aa7218d..79d11d6 100644
--- a/docs/source/usage/train.rst
+++ b/docs/source/usage/train.rst
@@ -11,7 +11,7 @@ To run the algorithms with default configuration, you need to specify the enviro
 .. code-block:: bash

     cd safepo/single_agent
-    python ppo_lag.py --task SafetyPointGoal1-v0 --experiment ppo_lag_exp
+    python ppo_lag.py --task SafetyPointGoal1-v0 --experiment ppo_lag_exp

 Then you can check the results in the ``runs/ppo_lag_exp`` folder.

@@ -23,12 +23,12 @@ The multi-agent algorithms running is similar to the single-agent algorithms. Fo
 .. code-block:: bash

     cd safepo/multi_agent
-    python mappolag.py --task Safety2x4AntVelocity-v0 --experiment mappo_lag_exp
+    python mappolag.py --task Safety2x4AntVelocity-v0 --experiment mappo_lag_exp

 Then you can check the results in the ``runs/mappo_lag_exp`` folder.

-Cunstomizing Training
----------------------
+Customizing Training
+--------------------

 We use command line interface to support training customization.
 We provide the detailed description of the command line arguments in the following
@@ -40,71 +40,71 @@ We provide the detailed description of the command line arguments in the followi

        +--------------------+----------------------------------+-----------------------------------------------+
        | Argument           | Description                      | Default Value                                 |
        +====================+==================================+===============================================+
-       | --seed             | Seed of the experiment           | 0                                             |
+       | --seed             | Seed of the experiment           | 0                                             |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --device           | Device to run the code           | "cpu"                                         |
+       | --device           | Device to run the code           | "cpu"                                         |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --num-envs         | Number of parallel game          | 10                                            |
+       | --num-envs         | Number of parallel game          | 10                                            |
        |                    | environments                     |                                               |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --total-steps      | Total timesteps of the           | 10000000                                      |
+       | --total-steps      | Total steps of the               | 10000000                                      |
        |                    | experiments                      |                                               |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --task             | ID of the environment            | "SafetyPointGoal1-v0"                         |
+       | --task             | ID of the environment            | "SafetyPointGoal1-v0"                         |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --use-eval         | Toggles evaluation               | False                                         |
+       | --use-eval         | Toggles evaluation               | False                                         |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --steps-per-epoch  | Number of steps to run in each   | 20000                                         |
+       | --steps-per-epoch  | Number of steps to run in each   | 20000                                         |
        |                    | environment per policy rollout   |                                               |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --critic-lr        | Learning rate of the critic      | 1e-3                                          |
+       | --critic-lr        | Learning rate of the critic      | 1e-3                                          |
        |                    | network                          |                                               |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --log-dir          | Directory to save agent logs     | "../runs"                                     |
+       | --log-dir          | Directory to save agent logs     | "../runs"                                     |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --experiment       | Name of the experiment           | "single_agent_experiment"                     |
+       | --experiment       | Name of the experiment           | "single_agent_experiment"                     |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --write-terminal   | Toggles terminal logging         | True                                          |
+       | --write-terminal   | Toggles terminal logging         | True                                          |
        +--------------------+----------------------------------+-----------------------------------------------+
-       | --use-tensorboard  | Toggles tensorboard logging      | False                                         |
+       | --use-tensorboard  | Toggles tensorboard logging      | False                                         |
        +--------------------+----------------------------------+-----------------------------------------------+

    .. tab-item:: Multi-agent Algorithms

        +-------------------+--------------------------------+----------------------------------------------+
-       | Parameter         | Description                    | Default Value                                |
+       | Argument          | Description                    | Default Value                                |
        +===================+================================+==============================================+
-       | --use-eval        | Use evaluation environment     | False                                        |
+       | --use-eval        | Use evaluation environment     | False                                        |
        |                   | for testing                    |                                              |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --task            | The task to run                | "MujocoVelocity"                             |
+       | --task            | The task to run                | "Safety2x4AntVelocity-v0"                    |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --experiment      | Experiment name                | "Base"                                       |
-       |                   | If used with --metadata flag,  |                                              |
+       | --experiment      | Experiment name                | "multi_agent_experiment"                     |
+       |                   | If used with --metadata flag,  |                                              |
        |                   | additional information about   |                                              |
        |                   | physics engine, sim device,    |                                              |
        |                   | pipeline and domain            |                                              |
        |                   | randomization will be added    |                                              |
        |                   | to the name                    |                                              |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --seed            | Random seed                    | 0                                            |
+       | --seed            | Random seed                    | 0                                            |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --model-dir       | Choose a model dir             | ""                                           |
+       | --model-dir       | The model dir                  | ""                                           |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --safety-bound    | cost_limit                     | 25.0                                         |
+       | --safety-bound    | Cost limit                     | 25.0                                         |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --device          | The device to run the model on | "cpu"                                        |
+       | --device          | The device to run the model on | "cpu"                                        |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --device-id       | The device id to run the       | 0                                            |
+       | --device-id       | The device id to run the       | 0                                            |
        |                   | model on                       |                                              |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --write-terminal  | Toggles terminal logging       | True                                         |
+       | --write-terminal  | Toggles terminal logging       | True                                         |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --headless        | Toggles headless mode          | False                                        |
+       | --headless        | Toggles headless mode          | False                                        |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --total-steps     | Total timesteps of the         | None                                         |
+       | --total-steps     | Total steps of the             | None                                         |
        |                   | experiments                    |                                              |
        +-------------------+--------------------------------+----------------------------------------------+
-       | --num-envs        | The number of parallel game    | None                                         |
+       | --num-envs        | The number of parallel game    | None                                         |
        |                   | environments                   |                                              |
        +-------------------+--------------------------------+----------------------------------------------+
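
Because the arguments above are ordinary command line flags, runs can also be
scripted. The launcher below is a small sketch under that assumption; the task list,
seed list, and directory layout are illustrative and not shipped with SafePO.

.. code:: python

    import subprocess

    tasks = ["SafetyPointGoal1-v0", "SafetyPointButton1-v0"]
    seeds = [0, 1, 2]

    for task in tasks:
        for seed in seeds:
            cmd = [
                "python", "ppo_lag.py",
                "--task", task,
                "--seed", str(seed),
                "--experiment", f"ppo_lag_{task}_s{seed}",
                "--total-steps", "10000000",
            ]
            # Runs sequentially; swap in subprocess.Popen or a job scheduler to parallelize.
            subprocess.run(cmd, cwd="safepo/single_agent", check=True)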